您的位置：首页 > 大数据 > 人工智能

soft raid5阅读笔记之七--MD中的bitmap

2016-07-23 12:40 411 查看

本节主要介绍MD中的bitmap机制，该机制主要用于减少不必要的同步操作。在真正的数据IO写操作之前，先将该chunk对应的bitmap内存中的bit位设置为1，并写入磁盘上的bitmap文件；而在真正的数据写完成之后，再将bitmap文件中的bit位清零。这样，一次IO写操作就多了两次磁盘写操作，势必影响IO的效率，因此，linux内核在这部分做了两个方面的优化：1）批量写入；2）延迟清除。使得bitmap的操作先在缓存中进行，必要时再写入磁盘。
在进入主题之前，先看看这部分涉及的几个主要的数据结构：
1）超级块位于磁盘文件开始前256个字节，用于记录bitmap文件的管理信息，主要的域为chunksize（bitmap文件中一个bit对应的chunk的大小）。

/*
 * On-disk superblock of the bitmap file.
 *
 * The number at the start of each field comment is the field's byte offset.
 * The named fields occupy 64 bytes; pad[] rounds the structure up to 256.
 */
typedef struct bitmap_super_s {
__le32 magic;        /*  0  BITMAP_MAGIC */
__le32 version;      /*  4  the bitmap major for now, could change... */
__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
__le64 events;       /* 24  event counter for the bitmap (1)*/
__le64 events_cleared;/*32  event counter when last bit cleared (2) */
__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
__le32 state;        /* 48  bitmap state information */
__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56  seconds between disk flushes */
__le32 write_behind; /* 60  number of outstanding write-behind writes */

__u8  pad[256 - 64]; /* set to zero */
} bitmap_super_t;

2）从注释可以看出，该结构体代表了bitmap在内存中的页；

/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
/*
* map points to the actual memory page
*/
char *map;
/*
* in emergencies (when map cannot be alloced), hijack the map
* pointer and use it as two counters itself; a counter is 16 bits
* wide, so the pointer's own storage can hold two of them
*/
unsigned int hijacked:1;
/*
* count of dirty bits on the page
*/
unsigned int  count:31;
};

3）bitmap在磁盘中的文件表现，每个mddev（磁盘阵列）包含一个bitmap文件。

/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_page *bp;     /* array of in-memory pages backing the bitmap */
unsigned long pages; /* total number of pages in the bitmap */
unsigned long missing_pages; /* number of pages not yet allocated */

mddev_t *mddev; /* the md device that the bitmap is for */

int counter_bits; /* how many bits per block counter */

/* bitmap chunksize -- how much data does each bit represent? */
unsigned long chunksize;
unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */

/* We hold a count on the chunk currently being synced, and drop
* it when the last block is started.  If the resync is aborted
* midway, we need to be able to drop that count, so we remember
* the counted chunk..
*/
unsigned long syncchunk;

__u64     events_cleared;
int need_sync;

/* bitmap spinlock */
spinlock_t lock;
/*
* The bitmap is stored in one of two places: in a separate file outside
* the md device (file points at it), or inside the md device itself
* (file is NULL and offset is the distance from the superblock).
*/
long offset; /* offset from superblock if file is NULL */
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap file superblock */
struct page **filemap; /* list of cache pages for the file */
unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
unsigned long file_pages; /* number of pages in the file */
int last_page_size; /* bytes in the last page */

unsigned long flags; /* bitmap_unplug() tests BITMAP_WRITE_ERROR here */

int allclean; /* reset to 0 by bitmap_startwrite() */

unsigned long max_write_behind; /* write-behind mode */
atomic_t behind_writes; /* incremented by bitmap_startwrite() for write-behind requests */

/*
* the bitmap daemon - periodically wakes up and sweeps the bitmap
* file, cleaning up bits and flushing out pages to disk as necessary
*/
unsigned long daemon_lastrun; /* jiffies of last run */
unsigned long daemon_sleep; /* how many seconds between updates? */
unsigned long last_end_sync; /* when we last called end_sync to
* update bitmap with resync progress */

atomic_t pending_writes; /* pending writes to the bitmap file */
wait_queue_head_t write_wait; /* bitmap_unplug() waits here until pending_writes is 0 */
wait_queue_head_t overflow_wait; /* bitmap_startwrite() sleeps here while a counter is at COUNTER_MAX */

};
/*
 * in-memory bitmap:
 *
 * 内存中的bitmap：使用16位的块计数器来跟踪每个chunk上挂起的写请求的数量，高两位用于特殊的目的，
 * 第一位表示是否需要同步，第二位表示同步是否处于激活状态。
 *
 * Use 16 bit block counters to track pending writes to each "chunk".
 * The 2 high order bits are special-purpose, the first is a flag indicating
 * whether a resync is needed.  The second is a flag indicating whether a
 * resync is active.
 * This means that the counter is actually 14 bits:
 *
 * +--------+--------+------------------------------------------------+
 * | resync | resync |               counter                          |
 * | needed | active |                                                |
 * |  (0-1) |  (0-1) |              (0-16383)                         |
 * +--------+--------+------------------------------------------------+
 */

下面我们来看看相关的一些宏定义，有助于理解bitmap的实现：

#define PAGE_BITS (PAGE_SIZE << 3)     /* number of bits in one page; e.g. with 4 KiB pages: 4096 * 8 = 32768 */
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)     /* log2(PAGE_BITS); e.g. with PAGE_SHIFT == 12: 12 + 3 = 15 */

/* storage type of one pending-write counter (16 bits wide) */
typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16          /* number of bits in one counter */
#define COUNTER_BIT_SHIFT 4     /* log2(COUNTER_BITS) */
#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)     /* bytes per counter: 16 / 8 = 2 */
#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)     /* log2(COUNTER_BYTE_RATIO): 4 - 3 = 1 */
/*
 * Flag masks and field accessors for a pending-write counter.
 * Bit 15 is the "resync needed" flag, bit 14 the "resync active" flag and
 * the low 14 bits hold the count.  The accessor argument is parenthesized
 * so the cast applies to the whole expression passed in.
 */
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) (x)) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) (x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) (x)) & COUNTER_MAX)

/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)     /* number of counters that fit in one page */
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)     /* log2(PAGE_COUNTER_RATIO) */
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)          /* PAGE_COUNTER_RATIO - 1, used as a mask */

#define BITMAP_BLOCK_SIZE 512     /* bitmap block size in bytes (the original notes equate it with one sector) */
#define BITMAP_BLOCK_SHIFT 9     /* log2(BITMAP_BLOCK_SIZE) */

/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)     /* number of blocks in one chunk */
#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)     /* same, as a shift value */
#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)                    /* same, as a mask value */

/* when hijacked, the counters and bits represent even larger "chunks" */
/* there will be 1024 chunks represented by each counter in the page pointers */
#define PAGEPTR_BLOCK_RATIO(bitmap) \
(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
#define PAGEPTR_BLOCK_SHIFT(bitmap) \
(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)

/*
* on-disk bitmap:
*
* Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
* file a page at a time. There's a superblock at the start of the file.
*/

/* map chunks (bits) to file pages - offset by the size of the superblock */
#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))     /* chunk number plus the superblock size in bits */

在介绍函数调用关系之前，先介绍一下bitmap内存页的状态标志：

/*
 * Attributes of a bitmap file page, as passed to test_page_attr() and
 * clear_page_attr().  The "per the notes" remarks come from the
 * accompanying article and are not verifiable from the code shown here.
 */
enum bitmap_page_attr {
BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced (per the notes: set before the data write is issued)
BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared (per the notes: set after the data write completes)
BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced (to the on-disk bitmap)
};

下面重点分析函数的调用关系，主要包含以下几个部分：
1）设置BITMAP_PAGE_DIRTY：在发送写请求make_request()中调用add_stripe_bio()，进而调用bitmap_startwrite()，该函数就是通过调用set_page_attr()函数来设置BITMAP_PAGE_DIRTY；

2）在守护线程中，将标志位BITMAP_PAGE_DIRTY的内存页写入到磁盘文件中：

3）在写操作完成后，调用bitmap_endwrite()函数，完成对BITMAP_PAGE_CLEAN状态信息的设置：

4）在守护线程中，调用bitmap_daemon_work()完成对bitmap文件的磁盘写操作：

在这里，我们重点分析下面几个函数：
1）bitmap_daemon_work()：该函数的主要功能是清除bit位，并将bitmap内存页写入磁盘文件；但过程比较难理解，需要三次调用该函数才能完成：
第一次进入，先清除BITMAP_PAGE_CLEAN状态信息，而此时的*bmc=2，做*bmc--后重新设置BITMAP_PAGE_CLEAN状态信息;
第二次进入，*bmc--后清除掉BITMAP_PAGE_CLEAN状态信息，同时设置BITMAP_PAGE_NEEDWRITE状态信息;
第三次进入，才是真正调用write_page()完成对bitmap磁盘文件的写入操作；

/*
 * bitmap_startwrite - account for a write that is about to be issued.
 *
 * Walks the sector range one counter at a time and increments the
 * pending-write counter of every counter it covers.  A counter that is
 * still 0 first has its bit set in the bitmap file page, and a counter
 * that is 0 or 1 is raised to 2 before the increment, so the first
 * outstanding write leaves the counter at 3.
 *
 * Per the original notes this is called from add_stripe_bio() on the first
 * write when the array has a bitmap; the caller is not visible here.
 *
 * @bitmap:  in-memory bitmap; NULL makes the call a no-op
 * @offset:  first sector of the write
 * @sectors: number of sectors in the write
 * @behind:  non-zero if this is a write-behind request
 *
 * Always returns 0.  If bitmap_get_counter() yields no counter the function
 * returns early, without touching the rest of the range or allclean.
 */
int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
{
	if (!bitmap) return 0;

	if (behind) {
		atomic_inc(&bitmap->behind_writes);
		/* max_write_behind is unsigned long, hence %lu */
		PRINTK(KERN_DEBUG "inc write-behind count %d/%lu\n",
		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
	}

	while (sectors) {	/* one iteration per counter covered by the range */
		int blocks;
		bitmap_counter_t *bmc;

		spin_lock_irq(&bitmap->lock);
		/* look up the counter for this sector; blocks is filled in by the helper */
		bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
		if (!bmc) {
			spin_unlock_irq(&bitmap->lock);
			return 0;
		}

		/* counter saturated: sleep on overflow_wait, then retry this offset */
		if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) {
			DEFINE_WAIT(__wait);
			/* note that it is safe to do the prepare_to_wait
			 * after the test as long as we do it before dropping
			 * the spinlock.
			 */
			prepare_to_wait(&bitmap->overflow_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock_irq(&bitmap->lock);
			blk_unplug(bitmap->mddev->queue);
			schedule();
			finish_wait(&bitmap->overflow_wait, &__wait);
			continue;
		}

		switch(*bmc) {
		case 0:
			/* first write to this chunk: set its bit in the bitmap
			 * file page (per the notes this also marks the page
			 * BITMAP_PAGE_DIRTY), bump the page's count and plug
			 * the queue */
			bitmap_file_set_bit(bitmap, offset);
			bitmap_count_page(bitmap,offset, 1);
			blk_plug_device_unlocked(bitmap->mddev->queue);
			/* fall through */
		case 1:
			*bmc = 2;
		}

		(*bmc)++;

		spin_unlock_irq(&bitmap->lock);

		/* advance by the number of blocks this counter covered */
		offset += blocks;
		if (sectors > blocks)
			sectors -= blocks;
		else sectors = 0;
	}
	bitmap->allclean = 0;
	return 0;
}

/*
 * bitmap_unplug - write out bitmap file pages before data writes proceed.
 *
 * This gets called when the md device is ready to unplug its underlying
 * (slave) device queues: before any writes are let down, the dirty pages
 * of the bitmap file have to reach the disk.
 *
 * Every file page has its DIRTY and NEEDWRITE attributes sampled and
 * cleared under the bitmap lock; a page that had either one is written
 * with write_page().  If at least one page was DIRTY, the function then
 * waits for the outstanding bitmap writes.  A NULL bitmap is a no-op.
 */
void bitmap_unplug(struct bitmap *bitmap)
{
	unsigned long idx, irqflags;
	int was_dirty, was_needwrite;
	struct page *pg;
	int must_wait = 0;

	if (!bitmap)
		return;

	/* visit each file page and push out the ones carrying pending state */
	for (idx = 0; idx < bitmap->file_pages; idx++) {
		spin_lock_irqsave(&bitmap->lock, irqflags);
		if (!bitmap->filemap) {
			/* no page list: nothing can be written */
			spin_unlock_irqrestore(&bitmap->lock, irqflags);
			return;
		}
		pg = bitmap->filemap[idx];
		was_dirty = test_page_attr(bitmap, pg, BITMAP_PAGE_DIRTY);
		was_needwrite = test_page_attr(bitmap, pg, BITMAP_PAGE_NEEDWRITE);
		clear_page_attr(bitmap, pg, BITMAP_PAGE_DIRTY);
		clear_page_attr(bitmap, pg, BITMAP_PAGE_NEEDWRITE);
		if (was_dirty)
			must_wait = 1;
		spin_unlock_irqrestore(&bitmap->lock, irqflags);

		/* the write itself happens outside the lock */
		if (was_dirty || was_needwrite)
			write_page(bitmap, pg, 0);
	}

	if (must_wait) {
		/* a dirty page was written, so wait for the writes to finish */
		if (!bitmap->file) {
			/* no backing file: wait via the md device */
			md_super_wait(bitmap->mddev);
		} else {
			/* backing file: wait until no bitmap writes are pending */
			wait_event(bitmap->write_wait,
				   atomic_read(&bitmap->pending_writes)==0);
		}
	}

	if (bitmap->flags & BITMAP_WRITE_ERROR)
		bitmap_file_kick(bitmap);
}
/*
 * bitmap_start_sync - ask whether a range starting at offset needs resync.
 *
 * bitmap_start_sync must always report on multiples of whole pages,
 * otherwise resync (which is very PAGE_SIZE based) will get confused.
 * So __bitmap_start_sync is called repeatedly (if needed) until at least
 * PAGE_SIZE>>9 blocks are covered.
 *
 * @blocks receives the total number of blocks covered by all the calls.
 * Returns the bitwise OR of the individual __bitmap_start_sync results.
 */
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
		      int degraded)
{
	int result = 0;
	int step;

	/* each pass advances by however many blocks the helper reported */
	for (*blocks = 0; *blocks < (PAGE_SIZE>>9); *blocks += step) {
		result |= __bitmap_start_sync(bitmap, offset, &step, degraded);
		offset += step;
	}

	return result;
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： raid

相关文章推荐

新的分享

章节导航