UBIFS文件系统分析六之wear-leveling

2011-03-08 14:34:56

分类： LINUX

在本文的开头，先借本章讲一下EBA。什么是EBA？即Eraseblock Association（可擦除块关联）。

在上次提到struct ubi_volume结构体的成员变量eba_tbl的时候稍微提到过。每次文件系统需要对一个逻辑可擦除块（LEB）进行操作的时候，它就会到对应的volume的eba_tbl中去查找该逻辑可擦除块对应着哪一个物理可擦除块（PEB）。

EBA子系统的两个最重要的操作是map和unmap。但是在UBI的内核源码中并没有专门用于map的函数，map的过程是嵌套在ubi_eba_write_leb函数中完成的，下面来看该函数的具体代码：

int ubi_eba_write_leb(struct ubi_device*ubi, struct ubi_volume *vol, int lnum,

const void *buf, int offset, int len, intdtype)

{

interr, pnum, tries = 0, vol_id = vol->vol_id;

structubi_vid_hdr *vid_hdr;

if(ubi->ro_mode)

return-EROFS;

err= leb_write_lock(ubi, vol_id, lnum);

if(err)

returnerr;

到具体volume的eba_tbl表中去查找LEB与PEB之间的关系，如果pnum大于0就表示该LEB已经影射了

pnum= vol->eba_tbl[lnum];

if(pnum >= 0) {

dbg_eba(“write%d bytes at offset %d of LEB %d:%d, PEB %d”,

len,offset, vol_id, lnum, pnum);

err= ubi_io_write_data(ubi, buf, pnum, offset, len);

if(err) {

ubi_warn(“failedto write data to PEB %d”, pnum);

if(err == -EIO && ubi->bad_allowed)

err= recover_peb(ubi, pnum, vol_id, lnum, buf,

offset, len);

//事实上上面的ubi_io_write_data有可能会失败的，因为ubi_dbg_check_all_ff函数会检查被写入的地方是否全是0xff，对于overwrite显然不是，那么就通过recover_peb来进行数据的搬运工作。

if(err)

ubi_ro_mode(ubi);

}

leb_write_unlock(ubi,vol_id, lnum);

returnerr;

}

* The logical eraseblock is not mapped. Wehave to get a free physical

* eraseblock and write the volume identifierheader there first.

vid_hdr= ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

if(!vid_hdr) {

leb_write_unlock(ubi,vol_id, lnum);

return-ENOMEM;

}

vid_hdr->vol_type= UBI_VID_DYNAMIC;

vid_hdr->sqnum= cpu_to_be64(next_sqnum(ubi));

vid_hdr->vol_id= cpu_to_be32(vol_id);

vid_hdr->lnum= cpu_to_be32(lnum);

vid_hdr->compat= ubi_get_compat(ubi, vol_id);

vid_hdr->data_pad= cpu_to_be32(vol->data_pad);

retry:

上面的代码比较简单，也不是本次关注的内容

通过函数ubi_wl_get_peb来从WL子系统中获得一块free的PEB，然后修改volume的eba_tbl，这样一个map过程就算完成了，soeasy ,~。~！！

pnum= ubi_wl_get_peb(ubi, dtype);

if(pnum < 0) {

ubi_free_vid_hdr(ubi,vid_hdr);

leb_write_unlock(ubi,vol_id, lnum);

returnpnum;

}

dbg_eba(“writeVID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d”,

len,offset, vol_id, lnum, pnum);

err= ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);

if(err) {

ubi_warn(“failedto write VID header to LEB %d:%d, PEB %d”,

vol_id, lnum, pnum);

goto write_error;

}

if(len) {

err= ubi_io_write_data(ubi, buf, pnum, offset, len);

if(err) {

ubi_warn(“failedto write %d bytes at offset %d of ”

“LEB %d:%d, PEB %d”, len, offset,vol_id,

lnum, pnum);

gotowrite_error;

}

vol->eba_tbl[lnum] = pnum;

leb_write_unlock(ubi,vol_id, lnum);

ubi_free_vid_hdr(ubi,vid_hdr);

return0;

write_error:

if(err != -EIO || !ubi->bad_allowed) {

ubi_ro_mode(ubi);

leb_write_unlock(ubi,vol_id, lnum);

ubi_free_vid_hdr(ubi,vid_hdr);

returnerr;

}

* Fortunately, this is the first writeoperation to this physical

* eraseblock, so just put it and request a newone. We assume that if

* this physical eraseblock went bad, the erasecode will handle that.

err= ubi_wl_put_peb(ubi, pnum, 1);

if(err || ++tries > UBI_IO_RETRIES) {

ubi_ro_mode(ubi);

leb_write_unlock(ubi,vol_id, lnum);

ubi_free_vid_hdr(ubi,vid_hdr);

returnerr;

}

vid_hdr->sqnum= cpu_to_be64(next_sqnum(ubi));

ubi_msg(“tryanother PEB”);

gotoretry;

}

接着看一个unmap的过程：

int ubi_eba_unmap_leb(struct ubi_device*ubi, struct ubi_volume *vol,

int lnum)

{

interr, pnum, vol_id = vol->vol_id;

if(ubi->ro_mode)

return-EROFS;

err= leb_write_lock(ubi, vol_id, lnum);

if(err)

returnerr;

首先还是查询vol->eba_tbl表，如果对应的想为-1,说明我们要unmap的块根本就没有map，所以也就不需要做任何事情了

pnum= vol->eba_tbl[lnum];

if(pnum < 0)

/*This logical eraseblock is already unmapped */

gotoout_unlock;

dbg_eba(“eraseLEB %d:%d, PEB %d”, vol_id, lnum, pnum);

如果不是小于0，那么得到值肯定是一个PEB号，修改eba_tbl对应项为-1

vol->eba_tbl[lnum]= UBI_LEB_UNMAPPED;

我们上面提到了，在map的过程中需要从WL子系统中获得peb，现在unmap掉了，需要将PEB归还给WL子系统并需要擦除，这个是由ubi_wl_put_peb

完成的

。

err= ubi_wl_put_peb(ubi, pnum, 0);

out_unlock:

leb_write_unlock(ubi,vol_id, lnum);

returnerr;

}

从上面的这段例子中可以看出，在UBI中，每一个PEB都是从WL子系统中获得的，释放掉的每一个PEB也都要归还给WL子系统。可以说WL无处不在，每一个涉及可擦除块使用的操作都会涉及到WL子系统。

下面介绍一下涉及的wl的主要的数据结构：

struct ubi_wl_entry {

union{

structrb_node rb;

structlist_head list;

}u;

intec;

intpnum;

};

从这个结构体中可以看出WL子系统操作的是实实在在的物理可擦除块，另外一个关注的就是EC头部的erase counter，这也是WL进行操作的依据。

从联合u中可以看出，WL子系统是采用红黑树来管理这些结点的。关于红黑树的一些操作，下面只是简单带过，并不以源码的形式详细阐述。

static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)：该操作用于将e添加到以root为根的RB树中。

static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)：用于判断e是否存在于以root为根的RB树中。

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)：用于在以root为根的RB树中查找erase counter小于上限且最接近上限的PEB（这里的上限是相对值，即树中最小的EC加上max）。

WL的作用是什么呢？上面提到了一点，就是以EC值为依据来对物理可擦除块进行管理，以防止对某一些可擦除块操作过多而导致其变为坏块。如果在操作的过程中发现某一个可擦除块的EC值变得不正常了，也就是变得太大了（EC值是随着擦除次数的增加而增加的），那么这块可擦除块还能用吗？能。

在include/mtd/ubi-user.h中有这样一个枚举。

/*
 * UBI data type hints. According to their names they tell UBI whether the
 * data being written is expected to be kept for a long or a short time.
 */
enum {
	UBI_LONGTERM  = 1,	/* long-term data */
	UBI_SHORTTERM = 2,	/* short-term data */
	UBI_UNKNOWN   = 3,	/* presumably: lifetime of the data is not known */
};

这里定义了三种用于指定数据类型的标志，从名字中可以看出，这个枚举的目的是说明数据是长期保存还是短期保存。

在ubi_wl_get_peb函数中有这样的一段代码：

caseUBI_LONGTERM:

e= find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

break;

我们在获得一个PEB的时候，如果是用于长期保存的数据的话，那么就取一个EC值比较大（也就是已经擦除过很多次）的PEB。这样就物尽其用了。

根据UBI design文档的说明：“UBI selects a long term storage block with a low erase count and copies the block contents to the block with the high erase count using the block moving function.” 但是在后面的源码中并没有看到对long term这方面的考虑？（是我哪儿没看到吗？）

函数ensure_wear_leveling就是用来判断是否存在上述的这种情况的。

static int ensure_wear_leveling(structubi_device *ubi)

{

interr = 0;

structubi_wl_entry *e1;

structubi_wl_entry *e2;

structubi_work *wrk;

spin_lock(&ubi->wl_lock);

//

如果Wear-leveling已经在work工作队列了，那么这样的判断就没有必要了，因为不管你怎么判断，都是Wear-leveling必须的，而且会对pending的work造成影响，所以就什么事情也不做了。

if(ubi->wl_scheduled)

/*Wear-leveling is already in the work queue */

gotoout_unlock;

* If the ubi->scrub tree is not empty, scrubbingis needed, and the

* the WL worker has to be scheduled anyway.

@情况一：如果没有已经使用的可擦除块，也就是说该UBI设备刚被attach上去，没有任何数据。

@情况二：没有可用的可擦除块。上面说到了WL是将一块的数据搬运到另外一块可擦除块中，现在没有可用的可擦除块了，工作进行不下去了

if(!ubi->scrub.rb_node) {

//

这个队列中的结点是从哪儿来的呢？也就是说在什么情况下添加进来的

if(!ubi->used.rb_node || !ubi->free.rb_node)

/*No physical eraseblocks – no deal */

gotoout_unlock;

* We schedule wear-leveling only if thedifference between the

* lowest erase counter of used physicaleraseblocks and a high

* erase counter of free physical eraseblocksis greater than

* %UBI_WL_THRESHOLD.

上面说到了WL是将一块已用的可擦除块中的数据搬运到另外一块未用的可擦除块中去，所以就从used树中找一块EC值很小的（但是根据文档说，这儿应该是找一块UBI_LONGTERM类型的并且EC值比较小的），然后再从free树中找一块ec值很大的。

e1= rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

e2= find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

if(!(e2->ec – e1->ec >= UBI_WL_THRESHOLD))

gotoout_unlock;

dbg_wl(“schedulewear-leveling”);

}else

dbg_wl(“schedulescrubbing”);

ubi->wl_scheduled= 1;

//

注意这儿将wl_scheduled标志置位

spin_unlock(&ubi->wl_lock);

wrk= kmalloc(sizeof(struct ubi_work), GFP_NOFS);

if(!wrk) {

err= -ENOMEM;

gotoout_cancel;

}

//

构造一个worker，并添加到队列中由后台进程来完成。具体工作是由wear_leveling_worker来完成的。

wrk->func= &wear_leveling_worker;

schedule_ubi_work(ubi,wrk);

returnerr;

out_cancel:

spin_lock(&ubi->wl_lock);

ubi->wl_scheduled= 0;

out_unlock:

spin_unlock(&ubi->wl_lock);

returnerr;

}

下面就看看wear_leveling_worker这个函数的具体的工作：

static int wear_leveling_worker(structubi_device *ubi, struct ubi_work *wrk,

intcancel)

{

interr, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;

intvol_id = -1, uninitialized_var(lnum);

structubi_wl_entry *e1, *e2;

structubi_vid_hdr *vid_hdr;

kfree(wrk);

if(cancel)

return0;

//

分配一个VID头部，因为在拷贝数据的过程中，需要重新写入VID

vid_hdr= ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

if(!vid_hdr)

return-ENOMEM;

mutex_lock(&ubi->move_mutex);

spin_lock(&ubi->wl_lock);

ubi_assert(!ubi->move_from&& !ubi->move_to);

ubi_assert(!ubi->move_to_put);

@下面的英文注释已经说的很清楚了，如果没有free的PEB，没有关系，可以等待被pending的erase_worker完成。但是如果连scrub都没有，那么就没有办法了，取消本次WL操作

@没有used 的PEB？。在ubi_wl_get_peb函数中

rb_erase(&e->u.rb,&ubi->free)

prot_queue_add(ubi, e);

而在ubi_wl_put_peb中有：

prot_queue_del(ubi,e->pnum);

这样的操作，相信在别的地方如erase_wroker也有这样的操作。也就是说UBI会将暂时操作的PEB从相应的队列中暂时移除，把它放到ubi->pq

中保护起来

。

if(!ubi->free.rb_node ||

(!ubi->used.rb_node &&!ubi->scrub.rb_node)) {

* No free physical eraseblocks? Well, theymust be waiting in

* the queue to be erased. Cancel movement – itwill be

* triggered again when a free physicaleraseblock appears.

* No used physical eraseblocks? They must betemporarily

* protected from being moved. They will bemoved to the

* @ubi->used tree later and thewear-leveling will be

* triggered again.

dbg_wl(“cancelWL, a list is empty: free %d, used %d”,

!ubi->free.rb_node, !ubi->used.rb_node);

gotoout_cancel;

}

if(!ubi->scrub.rb_node) {

* Now pick the least worn-out used physicaleraseblock and a

* highly worn-out free physical eraseblock. Ifthe erase

* counters differ much enough, startwear-leveling.

e1= rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

//

如果scrub队列是空的，那么就从free队列中取一个目标PEB进行WL操作（EC无限左接近于WL_FREE_MAX_DIFF）

e2= find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

if(!(e2->ec – e1->ec >= UBI_WL_THRESHOLD)) {

dbg_wl(“noWL needed: min used EC %d, max free EC %d”,

e1->ec, e2->ec);

gotoout_cancel;

}

paranoid_check_in_wl_tree(e1,&ubi->used);

//

rb_erase

是一个红黑的基本删除操作，在lib/rbtree.c中。这儿e1中的数据被转移了，那么就需要将e1从ubi->used队列中删除掉

rb_erase(&e1->u.rb,&ubi->used);

dbg_wl(“movePEB %d EC %d to PEB %d EC %d”,

e1->pnum, e1->ec, e2->pnum,e2->ec);

}else {

/*Perform scrubbing */

scrubbing= 1;

//注意这儿从scrub中获得e2的时候，并没有像上面一样

if (!(e2->ec – e1->ec >= UBI_WL_THRESHOLD))比较e1和e2的EC值，为什么呢？因为scrub队列中的PEB都是在读的时候发生BIT_FILP的，所以必须进行WL

e1= rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);

e2= find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

paranoid_check_in_wl_tree(e1,&ubi->scrub);

rb_erase(&e1->u.rb,&ubi->scrub);

dbg_wl(“scrubPEB %d to PEB %d”, e1->pnum, e2->pnum);

}

paranoid_check_in_wl_tree(e2,&ubi->free);

rb_erase(&e2->u.rb,&ubi->free);

//

注意这儿，这两个指针在数据搬运完成之后会被清除掉的

ubi->move_from= e1;

ubi->move_to= e2;

spin_unlock(&ubi->wl_lock);

* Now we are going to copy physical eraseblock@e1->pnum to @e2->pnum.

* We so far do not know which logicaleraseblock our physical

* eraseblock (@e1) belongs to. We have to readthe volume identifier

* header first.

* Note, we are protected from this PEB beingunmapped and erased. The

* ‘ubi_wl_put_peb()’ would wait for moving tobe finished if the PEB

* which is being moved was unmapped.

err= ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);

if(err && err != UBI_IO_BITFLIPS) {

if(err == UBI_IO_PEB_FREE) {

* We are trying to move PEB without a VIDheader. UBI

* always write VID headers shortly after thePEB was

* given, so we have a situation when it hasnot yet

* had a chance to write it, because it waspreempted.

* So add this PEB to the protection queue sofar,

* because presumably more data will be writtenthere

* (including the missing VID header), and thenwe’ll

* move it.

//进一步检查VID头部，不能说因为它是从used队列中取出来的就直接将数据搬运过去了，可以以前某个地方出错了。这儿如果发现我们要搬运的PEB本身就是空，那么搬运也就没必要进行下去了。

dbg_wl(“PEB%d has no VID header”, e1->pnum);

protect= 1;

gotoout_not_moved;

}

ubi_err(“error%d while reading VID header from PEB %d”,

err,e1->pnum);

gotoout_error;

}

vol_id= be32_to_cpu(vid_hdr->vol_id);

lnum= be32_to_cpu(vid_hdr->lnum);

//

具体搬运数据由ubi_eba_copy_leb函数完成，实现比较简单，不在赘述

err= ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

if(err) {

if(err == MOVE_CANCEL_RACE) {

* The LEB has not been moved because the volumeis

* being deleted or the PEB has been putmeanwhile. We

* should prevent this PEB from being selectedfor

* wear-leveling movement again, so put it tothe

* protection queue.

protect= 1;

gotoout_not_moved;

}

if(err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||

err == MOVE_TARGET_RD_ERR) {

* Target PEB had bit-flips or write error -torture it.

torture= 1;

gotoout_not_moved;

}

if(err == MOVE_SOURCE_RD_ERR) {

* An error happened while reading the sourcePEB. Do

* not switch to R/O mode in this case, andgive the

* upper layers a possibility to recover fromthis,

* e.g. by unmapping corresponding LEB.Instead, just

* put this PEB to the @ubi->erroneous listto prevent

* UBI from trying to move it over and overagain.

if(ubi->erroneous_peb_count > ubi->max_erroneous) {

ubi_err(“toomany erroneous eraseblocks (%d)”,

ubi->erroneous_peb_count);

gotoout_error;

}

erroneous= 1;

gotoout_not_moved;

}

if(err < 0)

gotoout_error;

ubi_assert(0);

}

/*The PEB has been successfully moved */

if(scrubbing)

ubi_msg(“scrubbedPEB %d (LEB %d:%d), data moved to PEB %d”,

e1->pnum,vol_id, lnum, e2->pnum);

ubi_free_vid_hdr(ubi,vid_hdr);

spin_lock(&ubi->wl_lock);

if(!ubi->move_to_put) {

wl_tree_add(e2,&ubi->used);

e2= NULL;

}

ubi->move_from= ubi->move_to = NULL;

ubi->move_to_put= ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

//

这个通过后台进程来擦除e1，erase_worker

err= schedule_erase(ubi, e1, 0);

if(err) {

kmem_cache_free(ubi_wl_entry_slab,e1);

if(e2)

kmem_cache_free(ubi_wl_entry_slab,e2);

gotoout_ro;

}

if(e2) {

* Well, the target PEB was put meanwhile,schedule it for

* erasure.

dbg_wl(“PEB%d (LEB %d:%d) was put meanwhile, erase”,

e2->pnum, vol_id, lnum);

err= schedule_erase(ubi, e2, 0);

if(err) {

kmem_cache_free(ubi_wl_entry_slab,e2);

gotoout_ro;

}

dbg_wl(“done”);

mutex_unlock(&ubi->move_mutex);

return0;

* For some reasons the LEB was not moved,might be an error, might be

* something else. @e1 was not changed, soreturn it back. @e2 might

* have been changed, schedule it for erasure.

out_not_moved:

if(vol_id != -1)

dbg_wl(“cancelmoving PEB %d (LEB %d:%d) to PEB %d (%d)”,

e1->pnum, vol_id, lnum, e2->pnum,err);

else

dbg_wl(“cancelmoving PEB %d to PEB %d (%d)”,

e1->pnum, e2->pnum, err);

spin_lock(&ubi->wl_lock);

if(protect)

prot_queue_add(ubi,e1);

elseif (erroneous) {

wl_tree_add(e1,&ubi->erroneous);

ubi->erroneous_peb_count+= 1;

}else if (scrubbing)

wl_tree_add(e1,&ubi->scrub);

else

wl_tree_add(e1,&ubi->used);

ubi_assert(!ubi->move_to_put);

ubi->move_from= ubi->move_to = NULL;

ubi->wl_scheduled= 0;

spin_unlock(&ubi->wl_lock);

ubi_free_vid_hdr(ubi,vid_hdr);

err= schedule_erase(ubi, e2, torture);

if(err) {

kmem_cache_free(ubi_wl_entry_slab,e2);

gotoout_ro;

}

mutex_unlock(&ubi->move_mutex);

return0;

out_error:

if(vol_id != -1)

ubi_err(“error%d while moving PEB %d to PEB %d”,

err,e1->pnum, e2->pnum);

else

ubi_err(“error%d while moving PEB %d (LEB %d:%d) to PEB %d”,

err,e1->pnum, vol_id, lnum, e2->pnum);

spin_lock(&ubi->wl_lock);

ubi->move_from= ubi->move_to = NULL;

ubi->move_to_put= ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

ubi_free_vid_hdr(ubi,vid_hdr);

kmem_cache_free(ubi_wl_entry_slab,e1);

kmem_cache_free(ubi_wl_entry_slab,e2);

out_ro:

ubi_ro_mode(ubi);

mutex_unlock(&ubi->move_mutex);

ubi_assert(err!= 0);

returnerr < 0 ? err : -EIO;

out_cancel:

ubi->wl_scheduled= 0;

spin_unlock(&ubi->wl_lock);

mutex_unlock(&ubi->move_mutex);

ubi_free_vid_hdr(ubi,vid_hdr);

return0;

}

至此，WL基本完成。主要代码都在/drivers/mtd/ubi/wl.c文件中。

那么UBI中在什么情况下会调用ensure_wear_leveling来判断是否需要进行WL呢？

1. erase_worker

2. ubi_wl_scrub_peb

3. ubi_wl_init_scan

对于WL，有点需要详细说明一下:上面提到了ubi->scrub中的结点是从哪儿来的？

在ubi_eba_read_leb函数中，当发生bit-flip（位翻转）的时候，会调用ubi_wl_scrub_peb来进行WL。

同样在上面的ensure_wear_leveling中看到了，WL中是优先到ubi->scrub队列中查找的。

你可能也喜欢