Maemo patchset 20101501+0m5
[h-e-n] / drivers / mtd / ubi / wl.c
index dcb6dac..18d6ff3 100644
@@ -22,7 +22,7 @@
  * UBI wear-leveling sub-system.
  *
  * This sub-system is responsible for wear-leveling. It works in terms of
- * physical* eraseblocks and erase counters and knows nothing about logical
+ * physical eraseblocks and erase counters and knows nothing about logical
  * eraseblocks, volumes, etc. From this sub-system's perspective all physical
  * eraseblocks are of two types - used and free. Used physical eraseblocks are
  * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical
  *
  * As it was said, for the UBI sub-system all physical eraseblocks are either
  * "free" or "used". Free eraseblock are kept in the @wl->free RB-tree, while
- * used eraseblocks are kept in a set of different RB-trees: @wl->used,
- * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub.
+ * used eraseblocks are kept in @wl->used, @wl->erroneous, or @wl->scrub
+ * RB-trees, as well as (temporarily) in the @wl->pq queue.
+ *
+ * When the WL sub-system returns a physical eraseblock, the physical
+ * eraseblock is protected from being moved for some "time". For this reason,
+ * the physical eraseblock is not directly moved from the @wl->free tree to the
+ * @wl->used tree. There is a protection queue in between where this
+ * physical eraseblock is temporarily stored (@wl->pq).
+ *
+ * All this protection stuff is needed because:
+ *  o we don't want to move physical eraseblocks just after we have given them
+ *    to the user; instead, we first want to let users fill them up with data;
+ *
+ *  o there is a chance that the user will put the physical eraseblock very
+ *    soon, so it makes sense not to move it for some time but to wait; this is
+ *    especially important in the case of "short term" physical eraseblocks.
+ *
+ * Physical eraseblocks stay protected only for a limited time. But the "time"
+ * is measured in erase cycles in this case. This is implemented with the help
+ * of the protection queue. Eraseblocks are put to the tail of this queue when
+ * they are returned by 'ubi_wl_get_peb()', and eraseblocks are removed from
+ * the head of the queue on each erase operation (for any eraseblock). So the
+ * length of the queue defines how many (global) erase cycles PEBs are
+ * protected.
+ *
+ * To put it differently, each physical eraseblock has 2 main states: free and
+ * used. The former state corresponds to the @wl->free tree. The latter state
+ * is split into several sub-states:
+ * o the WL movement is allowed (@wl->used tree);
+ * o the WL movement is disallowed (@wl->erroneous) because the PEB is
+ *   erroneous - e.g., there was a read error;
+ * o the WL movement is temporarily prohibited (@wl->pq queue);
+ * o scrubbing is needed (@wl->scrub tree).
+ *
+ * Depending on the sub-state, wear-leveling entries of the used physical
+ * eraseblocks may be kept in one of those structures.
  *
  * Note, in this implementation, we keep a small in-RAM object for each physical
  * eraseblock. This is surely not a scalable solution. But it appears to be good
  * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
  * pick a target PEB with an average EC if our PEB is not very "old". This
  * leaves room for future re-works of the WL sub-system.
- *
- * Note: the stuff with protection trees looks too complex and is difficult to
- * understand. Should be fixed.
  */
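
The queue mechanics described above are compact enough to demonstrate in
isolation. Below is a minimal, self-contained user-space sketch of the idea -
not the kernel code; the constants, array bounds and helper names are
illustrative assumptions (the kernel keeps struct ubi_wl_entry objects on
struct list_head buckets and uses UBI_PROT_QUEUE_LEN):

#include <stdio.h>

#define PROT_QUEUE_LEN 10 /* assumed depth, stands in for UBI_PROT_QUEUE_LEN */
#define BUCKET_CAP      8 /* toy bound on PEBs protected in the same cycle */

static int pq[PROT_QUEUE_LEN][BUCKET_CAP]; /* bucket -> protected PEB numbers */
static int pq_fill[PROT_QUEUE_LEN];        /* PEBs currently in each bucket */
static int pq_head;                        /* bucket drained on the next erase */

/* Mirror of prot_queue_add(): append to the bucket just behind the head. */
static void protect_peb(int pnum)
{
	int tail = (pq_head + PROT_QUEUE_LEN - 1) % PROT_QUEUE_LEN;

	pq[tail][pq_fill[tail]++] = pnum;
}

/* Mirror of serve_prot_queue(): called once per (global) erase operation. */
static void erase_happened(int n)
{
	for (int i = 0; i < pq_fill[pq_head]; i++)
		printf("erase #%d: PEB %d protection over\n", n, pq[pq_head][i]);
	pq_fill[pq_head] = 0;
	pq_head = (pq_head + 1) % PROT_QUEUE_LEN;
}

int main(void)
{
	protect_peb(42);
	for (int n = 1; n <= PROT_QUEUE_LEN; n++)
		erase_happened(n); /* PEB 42 is released on erase #10 */
	return 0;
}

Exactly PROT_QUEUE_LEN erase operations after a PEB is protected, the head
reaches its bucket and the PEB is released - which is the sense in which the
queue length measures protection "time" in erase cycles.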
 
 #include <linux/slab.h>
 #define WL_RESERVED_PEBS 1
 
 /*
- * How many erase cycles are short term, unknown, and long term physical
- * eraseblocks protected.
- */
-#define ST_PROTECTION 16
-#define U_PROTECTION  10
-#define LT_PROTECTION 4
-
-/*
  * Maximum difference between two erase counters. If this threshold is
  * exceeded, the WL sub-system starts moving data from used physical
  * eraseblocks with low erase counter to free physical eraseblocks with high
 #define WL_MAX_FAILURES 32
 
 /**
- * struct ubi_wl_prot_entry - PEB protection entry.
- * @rb_pnum: link in the @wl->prot.pnum RB-tree
- * @rb_aec: link in the @wl->prot.aec RB-tree
- * @abs_ec: the absolute erase counter value when the protection ends
- * @e: the wear-leveling entry of the physical eraseblock under protection
- *
- * When the WL sub-system returns a physical eraseblock, the physical
- * eraseblock is protected from being moved for some "time". For this reason,
- * the physical eraseblock is not directly moved from the @wl->free tree to the
- * @wl->used tree. There is one more tree in between where this physical
- * eraseblock is temporarily stored (@wl->prot).
- *
- * All this protection stuff is needed because:
- *  o we don't want to move physical eraseblocks just after we have given them
- *    to the user; instead, we first want to let users fill them up with data;
- *
- *  o there is a chance that the user will put the physical eraseblock very
- *    soon, so it makes sense not to move it for some time, but wait; this is
- *    especially important in case of "short term" physical eraseblocks.
- *
- * Physical eraseblocks stay protected only for limited time. But the "time" is
- * measured in erase cycles in this case. This is implemented with help of the
- * absolute erase counter (@wl->abs_ec). When it reaches certain value, the
- * physical eraseblocks are moved from the protection trees (@wl->prot.*) to
- * the @wl->used tree.
- *
- * Protected physical eraseblocks are searched by physical eraseblock number
- * (when they are put) and by the absolute erase counter (to check if it is
- * time to move them to the @wl->used tree). So there are actually 2 RB-trees
- * storing the protected physical eraseblocks: @wl->prot.pnum and
- * @wl->prot.aec. They are referred to as the "protection" trees. The
- * first one is indexed by the physical eraseblock number. The second one is
- * indexed by the absolute erase counter. Both trees store
- * &struct ubi_wl_prot_entry objects.
- *
- * Each physical eraseblock has 2 main states: free and used. The former state
- * corresponds to the @wl->free tree. The latter state is split up on several
- * sub-states:
- * o the WL movement is allowed (@wl->used tree);
- * o the WL movement is temporarily prohibited (@wl->prot.pnum and
- * @wl->prot.aec trees);
- * o scrubbing is needed (@wl->scrub tree).
- *
- * Depending on the sub-state, wear-leveling entries of the used physical
- * eraseblocks may be kept in one of those trees.
- */
-struct ubi_wl_prot_entry {
-       struct rb_node rb_pnum;
-       struct rb_node rb_aec;
-       unsigned long long abs_ec;
-       struct ubi_wl_entry *e;
-};
-
-/**
  * struct ubi_work - UBI work description data structure.
  * @list: a link in the list of pending works
  * @func: worker function
- * @priv: private data of the worker function
  * @e: physical eraseblock to erase
  * @torture: if the physical eraseblock has to be tortured
  *
@@ -198,9 +165,11 @@ struct ubi_work {
 static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec);
 static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
                                     struct rb_root *root);
+static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e);
 #else
 #define paranoid_check_ec(ubi, pnum, ec) 0
 #define paranoid_check_in_wl_tree(e, root)
+#define paranoid_check_in_pq(ubi, e) 0
 #endif
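
The #else branch above is the usual kernel idiom for compile-time-optional
sanity checks: with paranoid debugging disabled, the checks collapse to
constant zero ("check passed") and the compiler discards the callers' error
paths. A condensed, compilable sketch of the idiom, using an assumed example
symbol rather than a real config option:

#include <stdio.h>

#ifdef CONFIG_EXAMPLE_PARANOID
/* Real check: returns 0 if consistent, 1 if an inconsistency was found. */
static int example_check(int pnum)
{
	return pnum < 0;
}
#else
/* Compiled out: always "passes" and costs nothing at run time. */
#define example_check(pnum) 0
#endif

int main(void)
{
	if (example_check(7))
		printf("inconsistency found\n"); /* dead code when disabled */
	return 0;
}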
 
 /**
@@ -220,7 +189,7 @@ static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
                struct ubi_wl_entry *e1;
 
                parent = *p;
-               e1 = rb_entry(parent, struct ubi_wl_entry, rb);
+               e1 = rb_entry(parent, struct ubi_wl_entry, u.rb);
 
                if (e->ec < e1->ec)
                        p = &(*p)->rb_left;
@@ -235,8 +204,8 @@ static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
                }
        }
 
-       rb_link_node(&e->rb, parent, p);
-       rb_insert_color(&e->rb, root);
+       rb_link_node(&e->u.rb, parent, p);
+       rb_insert_color(&e->u.rb, root);
 }
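
The rb -> u.rb renames in this and the following hunks reflect that a
wear-leveling entry now lives either in one of the RB-trees or in a
protection-queue list, never in both at once, so the two link types can share
storage in a union. A sketch of what the declaration plausibly looks like (the
actual definition is in ubi.h, not in this file):

struct ubi_wl_entry {
	union {
		struct rb_node rb;     /* link in @wl->used, @wl->free, ... */
		struct list_head list; /* link in a @ubi->pq bucket */
	} u;
	int ec;                        /* erase counter */
	int pnum;                      /* physical eraseblock number */
};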
 
 /**
@@ -331,7 +300,7 @@ static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
        while (p) {
                struct ubi_wl_entry *e1;
 
-               e1 = rb_entry(p, struct ubi_wl_entry, rb);
+               e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
 
                if (e->pnum == e1->pnum) {
                        ubi_assert(e == e1);
@@ -355,50 +324,24 @@ static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
 }
 
 /**
- * prot_tree_add - add physical eraseblock to protection trees.
+ * prot_queue_add - add physical eraseblock to the protection queue.
  * @ubi: UBI device description object
  * @e: the physical eraseblock to add
- * @pe: protection entry object to use
- * @abs_ec: absolute erase counter value when this physical eraseblock has
- * to be removed from the protection trees.
  *
- * @wl->lock has to be locked.
+ * This function adds @e to the tail of the protection queue @ubi->pq, where
+ * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be
+ * temporarily protected from the wear-leveling worker. Note, @wl->lock has to
+ * be locked.
  */
-static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e,
-                         struct ubi_wl_prot_entry *pe, int abs_ec)
+static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
 {
-       struct rb_node **p, *parent = NULL;
-       struct ubi_wl_prot_entry *pe1;
-
-       pe->e = e;
-       pe->abs_ec = ubi->abs_ec + abs_ec;
-
-       p = &ubi->prot.pnum.rb_node;
-       while (*p) {
-               parent = *p;
-               pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum);
-
-               if (e->pnum < pe1->e->pnum)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-       rb_link_node(&pe->rb_pnum, parent, p);
-       rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum);
+       int pq_tail = ubi->pq_head - 1;
 
-       p = &ubi->prot.aec.rb_node;
-       parent = NULL;
-       while (*p) {
-               parent = *p;
-               pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec);
-
-               if (pe->abs_ec < pe1->abs_ec)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-       rb_link_node(&pe->rb_aec, parent, p);
-       rb_insert_color(&pe->rb_aec, &ubi->prot.aec);
+       if (pq_tail < 0)
+               pq_tail = UBI_PROT_QUEUE_LEN - 1;
+       ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN);
+       list_add_tail(&e->u.list, &ubi->pq[pq_tail]);
+       dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec);
 }
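
The open-coded wrap-around when computing the tail is simply a modular
decrement of the head index; a hypothetical helper (not part of the patch)
makes that explicit:

/* Hypothetical equivalent of the open-coded wrap-around above. */
static inline int prot_queue_tail(const struct ubi_device *ubi)
{
	return (ubi->pq_head + UBI_PROT_QUEUE_LEN - 1) % UBI_PROT_QUEUE_LEN;
}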
 
 /**
@@ -414,14 +357,14 @@ static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
        struct rb_node *p;
        struct ubi_wl_entry *e;
 
-       e = rb_entry(rb_first(root), struct ubi_wl_entry, rb);
+       e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
        max += e->ec;
 
        p = root->rb_node;
        while (p) {
                struct ubi_wl_entry *e1;
 
-               e1 = rb_entry(p, struct ubi_wl_entry, rb);
+               e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
                if (e1->ec >= max)
                        p = p->rb_left;
                else {
@@ -443,17 +386,12 @@ static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
  */
 int ubi_wl_get_peb(struct ubi_device *ubi, int dtype)
 {
-       int err, protect, medium_ec;
+       int err, medium_ec;
        struct ubi_wl_entry *e, *first, *last;
-       struct ubi_wl_prot_entry *pe;
 
        ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM ||
                   dtype == UBI_UNKNOWN);
 
-       pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_NOFS);
-       if (!pe)
-               return -ENOMEM;
-
 retry:
        spin_lock(&ubi->wl_lock);
        if (!ubi->free.rb_node) {
@@ -461,16 +399,13 @@ retry:
                        ubi_assert(list_empty(&ubi->works));
                        ubi_err("no free eraseblocks");
                        spin_unlock(&ubi->wl_lock);
-                       kfree(pe);
                        return -ENOSPC;
                }
                spin_unlock(&ubi->wl_lock);
 
                err = produce_free_peb(ubi);
-               if (err < 0) {
-                       kfree(pe);
+               if (err < 0)
                        return err;
-               }
                goto retry;
        }
 
@@ -483,7 +418,6 @@ retry:
                 * %WL_FREE_MAX_DIFF.
                 */
                e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
-               protect = LT_PROTECTION;
                break;
        case UBI_UNKNOWN:
                /*
@@ -492,81 +426,63 @@ retry:
                 * eraseblock with an erase counter greater than or equal to
                 * the lowest erase counter plus %WL_FREE_MAX_DIFF.
                 */
-               first = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, rb);
-               last = rb_entry(rb_last(&ubi->free), struct ubi_wl_entry, rb);
+               first = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry,
+                                       u.rb);
+               last = rb_entry(rb_last(&ubi->free), struct ubi_wl_entry, u.rb);
 
                if (last->ec - first->ec < WL_FREE_MAX_DIFF)
                        e = rb_entry(ubi->free.rb_node,
-                                       struct ubi_wl_entry, rb);
+                                       struct ubi_wl_entry, u.rb);
                else {
                        medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2;
                        e = find_wl_entry(&ubi->free, medium_ec);
                }
-               protect = U_PROTECTION;
                break;
        case UBI_SHORTTERM:
                /*
                 * For short term data we pick a physical eraseblock with the
                 * lowest erase counter as we expect it will be erased soon.
                 */
-               e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, rb);
-               protect = ST_PROTECTION;
+               e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, u.rb);
                break;
        default:
-               protect = 0;
-               e = NULL;
                BUG();
        }
 
+       paranoid_check_in_wl_tree(e, &ubi->free);
+
        /*
-        * Move the physical eraseblock to the protection trees where it will
+        * Move the physical eraseblock to the protection queue where it will
         * be protected from being moved for some time.
         */
-       paranoid_check_in_wl_tree(e, &ubi->free);
-       rb_erase(&e->rb, &ubi->free);
-       prot_tree_add(ubi, e, pe, protect);
-
-       dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect);
+       rb_erase(&e->u.rb, &ubi->free);
+       dbg_wl("PEB %d EC %d", e->pnum, e->ec);
+       prot_queue_add(ubi, e);
        spin_unlock(&ubi->wl_lock);
-
        return e->pnum;
 }
 
 /**
- * prot_tree_del - remove a physical eraseblock from the protection trees
+ * prot_queue_del - remove a physical eraseblock from the protection queue.
  * @ubi: UBI device description object
  * @pnum: the physical eraseblock to remove
  *
- * This function returns PEB @pnum from the protection trees and returns zero
- * in case of success and %-ENODEV if the PEB was not found in the protection
- * trees.
+ * This function deletes PEB @pnum from the protection queue and returns zero
+ * in case of success and %-ENODEV if the PEB was not found.
  */
-static int prot_tree_del(struct ubi_device *ubi, int pnum)
+static int prot_queue_del(struct ubi_device *ubi, int pnum)
 {
-       struct rb_node *p;
-       struct ubi_wl_prot_entry *pe = NULL;
-
-       p = ubi->prot.pnum.rb_node;
-       while (p) {
-
-               pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum);
-
-               if (pnum == pe->e->pnum)
-                       goto found;
+       struct ubi_wl_entry *e;
 
-               if (pnum < pe->e->pnum)
-                       p = p->rb_left;
-               else
-                       p = p->rb_right;
-       }
+       e = ubi->lookuptbl[pnum];
+       if (!e)
+               return -ENODEV;
 
-       return -ENODEV;
+       if (paranoid_check_in_pq(ubi, e))
+               return -ENODEV;
 
-found:
-       ubi_assert(pe->e->pnum == pnum);
-       rb_erase(&pe->rb_aec, &ubi->prot.aec);
-       rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
-       kfree(pe);
+       list_del(&e->u.list);
+       dbg_wl("deleted PEB %d from the protection queue", e->pnum);
        return 0;
 }
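
Deletion is now O(1): @ubi->lookuptbl maps a PEB number straight to its entry,
and because the queue buckets are doubly-linked lists, list_del() can unlink
the entry without knowing which bucket holds it. Conceptually the unlink step
does no more than this (simplified sketch, without the kernel's pointer
poisoning):

/* What list_del() does, conceptually. */
static inline void sketch_list_del(struct list_head *entry)
{
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
}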
 
@@ -632,47 +548,47 @@ out_free:
 }
 
 /**
- * check_protection_over - check if it is time to stop protecting some PEBs.
+ * serve_prot_queue - check if it is time to stop protecting PEBs.
  * @ubi: UBI device description object
  *
- * This function is called after each erase operation, when the absolute erase
- * counter is incremented, to check if some physical eraseblock  have not to be
- * protected any longer. These physical eraseblocks are moved from the
- * protection trees to the used tree.
+ * This function is called after each erase operation and removes PEBs from the
+ * tail of the protection queue. These PEBs have been protected for long enough
+ * and should be moved to the used tree.
  */
-static void check_protection_over(struct ubi_device *ubi)
+static void serve_prot_queue(struct ubi_device *ubi)
 {
-       struct ubi_wl_prot_entry *pe;
+       struct ubi_wl_entry *e, *tmp;
+       int count;
 
        /*
         * There may be several protected physical eraseblocks to remove,
         * process them all.
         */
-       while (1) {
-               spin_lock(&ubi->wl_lock);
-               if (!ubi->prot.aec.rb_node) {
-                       spin_unlock(&ubi->wl_lock);
-                       break;
-               }
-
-               pe = rb_entry(rb_first(&ubi->prot.aec),
-                             struct ubi_wl_prot_entry, rb_aec);
+repeat:
+       count = 0;
+       spin_lock(&ubi->wl_lock);
+       list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) {
+               dbg_wl("PEB %d EC %d protection over, move to used tree",
+                       e->pnum, e->ec);
 
-               if (pe->abs_ec > ubi->abs_ec) {
+               list_del(&e->u.list);
+               wl_tree_add(e, &ubi->used);
+               if (count++ > 32) {
+                       /*
+                        * Let's be nice and avoid holding the spinlock for
+                        * too long.
+                        */
                        spin_unlock(&ubi->wl_lock);
-                       break;
+                       cond_resched();
+                       goto repeat;
                }
-
-               dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu",
-                      pe->e->pnum, ubi->abs_ec, pe->abs_ec);
-               rb_erase(&pe->rb_aec, &ubi->prot.aec);
-               rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
-               wl_tree_add(pe->e, &ubi->used);
-               spin_unlock(&ubi->wl_lock);
-
-               kfree(pe);
-               cond_resched();
        }
+
+       ubi->pq_head += 1;
+       if (ubi->pq_head == UBI_PROT_QUEUE_LEN)
+               ubi->pq_head = 0;
+       ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN);
+       spin_unlock(&ubi->wl_lock);
 }
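
The count > 32 check bounds how long @wl->lock is held: drain a batch, drop
the lock, reschedule, and rescan from the (now shorter) queue head. The same
pattern as a runnable user-space toy, with stubbed lock/yield primitives (all
names here are illustrative):

#include <stdio.h>

#define BATCH 32

static int pending = 100; /* items waiting in the toy "queue" */

static void lock(void)      { /* spin_lock(&ubi->wl_lock) in the kernel */ }
static void unlock(void)    { /* spin_unlock(&ubi->wl_lock) */ }
static void yield_cpu(void) { /* cond_resched() */ }

int main(void)
{
	int count;

repeat:
	count = 0;
	lock();
	while (pending > 0) {
		pending--; /* "move one PEB to @wl->used" */
		if (++count > BATCH) {
			unlock(); /* avoid holding the lock for too long */
			yield_cpu();
			goto repeat; /* rescan; the queue head has advanced */
		}
	}
	unlock();
	printf("queue drained\n");
	return 0;
}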
 
 /**
@@ -680,8 +596,8 @@ static void check_protection_over(struct ubi_device *ubi)
  * @ubi: UBI device description object
  * @wrk: the work to schedule
  *
- * This function enqueues a work defined by @wrk to the tail of the pending
- * works list.
+ * This function adds a work defined by @wrk to the tail of the pending works
+ * list.
  */
 static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
 {
@@ -739,13 +655,11 @@ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
 static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
                                int cancel)
 {
-       int err, put = 0, scrubbing = 0, protect = 0;
-       struct ubi_wl_prot_entry *uninitialized_var(pe);
+       int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;
        struct ubi_wl_entry *e1, *e2;
        struct ubi_vid_hdr *vid_hdr;
 
        kfree(wrk);
-
        if (cancel)
                return 0;
 
@@ -781,7 +695,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
                 * highly worn-out free physical eraseblock. If the erase
                 * counters differ much enough, start wear-leveling.
                 */
-               e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
+               e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
                e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
 
                if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
@@ -790,21 +704,21 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
                        goto out_cancel;
                }
                paranoid_check_in_wl_tree(e1, &ubi->used);
-               rb_erase(&e1->rb, &ubi->used);
+               rb_erase(&e1->u.rb, &ubi->used);
                dbg_wl("move PEB %d EC %d to PEB %d EC %d",
                       e1->pnum, e1->ec, e2->pnum, e2->ec);
        } else {
                /* Perform scrubbing */
                scrubbing = 1;
-               e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb);
+               e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);
                e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
                paranoid_check_in_wl_tree(e1, &ubi->scrub);
-               rb_erase(&e1->rb, &ubi->scrub);
+               rb_erase(&e1->u.rb, &ubi->scrub);
                dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
        }
 
        paranoid_check_in_wl_tree(e2, &ubi->free);
-       rb_erase(&e2->rb, &ubi->free);
+       rb_erase(&e2->u.rb, &ubi->free);
        ubi->move_from = e1;
        ubi->move_to = e2;
        spin_unlock(&ubi->wl_lock);
@@ -826,81 +740,104 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
                        /*
                         * We are trying to move a PEB without a VID header.
                         * UBI always writes VID headers shortly after the PEB was
-                        * given, so we have a situation when it did not have
-                        * chance to write it down because it was preempted.
-                        * Just re-schedule the work, so that next time it will
-                        * likely have the VID header in place.
+                        * given, so we have a situation when it has not yet
+                        * had a chance to write it, because it was preempted.
+                        * So put this PEB to the protection queue for now,
+                        * because presumably more data will be written there
+                        * (including the missing VID header), and then we'll
+                        * move it.
                         */
                        dbg_wl("PEB %d has no VID header", e1->pnum);
+                       protect = 1;
                        goto out_not_moved;
                }
 
                ubi_err("error %d while reading VID header from PEB %d",
                        err, e1->pnum);
-               if (err > 0)
-                       err = -EIO;
                goto out_error;
        }
 
        err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);
        if (err) {
+               if (err == MOVE_CANCEL_RACE) {
+                       /*
+                        * The LEB has not been moved because the volume is
+                        * being deleted or the PEB has been put meanwhile. We
+                        * should prevent this PEB from being selected for
+                        * wear-leveling movement again, so put it to the
+                        * protection queue.
+                        */
+                       protect = 1;
+                       goto out_not_moved;
+               }
 
-               if (err < 0)
-                       goto out_error;
-               if (err == 1)
+               if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
+                   err == MOVE_TARGET_RD_ERR) {
+                       /* Target PEB bit-flips or write error, torture it */
+                       torture = 1;
                        goto out_not_moved;
+               }
 
-               /*
-                * For some reason the LEB was not moved - it might be because
-                * the volume is being deleted. We should prevent this PEB from
-                * being selected for wear-levelling movement for some "time",
-                * so put it to the protection tree.
-                */
+               if (err == MOVE_SOURCE_RD_ERR) {
+                       /*
+                        * An error happened while reading the source PEB. Do
+                        * not switch to R/O mode in this case, and give the
+                        * upper layers a possibility to recover from this,
+                        * e.g. by unmapping corresponding LEB. Instead, just
+                        * put this PEB to the @ubi->erroneous list to prevent
+                        * UBI from trying to move it over and over again.
+                        */
+                       if (ubi->erroneous_peb_count > ubi->max_erroneous) {
+                               ubi_err("too many erroneous eraseblocks (%d)",
+                                       ubi->erroneous_peb_count);
+                               goto out_error;
+                       }
+                       erroneous = 1;
+                       goto out_not_moved;
+               }
 
-               dbg_wl("cancelled moving PEB %d", e1->pnum);
-               pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_NOFS);
-               if (!pe) {
-                       err = -ENOMEM;
+               if (err < 0)
                        goto out_error;
-               }
 
-               protect = 1;
+               ubi_assert(0);
        }
 
+       /* The PEB has been successfully moved */
        ubi_free_vid_hdr(ubi, vid_hdr);
-       if (scrubbing && !protect)
+       if (scrubbing)
                ubi_msg("scrubbed PEB %d, data moved to PEB %d",
                        e1->pnum, e2->pnum);
 
        spin_lock(&ubi->wl_lock);
-       if (protect)
-               prot_tree_add(ubi, e1, pe, protect);
-       if (!ubi->move_to_put)
+       if (!ubi->move_to_put) {
                wl_tree_add(e2, &ubi->used);
-       else
-               put = 1;
+               e2 = NULL;
+       }
        ubi->move_from = ubi->move_to = NULL;
        ubi->move_to_put = ubi->wl_scheduled = 0;
        spin_unlock(&ubi->wl_lock);
 
-       if (put) {
+       err = schedule_erase(ubi, e1, 0);
+       if (err) {
+               kmem_cache_free(ubi_wl_entry_slab, e1);
+               if (e2)
+                       kmem_cache_free(ubi_wl_entry_slab, e2);
+               goto out_ro;
+       }
+
+       if (e2) {
                /*
                 * Well, the target PEB was put meanwhile, schedule it for
                 * erasure.
                 */
                dbg_wl("PEB %d was put meanwhile, erase", e2->pnum);
                err = schedule_erase(ubi, e2, 0);
-               if (err)
-                       goto out_error;
-       }
-
-       if (!protect) {
-               err = schedule_erase(ubi, e1, 0);
-               if (err)
-                       goto out_error;
+               if (err) {
+                       kmem_cache_free(ubi_wl_entry_slab, e2);
+                       goto out_ro;
+               }
        }
 
-
        dbg_wl("done");
        mutex_unlock(&ubi->move_mutex);
        return 0;
@@ -908,42 +845,52 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
        /*
         * For some reason the LEB was not moved - it might be an error or
         * something else. @e1 was not changed, so put it back. @e2 might
-        * be changed, schedule it for erasure.
+        * have been changed, schedule it for erasure.
         */
 out_not_moved:
-       ubi_free_vid_hdr(ubi, vid_hdr);
+       dbg_wl("cancel moving PEB %d to PEB %d (%d)",
+              e1->pnum, e2->pnum, err);
        spin_lock(&ubi->wl_lock);
-       if (scrubbing)
+       if (protect)
+               prot_queue_add(ubi, e1);
+       else if (erroneous) {
+               wl_tree_add(e1, &ubi->erroneous);
+               ubi->erroneous_peb_count += 1;
+       } else if (scrubbing)
                wl_tree_add(e1, &ubi->scrub);
        else
                wl_tree_add(e1, &ubi->used);
+       ubi_assert(!ubi->move_to_put);
        ubi->move_from = ubi->move_to = NULL;
-       ubi->move_to_put = ubi->wl_scheduled = 0;
+       ubi->wl_scheduled = 0;
        spin_unlock(&ubi->wl_lock);
 
-       err = schedule_erase(ubi, e2, 0);
-       if (err)
-               goto out_error;
-
+       ubi_free_vid_hdr(ubi, vid_hdr);
+       err = schedule_erase(ubi, e2, torture);
+       if (err) {
+               kmem_cache_free(ubi_wl_entry_slab, e2);
+               goto out_ro;
+       }
        mutex_unlock(&ubi->move_mutex);
        return 0;
 
 out_error:
        ubi_err("error %d while moving PEB %d to PEB %d",
                err, e1->pnum, e2->pnum);
-
-       ubi_free_vid_hdr(ubi, vid_hdr);
        spin_lock(&ubi->wl_lock);
        ubi->move_from = ubi->move_to = NULL;
        ubi->move_to_put = ubi->wl_scheduled = 0;
        spin_unlock(&ubi->wl_lock);
 
+       ubi_free_vid_hdr(ubi, vid_hdr);
        kmem_cache_free(ubi_wl_entry_slab, e1);
        kmem_cache_free(ubi_wl_entry_slab, e2);
-       ubi_ro_mode(ubi);
 
+out_ro:
+       ubi_ro_mode(ubi);
        mutex_unlock(&ubi->move_mutex);
-       return err;
+       ubi_assert(err != 0);
+       return err < 0 ? err : -EIO;
 
 out_cancel:
        ubi->wl_scheduled = 0;
@@ -988,7 +935,7 @@ static int ensure_wear_leveling(struct ubi_device *ubi)
                 * erase counter of free physical eraseblocks is greater than
                 * %UBI_WL_THRESHOLD.
                 */
-               e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
+               e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
                e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
 
                if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
@@ -1050,7 +997,6 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
                kfree(wl_wrk);
 
                spin_lock(&ubi->wl_lock);
-               ubi->abs_ec += 1;
                wl_tree_add(e, &ubi->free);
                spin_unlock(&ubi->wl_lock);
 
@@ -1058,7 +1004,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
                 * One more erase operation has happened, take care about
                 * protected physical eraseblocks.
                 */
-               check_protection_over(ubi);
+               serve_prot_queue(ubi);
 
                /* And take care about wear-leveling */
                err = ensure_wear_leveling(ubi);
@@ -1084,7 +1030,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
                /*
                 * If this is not %-EIO, we have no idea what to do. Scheduling
                 * this physical eraseblock for erasure again would cause
-                * errors again and again. Well, lets switch to RO mode.
+                * errors again and again. Well, let's switch to R/O mode.
                 */
                goto out_ro;
        }
@@ -1190,12 +1136,19 @@ retry:
        } else {
                if (in_wl_tree(e, &ubi->used)) {
                        paranoid_check_in_wl_tree(e, &ubi->used);
-                       rb_erase(&e->rb, &ubi->used);
+                       rb_erase(&e->u.rb, &ubi->used);
                } else if (in_wl_tree(e, &ubi->scrub)) {
                        paranoid_check_in_wl_tree(e, &ubi->scrub);
-                       rb_erase(&e->rb, &ubi->scrub);
+                       rb_erase(&e->u.rb, &ubi->scrub);
+               } else if (in_wl_tree(e, &ubi->erroneous)) {
+                       paranoid_check_in_wl_tree(e, &ubi->erroneous);
+                       rb_erase(&e->u.rb, &ubi->erroneous);
+                       ubi->erroneous_peb_count -= 1;
+                       ubi_assert(ubi->erroneous_peb_count >= 0);
+                       /* Erroneous PEBs should be tortured */
+                       torture = 1;
                } else {
-                       err = prot_tree_del(ubi, e->pnum);
+                       err = prot_queue_del(ubi, e->pnum);
                        if (err) {
                                ubi_err("PEB %d not found", pnum);
                                ubi_ro_mode(ubi);
@@ -1255,11 +1208,11 @@ retry:
 
        if (in_wl_tree(e, &ubi->used)) {
                paranoid_check_in_wl_tree(e, &ubi->used);
-               rb_erase(&e->rb, &ubi->used);
+               rb_erase(&e->u.rb, &ubi->used);
        } else {
                int err;
 
-               err = prot_tree_del(ubi, e->pnum);
+               err = prot_queue_del(ubi, e->pnum);
                if (err) {
                        ubi_err("PEB %d not found", pnum);
                        ubi_ro_mode(ubi);
@@ -1290,7 +1243,7 @@ int ubi_wl_flush(struct ubi_device *ubi)
        int err;
 
        /*
-        * Erase while the pending works queue is not empty, but not more then
+        * Erase while the pending works queue is not empty, but not more than
         * the number of currently pending works.
         */
        dbg_wl("flush (%d pending works)", ubi->works_count);
@@ -1308,7 +1261,7 @@ int ubi_wl_flush(struct ubi_device *ubi)
        up_write(&ubi->work_sem);
 
        /*
-        * And in case last was the WL worker and it cancelled the LEB
+        * And in case last was the WL worker and it canceled the LEB
         * movement, flush again.
         */
        while (ubi->works_count) {
@@ -1337,11 +1290,11 @@ static void tree_destroy(struct rb_root *root)
                else if (rb->rb_right)
                        rb = rb->rb_right;
                else {
-                       e = rb_entry(rb, struct ubi_wl_entry, rb);
+                       e = rb_entry(rb, struct ubi_wl_entry, u.rb);
 
                        rb = rb_parent(rb);
                        if (rb) {
-                               if (rb->rb_left == &e->rb)
+                               if (rb->rb_left == &e->u.rb)
                                        rb->rb_left = NULL;
                                else
                                        rb->rb_right = NULL;
@@ -1436,15 +1389,13 @@ static void cancel_pending(struct ubi_device *ubi)
  */
 int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
 {
-       int err;
+       int err, i;
        struct rb_node *rb1, *rb2;
        struct ubi_scan_volume *sv;
        struct ubi_scan_leb *seb, *tmp;
        struct ubi_wl_entry *e;
 
-
-       ubi->used = ubi->free = ubi->scrub = RB_ROOT;
-       ubi->prot.pnum = ubi->prot.aec = RB_ROOT;
+       ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT;
        spin_lock_init(&ubi->wl_lock);
        mutex_init(&ubi->move_mutex);
        init_rwsem(&ubi->work_sem);
@@ -1458,6 +1409,10 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
        if (!ubi->lookuptbl)
                return err;
 
+       for (i = 0; i < UBI_PROT_QUEUE_LEN; i++)
+               INIT_LIST_HEAD(&ubi->pq[i]);
+       ubi->pq_head = 0;
+
        list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {
                cond_resched();
 
@@ -1552,33 +1507,18 @@ out_free:
 }
 
 /**
- * protection_trees_destroy - destroy the protection RB-trees.
+ * protection_queue_destroy - destroy the protection queue.
  * @ubi: UBI device description object
  */
-static void protection_trees_destroy(struct ubi_device *ubi)
+static void protection_queue_destroy(struct ubi_device *ubi)
 {
-       struct rb_node *rb;
-       struct ubi_wl_prot_entry *pe;
-
-       rb = ubi->prot.aec.rb_node;
-       while (rb) {
-               if (rb->rb_left)
-                       rb = rb->rb_left;
-               else if (rb->rb_right)
-                       rb = rb->rb_right;
-               else {
-                       pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec);
-
-                       rb = rb_parent(rb);
-                       if (rb) {
-                               if (rb->rb_left == &pe->rb_aec)
-                                       rb->rb_left = NULL;
-                               else
-                                       rb->rb_right = NULL;
-                       }
+       int i;
+       struct ubi_wl_entry *e, *tmp;
 
-                       kmem_cache_free(ubi_wl_entry_slab, pe->e);
-                       kfree(pe);
+       for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) {
+               list_for_each_entry_safe(e, tmp, &ubi->pq[i], u.list) {
+                       list_del(&e->u.list);
+                       kmem_cache_free(ubi_wl_entry_slab, e);
                }
        }
 }
@@ -1591,8 +1531,9 @@ void ubi_wl_close(struct ubi_device *ubi)
 {
        dbg_wl("close the WL sub-system");
        cancel_pending(ubi);
-       protection_trees_destroy(ubi);
+       protection_queue_destroy(ubi);
        tree_destroy(&ubi->used);
+       tree_destroy(&ubi->erroneous);
        tree_destroy(&ubi->free);
        tree_destroy(&ubi->scrub);
        kfree(ubi->lookuptbl);
@@ -1661,4 +1602,27 @@ static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
        return 1;
 }
 
+/**
+ * paranoid_check_in_pq - check if wear-leveling entry is in the protection
+ *                        queue.
+ * @ubi: UBI device description object
+ * @e: the wear-leveling entry to check
+ *
+ * This function returns zero if @e is in @ubi->pq and %1 if it is not.
+ */
+static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e)
+{
+       struct ubi_wl_entry *p;
+       int i;
+
+       for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i)
+               list_for_each_entry(p, &ubi->pq[i], u.list)
+                       if (p == e)
+                               return 0;
+
+       ubi_err("paranoid check failed for PEB %d, EC %d, Protect queue",
+               e->pnum, e->ec);
+       ubi_dbg_dump_stack();
+       return 1;
+}
 #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */