// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) International Business Machines Corp., 2006
 *
 * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner
 */

/*
 * UBI wear-leveling sub-system.
 *
 * This sub-system is responsible for wear-leveling. It works in terms of
 * physical eraseblocks and erase counters and knows nothing about logical
 * eraseblocks, volumes, etc. From this sub-system's perspective all physical
 * eraseblocks are of two types - used and free. Used physical eraseblocks are
 * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical
 * eraseblocks are those that were put by the 'ubi_wl_put_peb()' function.
 *
 * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter
 * header. The rest of the physical eraseblock contains only %0xFF bytes.
 *
 * When physical eraseblocks are returned to the WL sub-system by means of the
 * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is
 * done asynchronously in the context of the per-UBI device background thread,
 * which is also managed by the WL sub-system.
 *
 * The wear-leveling is ensured by means of moving the contents of used
 * physical eraseblocks with low erase counter to free physical eraseblocks
 * with high erase counter.
 *
 * If the WL sub-system fails to erase a physical eraseblock, it marks it as
 * bad.
 *
 * This sub-system is also responsible for scrubbing. If a bit-flip is detected
 * in a physical eraseblock, it has to be moved. Technically this is the same
 * as moving it for wear-leveling reasons.
 *
 * As mentioned above, for the WL sub-system all physical eraseblocks are
 * either "free" or "used". Free eraseblocks are kept in the @wl->free RB-tree,
 * while used eraseblocks are kept in the @wl->used, @wl->erroneous, or
 * @wl->scrub RB-trees, as well as (temporarily) in the @wl->pq queue.
 *
 * When the WL sub-system returns a physical eraseblock, the physical
 * eraseblock is protected from being moved for some "time". For this reason,
 * the physical eraseblock is not directly moved from the @wl->free tree to the
 * @wl->used tree. There is a protection queue in between where this
 * physical eraseblock is temporarily stored (@wl->pq).
 *
 * All this protection stuff is needed because:
 *  o we don't want to move physical eraseblocks just after we have given them
 *    to the user; instead, we first want to let users fill them up with data;
 *
 *  o there is a chance that the user will put the physical eraseblock very
 *    soon, so it makes sense not to move it for some time, but wait.
 *
 * Physical eraseblocks stay protected only for a limited time. But the "time"
 * is measured in erase cycles in this case. This is implemented with help of
 * the protection queue. Eraseblocks are put to the tail of this queue when
 * they are returned by 'ubi_wl_get_peb()', and eraseblocks are removed from
 * the head of the queue on each erase operation (for any eraseblock). So the
 * length of the queue defines how many (global) erase cycles PEBs are
 * protected. For example, with a queue of length 10, a PEB handed out by
 * 'ubi_wl_get_peb()' stays protected until about 10 subsequent erase
 * operations have completed somewhere on the device.
 *
 * To put it differently, each physical eraseblock has 2 main states: free and
 * used. The former state corresponds to the @wl->free tree. The latter state
 * is split up into several sub-states:
 * o the WL movement is allowed (@wl->used tree);
 * o the WL movement is disallowed (@wl->erroneous) because the PEB is
 *   erroneous - e.g., there was a read error;
 * o the WL movement is temporarily prohibited (@wl->pq queue);
 * o scrubbing is needed (@wl->scrub tree).
 *
 * Depending on the sub-state, wear-leveling entries of the used physical
 * eraseblocks may be kept in one of those structures.
 *
 * Note, in this implementation, we keep a small in-RAM object for each physical
 * eraseblock. This is surely not a scalable solution. But it appears to be good
 * enough for moderately large flashes and it is simple. In future, one may
 * re-work this sub-system and make it more scalable.
 *
 * At the moment this sub-system does not utilize the sequence number, which
 * was introduced relatively recently. But it would be wise to do this because
 * the sequence number of a logical eraseblock characterizes how old it is. For
 * example, when we move a PEB with low erase counter, and we need to pick the
 * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
 * pick a target PEB with an average EC if our PEB is not very "old". This is
 * an area for future re-work of the WL sub-system.
 */

#include <linux/slab.h>
#include <linux/crc32.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include "ubi.h"
#include "wl.h"

/* Number of physical eraseblocks reserved for wear-leveling purposes */
#define WL_RESERVED_PEBS 1

/*
 * Maximum difference between two erase counters. If this threshold is
 * exceeded, the WL sub-system starts moving data from used physical
 * eraseblocks with low erase counter to free physical eraseblocks with high
 * erase counter.
 */
#define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD

/*
 * When a physical eraseblock is moved, the WL sub-system has to pick the target
 * physical eraseblock to move to. The simplest way would be just to pick the
 * one with the highest erase counter. But in certain workloads this could lead
 * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a
 * situation when the picked physical eraseblock is constantly erased after the
 * data is written to it. So, we have a constant which limits the highest erase
 * counter of the free physical eraseblock to pick. Namely, the WL sub-system
 * does not pick eraseblocks with erase counter greater than the lowest erase
 * counter plus %WL_FREE_MAX_DIFF.
 */
#define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD)

/*
 * Maximum number of consecutive background thread failures which is enough to
 * switch to read-only mode.
 */
#define WL_MAX_FAILURES 32

static int self_check_ec(struct ubi_device *ubi, int pnum, int ec);
static int self_check_in_wl_tree(const struct ubi_device *ubi,
				 struct ubi_wl_entry *e, struct rb_root *root);
static int self_check_in_pq(const struct ubi_device *ubi,
			    struct ubi_wl_entry *e);

/**
 * wl_tree_add - add a wear-leveling entry to a WL RB-tree.
 * @e: the wear-leveling entry to add
 * @root: the root of the tree
 *
 * Note, we use (erase counter, physical eraseblock number) pairs as keys in
 * the @ubi->used and @ubi->free RB-trees.
 */
static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
{
	struct rb_node **p, *parent = NULL;

	p = &root->rb_node;
	while (*p) {
		struct ubi_wl_entry *e1;

		parent = *p;
		e1 = rb_entry(parent, struct ubi_wl_entry, u.rb);

		if (e->ec < e1->ec)
			p = &(*p)->rb_left;
		else if (e->ec > e1->ec)
			p = &(*p)->rb_right;
		else {
			ubi_assert(e->pnum != e1->pnum);
			if (e->pnum < e1->pnum)
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}
	}

	rb_link_node(&e->u.rb, parent, p);
	rb_insert_color(&e->u.rb, root);
}

/**
 * wl_entry_destroy - destroy a wear-leveling entry.
 * @ubi: UBI device description object
 * @e: the wear-leveling entry to destroy
 *
 * This function destroys a wear leveling entry and removes
 * the reference from the lookup table.
 */
static void wl_entry_destroy(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	ubi->lookuptbl[e->pnum] = NULL;
	kmem_cache_free(ubi_wl_entry_slab, e);
}

/**
 * do_work - do one pending work.
 * @ubi: UBI device description object
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
static int do_work(struct ubi_device *ubi)
{
	int err;
	struct ubi_work *wrk;

	cond_resched();

	/*
	 * @ubi->work_sem is used to synchronize with the workers. Workers take
	 * it in read mode, so many of them may be doing works at a time. But
	 * the queue flush code has to be sure the whole queue of works is
	 * done, and it takes @ubi->work_sem in write mode.
	 */
	down_read(&ubi->work_sem);
	spin_lock(&ubi->wl_lock);
	if (list_empty(&ubi->works)) {
		spin_unlock(&ubi->wl_lock);
		up_read(&ubi->work_sem);
		return 0;
	}

	wrk = list_entry(ubi->works.next, struct ubi_work, list);
	list_del(&wrk->list);
	ubi->works_count -= 1;
	ubi_assert(ubi->works_count >= 0);
	spin_unlock(&ubi->wl_lock);

	/*
	 * Call the worker function. Do not touch the work structure
	 * after this call as it will have been freed or reused by that
	 * time by the worker function.
	 */
	err = wrk->func(ubi, wrk, 0);
	if (err)
		ubi_err(ubi, "work failed with error code %d", err);
	up_read(&ubi->work_sem);

	return err;
}

/**
 * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree.
 * @e: the wear-leveling entry to check
 * @root: the root of the tree
 *
 * This function returns non-zero if @e is in the @root RB-tree and zero if it
 * is not.
 */
static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
{
	struct rb_node *p;

	p = root->rb_node;
	while (p) {
		struct ubi_wl_entry *e1;

		e1 = rb_entry(p, struct ubi_wl_entry, u.rb);

		if (e->pnum == e1->pnum) {
			ubi_assert(e == e1);
			return 1;
		}

		if (e->ec < e1->ec)
			p = p->rb_left;
		else if (e->ec > e1->ec)
			p = p->rb_right;
		else {
			ubi_assert(e->pnum != e1->pnum);
			if (e->pnum < e1->pnum)
				p = p->rb_left;
			else
				p = p->rb_right;
		}
	}

	return 0;
}

/**
 * in_pq - check if a wear-leveling entry is present in the protection queue.
 * @ubi: UBI device description object
 * @e: the wear-leveling entry to check
 *
 * This function returns non-zero if @e is in the protection queue and zero
 * if it is not.
 */
static inline int in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	struct ubi_wl_entry *p;
	int i;

	for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i)
		list_for_each_entry(p, &ubi->pq[i], u.list)
			if (p == e)
				return 1;

	return 0;
}

/**
 * prot_queue_add - add physical eraseblock to the protection queue.
 * @ubi: UBI device description object
 * @e: the physical eraseblock to add
 *
 * This function adds @e to the tail of the protection queue @ubi->pq, where
 * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be
 * temporarily protected from the wear-leveling worker. Note, @ubi->wl_lock
 * has to be held.
 */
static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	int pq_tail = ubi->pq_head - 1;

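	/*
	 * New entries go to the logical tail of the queue, one slot behind
	 * pq_head (wrapping around if needed), so serve_prot_queue() will
	 * only reach them after UBI_PROT_QUEUE_LEN further erase operations.
	 */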
	if (pq_tail < 0)
		pq_tail = UBI_PROT_QUEUE_LEN - 1;
	ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN);
	list_add_tail(&e->u.list, &ubi->pq[pq_tail]);
	dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec);
}

/**
 * find_wl_entry - find wear-leveling entry closest to certain erase counter.
 * @ubi: UBI device description object
 * @root: the RB-tree where to look for
 * @diff: maximum possible difference from the smallest erase counter
 *
 * This function looks for a wear leveling entry with erase counter closest to
 * min + @diff, where min is the smallest erase counter.
 */
static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi,
					  struct rb_root *root, int diff)
{
	struct rb_node *p;
	struct ubi_wl_entry *e;
	int max;

	e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
	max = e->ec + diff;

	p = root->rb_node;
	while (p) {
		struct ubi_wl_entry *e1;

		e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
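		/*
		 * Entries with an erase counter of at least max are too worn
		 * for our purpose, so descend left. Otherwise remember this
		 * candidate and keep looking for a higher EC on the right.
		 */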
		if (e1->ec >= max)
			p = p->rb_left;
		else {
			p = p->rb_right;
			e = e1;
		}
	}

	return e;
}

/**
 * find_mean_wl_entry - find wear-leveling entry with medium erase counter.
 * @ubi: UBI device description object
 * @root: the RB-tree where to look for
 *
 * This function looks for a wear leveling entry with medium erase counter,
 * but not greater than or equal to the lowest erase counter plus
 * %WL_FREE_MAX_DIFF/2.
 */
static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi,
					       struct rb_root *root)
{
	struct ubi_wl_entry *e, *first, *last;

	first = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
	last = rb_entry(rb_last(root), struct ubi_wl_entry, u.rb);

	if (last->ec - first->ec < WL_FREE_MAX_DIFF) {
		e = rb_entry(root->rb_node, struct ubi_wl_entry, u.rb);

		/* If no fastmap has been written and this WL entry can be used
		 * as anchor PEB, hold it back and return the second best
		 * WL entry such that fastmap can use the anchor PEB later. */
		e = may_reserve_for_fm(ubi, e, root);
	} else
		e = find_wl_entry(ubi, root, WL_FREE_MAX_DIFF/2);

	return e;
}

/**
 * wl_get_wle - get a mean wl entry to be used by ubi_wl_get_peb() or
 * refill_wl_user_pool().
 * @ubi: UBI device description object
 *
 * This function returns a wear-leveling entry in case of success and
 * NULL in case of failure.
 */
static struct ubi_wl_entry *wl_get_wle(struct ubi_device *ubi)
{
	struct ubi_wl_entry *e;

	e = find_mean_wl_entry(ubi, &ubi->free);
	if (!e) {
		ubi_err(ubi, "no free eraseblocks");
		return NULL;
	}

	self_check_in_wl_tree(ubi, e, &ubi->free);

	/*
	 * Move the physical eraseblock to the protection queue where it will
	 * be protected from being moved for some time.
	 */
	rb_erase(&e->u.rb, &ubi->free);
	ubi->free_count--;
	dbg_wl("PEB %d EC %d", e->pnum, e->ec);

	return e;
}

/**
 * prot_queue_del - remove a physical eraseblock from the protection queue.
 * @ubi: UBI device description object
 * @pnum: the physical eraseblock to remove
 *
 * This function deletes PEB @pnum from the protection queue and returns zero
 * in case of success and %-ENODEV if the PEB was not found.
 */
static int prot_queue_del(struct ubi_device *ubi, int pnum)
{
	struct ubi_wl_entry *e;

	e = ubi->lookuptbl[pnum];
	if (!e)
		return -ENODEV;

	if (self_check_in_pq(ubi, e))
		return -ENODEV;

	list_del(&e->u.list);
	dbg_wl("deleted PEB %d from the protection queue", e->pnum);
	return 0;
}

/**
 * sync_erase - synchronously erase a physical eraseblock.
 * @ubi: UBI device description object
 * @e: the physical eraseblock to erase
 * @torture: if the physical eraseblock has to be tortured
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
		      int torture)
{
	int err;
	struct ubi_ec_hdr *ec_hdr;
	unsigned long long ec = e->ec;

	dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec);

	err = self_check_ec(ubi, e->pnum, e->ec);
	if (err)
		return -EINVAL;

	ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS);
	if (!ec_hdr)
		return -ENOMEM;

	err = ubi_io_sync_erase(ubi, e->pnum, torture);
	if (err < 0)
		goto out_free;

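	/*
	 * ubi_io_sync_erase() returns the number of erase operations it
	 * performed (more than one if the PEB was tortured), so account for
	 * all of them in the erase counter.
	 */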
	ec += err;
	if (ec > UBI_MAX_ERASECOUNTER) {
		/*
		 * Erase counter overflow. Upgrade UBI and use 64-bit
		 * erase counters internally.
		 */
		ubi_err(ubi, "erase counter overflow at PEB %d, EC %llu",
			e->pnum, ec);
		err = -EINVAL;
		goto out_free;
	}

	dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec);

	ec_hdr->ec = cpu_to_be64(ec);

	err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr);
	if (err)
		goto out_free;

	e->ec = ec;
	spin_lock(&ubi->wl_lock);
	if (e->ec > ubi->max_ec)
		ubi->max_ec = e->ec;
	spin_unlock(&ubi->wl_lock);

out_free:
	kfree(ec_hdr);
	return err;
}

/**
 * serve_prot_queue - check if it is time to stop protecting PEBs.
 * @ubi: UBI device description object
 *
 * This function is called after each erase operation and removes PEBs from the
 * head of the protection queue. These PEBs have been protected for long enough
 * and should be moved to the used tree.
 */
static void serve_prot_queue(struct ubi_device *ubi)
{
	struct ubi_wl_entry *e, *tmp;
	int count;

	/*
	 * There may be several protected physical eraseblocks to remove,
	 * process them all.
	 */
repeat:
	count = 0;
	spin_lock(&ubi->wl_lock);
	list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) {
		dbg_wl("PEB %d EC %d protection over, move to used tree",
			e->pnum, e->ec);

		list_del(&e->u.list);
		wl_tree_add(e, &ubi->used);
		if (count++ > 32) {
			/*
			 * Let's be nice and avoid holding the spinlock for
			 * too long.
			 */
			spin_unlock(&ubi->wl_lock);
			cond_resched();
			goto repeat;
		}
	}

	ubi->pq_head += 1;
	if (ubi->pq_head == UBI_PROT_QUEUE_LEN)
		ubi->pq_head = 0;
	ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN);
	spin_unlock(&ubi->wl_lock);
}

/**
 * __schedule_ubi_work - schedule a work.
 * @ubi: UBI device description object
 * @wrk: the work to schedule
 *
 * This function adds a work defined by @wrk to the tail of the pending works
 * list. Can only be used if ubi->work_sem is already held in read mode!
 */
static void __schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
{
	spin_lock(&ubi->wl_lock);
	list_add_tail(&wrk->list, &ubi->works);
	ubi_assert(ubi->works_count >= 0);
	ubi->works_count += 1;
	if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled(ubi))
		wake_up_process(ubi->bgt_thread);
	spin_unlock(&ubi->wl_lock);
}

/**
 * schedule_ubi_work - schedule a work.
 * @ubi: UBI device description object
 * @wrk: the work to schedule
 *
 * This function adds a work defined by @wrk to the tail of the pending works
 * list.
 */
static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
{
	down_read(&ubi->work_sem);
	__schedule_ubi_work(ubi, wrk);
	up_read(&ubi->work_sem);
}

static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
			int shutdown);

/**
 * schedule_erase - schedule an erase work.
 * @ubi: UBI device description object
 * @e: the WL entry of the physical eraseblock to erase
 * @vol_id: the volume ID that last used this PEB
 * @lnum: the last used logical eraseblock number for the PEB
 * @torture: if the physical eraseblock has to be tortured
 *
 * This function returns zero in case of success and %-ENOMEM in case of
 * failure.
 */
static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
			  int vol_id, int lnum, int torture, bool nested)
{
	struct ubi_work *wl_wrk;

	ubi_assert(e);

	dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",
	       e->pnum, e->ec, torture);

	wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
	if (!wl_wrk)
		return -ENOMEM;

	wl_wrk->func = &erase_worker;
	wl_wrk->e = e;
	wl_wrk->vol_id = vol_id;
	wl_wrk->lnum = lnum;
	wl_wrk->torture = torture;

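	/*
	 * When called from within a worker ("nested"), the caller already
	 * holds @ubi->work_sem in read mode, so use the variant that does not
	 * take it again; re-acquiring it could deadlock against a writer
	 * waiting to flush the work queue.
	 */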
	if (nested)
		__schedule_ubi_work(ubi, wl_wrk);
	else
		schedule_ubi_work(ubi, wl_wrk);
	return 0;
}

static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk);
/**
 * do_sync_erase - run the erase worker synchronously.
 * @ubi: UBI device description object
 * @e: the WL entry of the physical eraseblock to erase
 * @vol_id: the volume ID that last used this PEB
 * @lnum: the last used logical eraseblock number for the PEB
 * @torture: if the physical eraseblock has to be tortured
 *
 */
static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
			 int vol_id, int lnum, int torture)
{
	struct ubi_work wl_wrk;

	dbg_wl("sync erase of PEB %i", e->pnum);

	wl_wrk.e = e;
	wl_wrk.vol_id = vol_id;
	wl_wrk.lnum = lnum;
	wl_wrk.torture = torture;

	return __erase_worker(ubi, &wl_wrk);
}

static int ensure_wear_leveling(struct ubi_device *ubi, int nested);
/**
 * wear_leveling_worker - wear-leveling worker function.
 * @ubi: UBI device description object
 * @wrk: the work object
 * @shutdown: non-zero if the worker has to free memory and exit
 * because the WL-subsystem is shutting down
 *
 * This function copies a less worn-out physical eraseblock to a more worn-out
 * one. Returns zero in case of success and a negative error code in case of
 * failure.
 */
static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
				int shutdown)
{
	int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;
	int erase = 0, keep = 0, vol_id = -1, lnum = -1;
	struct ubi_wl_entry *e1, *e2;
	struct ubi_vid_io_buf *vidb;
	struct ubi_vid_hdr *vid_hdr;
	int dst_leb_clean = 0;
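	/*
	 * dst_leb_clean is set when the target PEB has not been written to,
	 * so it can go back to the free tree without being erased first.
	 */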

	kfree(wrk);
	if (shutdown)
		return 0;

	vidb = ubi_alloc_vid_buf(ubi, GFP_NOFS);
	if (!vidb)
		return -ENOMEM;

	vid_hdr = ubi_get_vid_hdr(vidb);

	down_read(&ubi->fm_eba_sem);
	mutex_lock(&ubi->move_mutex);
	spin_lock(&ubi->wl_lock);
	ubi_assert(!ubi->move_from && !ubi->move_to);
	ubi_assert(!ubi->move_to_put);

	if (!ubi->free.rb_node ||
	    (!ubi->used.rb_node && !ubi->scrub.rb_node)) {
		/*
		 * No free physical eraseblocks? Well, they must be waiting in
		 * the queue to be erased. Cancel movement - it will be
		 * triggered again when a free physical eraseblock appears.
		 *
		 * No used physical eraseblocks? They must be temporarily
		 * protected from being moved. They will be moved to the
		 * @ubi->used tree later and the wear-leveling will be
		 * triggered again.
		 */
		dbg_wl("cancel WL, a list is empty: free %d, used %d",
		       !ubi->free.rb_node, !ubi->used.rb_node);
		goto out_cancel;
	}

#ifdef CONFIG_MTD_UBI_FASTMAP
	if (ubi->fm_do_produce_anchor) {
		e1 = find_anchor_wl_entry(&ubi->used);
		if (!e1)
			goto out_cancel;
		e2 = get_peb_for_wl(ubi);
		if (!e2)
			goto out_cancel;

		/*
		 * Anchor move within the anchor area is useless.
		 */
		if (e2->pnum < UBI_FM_MAX_START)
			goto out_cancel;

		self_check_in_wl_tree(ubi, e1, &ubi->used);
		rb_erase(&e1->u.rb, &ubi->used);
		dbg_wl("anchor-move PEB %d to PEB %d", e1->pnum, e2->pnum);
		ubi->fm_do_produce_anchor = 0;
	} else if (!ubi->scrub.rb_node) {
#else
	if (!ubi->scrub.rb_node) {
#endif
		/*
		 * Now pick the least worn-out used physical eraseblock and a
		 * highly worn-out free physical eraseblock. If the erase
		 * counters differ much enough, start wear-leveling.
		 */
		e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
		e2 = get_peb_for_wl(ubi);
		if (!e2)
			goto out_cancel;

		if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
			dbg_wl("no WL needed: min used EC %d, max free EC %d",
			       e1->ec, e2->ec);

			/* Give the unused PEB back */
			wl_tree_add(e2, &ubi->free);
			ubi->free_count++;
			goto out_cancel;
		}
		self_check_in_wl_tree(ubi, e1, &ubi->used);
		rb_erase(&e1->u.rb, &ubi->used);
		dbg_wl("move PEB %d EC %d to PEB %d EC %d",
		       e1->pnum, e1->ec, e2->pnum, e2->ec);
	} else {
		/* Perform scrubbing */
		scrubbing = 1;
		e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);
		e2 = get_peb_for_wl(ubi);
		if (!e2)
			goto out_cancel;

		self_check_in_wl_tree(ubi, e1, &ubi->scrub);
		rb_erase(&e1->u.rb, &ubi->scrub);
		dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
	}

	ubi->move_from = e1;
	ubi->move_to = e2;
	spin_unlock(&ubi->wl_lock);

	/*
	 * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum.
	 * We so far do not know which logical eraseblock our physical
	 * eraseblock (@e1) belongs to. We have to read the volume identifier
	 * header first.
	 *
	 * Note, we are protected from this PEB being unmapped and erased. The
	 * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB
	 * which is being moved was unmapped.
	 */

	err = ubi_io_read_vid_hdr(ubi, e1->pnum, vidb, 0);
	if (err && err != UBI_IO_BITFLIPS) {
		dst_leb_clean = 1;
		if (err == UBI_IO_FF) {
			/*
			 * We are trying to move a PEB without a VID header.
			 * UBI always writes VID headers shortly after the PEB
			 * was given, so we have a situation when it has not
			 * yet had a chance to write it, because it was
			 * preempted. So add this PEB to the protection queue
			 * for now, because presumably more data will be
			 * written there (including the missing VID header),
			 * and then we'll move it.
			 */
			dbg_wl("PEB %d has no VID header", e1->pnum);
			protect = 1;
			goto out_not_moved;
		} else if (err == UBI_IO_FF_BITFLIPS) {
			/*
			 * The same situation as %UBI_IO_FF, but bit-flips were
			 * detected. It is better to schedule this PEB for
			 * scrubbing.
			 */
			dbg_wl("PEB %d has no VID header but has bit-flips",
			       e1->pnum);
			scrubbing = 1;
			goto out_not_moved;
		} else if (ubi->fast_attach && err == UBI_IO_BAD_HDR_EBADMSG) {
			/*
			 * While a full scan would detect interrupted erasures
			 * at attach time we can face them here when attached from
			 * Fastmap.
			 */
			dbg_wl("PEB %d has ECC errors, maybe from an interrupted erasure",
			       e1->pnum);
			erase = 1;
			goto out_not_moved;
		}

		ubi_err(ubi, "error %d while reading VID header from PEB %d",
			err, e1->pnum);
		goto out_error;
	}

	vol_id = be32_to_cpu(vid_hdr->vol_id);
	lnum = be32_to_cpu(vid_hdr->lnum);

	err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vidb);
	if (err) {
		if (err == MOVE_CANCEL_RACE) {
			/*
			 * The LEB has not been moved because the volume is
			 * being deleted or the PEB has been put meanwhile. We
			 * should prevent this PEB from being selected for
			 * wear-leveling movement again, so put it to the
			 * protection queue.
			 */
			protect = 1;
			dst_leb_clean = 1;
			goto out_not_moved;
		}
		if (err == MOVE_RETRY) {
			scrubbing = 1;
			dst_leb_clean = 1;
			goto out_not_moved;
		}
		if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
		    err == MOVE_TARGET_RD_ERR) {
			/*
			 * Target PEB had bit-flips or write error - torture it.
			 */
			torture = 1;
			keep = 1;
			goto out_not_moved;
		}

		if (err == MOVE_SOURCE_RD_ERR) {
			/*
			 * An error happened while reading the source PEB. Do
			 * not switch to R/O mode in this case, and give the
			 * upper layers a possibility to recover from this,
			 * e.g. by unmapping corresponding LEB. Instead, just
			 * put this PEB to the @ubi->erroneous list to prevent
			 * UBI from trying to move it over and over again.
			 */
			if (ubi->erroneous_peb_count > ubi->max_erroneous) {
				ubi_err(ubi, "too many erroneous eraseblocks (%d)",
					ubi->erroneous_peb_count);
				goto out_error;
			}
			dst_leb_clean = 1;
			erroneous = 1;
			goto out_not_moved;
		}

		if (err < 0)
			goto out_error;

		ubi_assert(0);
	}

	/* The PEB has been successfully moved */
	if (scrubbing)
		ubi_msg(ubi, "scrubbed PEB %d (LEB %d:%d), data moved to PEB %d",
			e1->pnum, vol_id, lnum, e2->pnum);
	ubi_free_vid_buf(vidb);

	spin_lock(&ubi->wl_lock);
	if (!ubi->move_to_put) {
		wl_tree_add(e2, &ubi->used);
		e2 = NULL;
	}
	ubi->move_from = ubi->move_to = NULL;
	ubi->move_to_put = ubi->wl_scheduled = 0;
	spin_unlock(&ubi->wl_lock);

	err = do_sync_erase(ubi, e1, vol_id, lnum, 0);
	if (err) {
		if (e2)
			wl_entry_destroy(ubi, e2);
		goto out_ro;
	}

	if (e2) {
		/*
		 * Well, the target PEB was put meanwhile, schedule it for
		 * erasure.
		 */
		dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase",
		       e2->pnum, vol_id, lnum);
		err = do_sync_erase(ubi, e2, vol_id, lnum, 0);
		if (err)
			goto out_ro;
	}

	dbg_wl("done");
	mutex_unlock(&ubi->move_mutex);
	up_read(&ubi->fm_eba_sem);
	return 0;

	/*
	 * For some reason the LEB was not moved; it might be an error or
	 * something else. @e1 was not changed, so return it back. @e2 might
	 * have been changed, schedule it for erasure.
	 */
out_not_moved:
	if (vol_id != -1)
		dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)",
		       e1->pnum, vol_id, lnum, e2->pnum, err);
	else
		dbg_wl("cancel moving PEB %d to PEB %d (%d)",
		       e1->pnum, e2->pnum, err);
	spin_lock(&ubi->wl_lock);
	if (protect)
		prot_queue_add(ubi, e1);
	else if (erroneous) {
		wl_tree_add(e1, &ubi->erroneous);
		ubi->erroneous_peb_count += 1;
	} else if (scrubbing)
		wl_tree_add(e1, &ubi->scrub);
	else if (keep)
		wl_tree_add(e1, &ubi->used);
	if (dst_leb_clean) {
		wl_tree_add(e2, &ubi->free);
		ubi->free_count++;
	}

	ubi_assert(!ubi->move_to_put);
	ubi->move_from = ubi->move_to = NULL;
	ubi->wl_scheduled = 0;
	spin_unlock(&ubi->wl_lock);

	ubi_free_vid_buf(vidb);
	if (dst_leb_clean) {
		ensure_wear_leveling(ubi, 1);
	} else {
		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
		if (err)
			goto out_ro;
	}

	if (erase) {
		err = do_sync_erase(ubi, e1, vol_id, lnum, 1);
		if (err)
			goto out_ro;
	}

	mutex_unlock(&ubi->move_mutex);
	up_read(&ubi->fm_eba_sem);
	return 0;

out_error:
	if (vol_id != -1)
		ubi_err(ubi, "error %d while moving PEB %d (LEB %d:%d) to PEB %d",
			err, e1->pnum, vol_id, lnum, e2->pnum);
	else
		ubi_err(ubi, "error %d while moving PEB %d to PEB %d",
			err, e1->pnum, e2->pnum);
	spin_lock(&ubi->wl_lock);
	ubi->move_from = ubi->move_to = NULL;
	ubi->move_to_put = ubi->wl_scheduled = 0;
	spin_unlock(&ubi->wl_lock);

	ubi_free_vid_buf(vidb);
	wl_entry_destroy(ubi, e1);
	wl_entry_destroy(ubi, e2);

out_ro:
	ubi_ro_mode(ubi);
	mutex_unlock(&ubi->move_mutex);
	up_read(&ubi->fm_eba_sem);
	ubi_assert(err != 0);
	return err < 0 ? err : -EIO;

out_cancel:
	ubi->wl_scheduled = 0;
	spin_unlock(&ubi->wl_lock);
	mutex_unlock(&ubi->move_mutex);
	up_read(&ubi->fm_eba_sem);
	ubi_free_vid_buf(vidb);
	return 0;
}

/**
 * ensure_wear_leveling - schedule wear-leveling if it is needed.
 * @ubi: UBI device description object
 * @nested: set to non-zero if this function is called from UBI worker
 *
 * This function checks if it is time to start wear-leveling and schedules it
 * if yes. This function returns zero in case of success and a negative error
 * code in case of failure.
 */
static int ensure_wear_leveling(struct ubi_device *ubi, int nested)
{
	int err = 0;
	struct ubi_wl_entry *e1;
	struct ubi_wl_entry *e2;
	struct ubi_work *wrk;

	spin_lock(&ubi->wl_lock);
	if (ubi->wl_scheduled)
		/* Wear-leveling is already in the work queue */
		goto out_unlock;

	/*
	 * If the ubi->scrub tree is not empty, scrubbing is needed, and the
	 * WL worker has to be scheduled anyway.
	 */
	if (!ubi->scrub.rb_node) {
		if (!ubi->used.rb_node || !ubi->free.rb_node)
			/* No physical eraseblocks - no deal */
			goto out_unlock;

		/*
		 * We schedule wear-leveling only if the difference between the
		 * lowest erase counter of used physical eraseblocks and a high
		 * erase counter of free physical eraseblocks is greater than
		 * %UBI_WL_THRESHOLD.
		 */
		e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
		e2 = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF);

		if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
			goto out_unlock;
		dbg_wl("schedule wear-leveling");
	} else
		dbg_wl("schedule scrubbing");

	ubi->wl_scheduled = 1;
	spin_unlock(&ubi->wl_lock);

	wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
	if (!wrk) {
		err = -ENOMEM;
		goto out_cancel;
	}

	wrk->func = &wear_leveling_worker;
	if (nested)
		__schedule_ubi_work(ubi, wrk);
	else
		schedule_ubi_work(ubi, wrk);
	return err;

out_cancel:
	spin_lock(&ubi->wl_lock);
	ubi->wl_scheduled = 0;
out_unlock:
	spin_unlock(&ubi->wl_lock);
	return err;
}

/**
 * __erase_worker - physical eraseblock erase worker function.
 * @ubi: UBI device description object
 * @wl_wrk: the work object
 * @shutdown: non-zero if the worker has to free memory and exit
 * because the WL sub-system is shutting down
 *
 * This function erases a physical eraseblock and performs torture testing if
 * needed. It also takes care of marking the physical eraseblock bad if
 * needed. Returns zero in case of success and a negative error code in case of
 * failure.
 */
static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk)
{
	struct ubi_wl_entry *e = wl_wrk->e;
	int pnum = e->pnum;
	int vol_id = wl_wrk->vol_id;
	int lnum = wl_wrk->lnum;
	int err, available_consumed = 0;

	dbg_wl("erase PEB %d EC %d LEB %d:%d",
	       pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum);

	err = sync_erase(ubi, e, wl_wrk->torture);
	if (!err) {
		spin_lock(&ubi->wl_lock);

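		/*
		 * If no fastmap anchor PEB is reserved yet and this PEB lies
		 * in the area where a fastmap may start, keep it as the
		 * anchor candidate instead of returning it to the free tree.
		 */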
		if (!ubi->fm_anchor && e->pnum < UBI_FM_MAX_START) {
			ubi->fm_anchor = e;
			ubi->fm_do_produce_anchor = 0;
		} else {
			wl_tree_add(e, &ubi->free);
			ubi->free_count++;
		}

		spin_unlock(&ubi->wl_lock);

		/*
		 * One more erase operation has happened, take care of
		 * protected physical eraseblocks.
		 */
		serve_prot_queue(ubi);

		/* And take care of wear-leveling */
		err = ensure_wear_leveling(ubi, 1);
		return err;
	}

	ubi_err(ubi, "failed to erase PEB %d, error %d", pnum, err);