/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/swap.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "async-thread.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"

#ifdef CONFIG_X86
#include <asm/cpufeature.h>
#endif

static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
				    int read_only);
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
					     struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_root *root);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);

/*
 * end_io_wq structs are used to do processing in task context when an IO is
 * complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	int error;
	int metadata;
	struct list_head list;
	struct btrfs_work work;
};

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	struct inode *inode;
	struct bio *bio;
	struct list_head list;
	extent_submit_bio_hook_t *submit_bio_start;
	extent_submit_bio_hook_t *submit_bio_done;
	int rw;
	int mirror_num;
	unsigned long bio_flags;
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;
	int error;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->objectid.  This ensures that all special purpose roots
 * have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	const char		*name_stem;	/* lock name stem */
	char			names[BTRFS_MAX_LEVEL + 1][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
	{ .id = 0,				.name_stem = "tree"	},
};

void __init btrfs_init_lockdep(void)
{
	int i, j;

	/* initialize lockdep class names */
	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];

		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
			snprintf(ks->names[j], sizeof(ks->names[j]),
				 "btrfs-%s-%02d", ks->name_stem, j);
	}
}

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}

#endif

/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 */
static struct extent_map *btree_get_extent(struct inode *inode,
		struct page *page, size_t pg_offset, u64 start, u64 len,
		int create)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	int ret;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em) {
		em->bdev =
			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
		read_unlock(&em_tree->lock);
		goto out;
	}
	read_unlock(&em_tree->lock);

	em = alloc_extent_map();
	if (!em) {
		em = ERR_PTR(-ENOMEM);
		goto out;
	}
	em->start = 0;
	em->len = (u64)-1;
	em->block_len = (u64)-1;
	em->block_start = 0;
	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret == -EEXIST) {
		free_extent_map(em);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em)
			em = ERR_PTR(-EIO);
	} else if (ret) {
		free_extent_map(em);
		em = ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

out:
	return em;
}

u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
	return crc32c(seed, data, len);
}

void btrfs_csum_final(u32 crc, char *result)
{
	put_unaligned_le32(~crc, result);
}

/*
 * compute the csum for a btree block, and either verify it or write it
 * into the csum field of the block.
 */
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
			   int verify)
{
	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
	char *result = NULL;
	unsigned long len;
	unsigned long cur_len;
	unsigned long offset = BTRFS_CSUM_SIZE;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;
	u32 crc = ~(u32)0;
	unsigned long inline_result;

	len = buf->len - offset;
	while (len > 0) {
		err = map_private_extent_buffer(buf, offset, 32,
					&kaddr, &map_start, &map_len);
		if (err)
			return 1;
		cur_len = min(len, map_len - (offset - map_start));
		crc = btrfs_csum_data(kaddr + offset - map_start,
				      crc, cur_len);
		len -= cur_len;
		offset += cur_len;
	}
	if (csum_size > sizeof(inline_result)) {
		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
		if (!result)
			return 1;
	} else {
		result = (char *)&inline_result;
	}

	btrfs_csum_final(crc, result);

	if (verify) {
		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
			u32 val;
			u32 found = 0;
			memcpy(&found, result, csum_size);

			read_extent_buffer(buf, &val, 0, csum_size);
			printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
				       "failed on %llu wanted %X found %X "
				       "level %d\n",
				       root->fs_info->sb->s_id, buf->start,
				       val, found, btrfs_header_level(buf));
			if (result != (char *)&inline_result)
				kfree(result);
			return 1;
		}
	} else {
		write_extent_buffer(buf, result, 0, csum_size);
	}
	if (result != (char *)&inline_result)
		kfree(result);
	return 0;
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid,
				 int atomic)
{
	struct extent_state *cached_state = NULL;
	int ret;

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

	if (atomic)
		return -EAGAIN;

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
			 0, &cached_state);
	if (extent_buffer_uptodate(eb) &&
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
	printk_ratelimited("parent transid verify failed on %llu wanted %llu "
		       "found %llu\n",
		       eb->start, parent_transid, btrfs_header_generation(eb));
	ret = 1;
	clear_extent_buffer_uptodate(eb);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(char *raw_disk_sb)
{
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
	u16 csum_type = btrfs_super_csum_type(disk_sb);
	int ret = 0;

	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
		u32 crc = ~(u32)0;
		const int csum_size = sizeof(crc);
		char result[csum_size];

		/*
		 * The super_block structure does not span the whole
		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
		 * is filled with zeros and is included in the checksum.
		 */
		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
		btrfs_csum_final(crc, result);

		if (memcmp(raw_disk_sb, result, csum_size))
			ret = 1;

		if (ret && btrfs_super_generation(disk_sb) < 10) {
			printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
			ret = 0;
		}
	}

	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
		printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
				csum_type);
		ret = 1;
	}

	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 */
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
					  struct extent_buffer *eb,
					  u64 start, u64 parent_transid)
{
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
	while (1) {
		ret = read_extent_buffer_pages(io_tree, eb, start,
					       WAIT_COMPLETE,
					       btree_get_extent, mirror_num);
		if (!ret) {
			if (!verify_parent_transid(io_tree, eb,
						   parent_transid, 0))
				break;
			else
				ret = -EIO;
		}

		/*
		 * This buffer's crc is fine, but its contents are corrupted, so
		 * there is no reason to read the other copies, they won't be
		 * any less wrong.
		 */
		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
			break;

		num_copies = btrfs_num_copies(root->fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		repair_eb_io_failure(root, eb, failed_mirror);

	return ret;
}

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */

static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
	struct extent_io_tree *tree;
	u64 start = page_offset(page);
	u64 found_start;
	struct extent_buffer *eb;

	tree = &BTRFS_I(page->mapping->host)->io_tree;

	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
		return 0;
	found_start = btrfs_header_bytenr(eb);
	if (WARN_ON(found_start != start || !PageUptodate(page)))
		return 0;
	csum_tree_block(root, eb, 0);
	return 0;
}

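/*
 * Check that the fsid recorded in a tree block's header matches the fsid of
 * this filesystem or one of its seed devices.  Returns 0 on a match and 1
 * otherwise.
 */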
static int check_tree_block_fsid(struct btrfs_root *root,
				 struct extent_buffer *eb)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	u8 fsid[BTRFS_UUID_SIZE];
	int ret = 1;

	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
	while (fs_devices) {
		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
			ret = 0;
			break;
		}
		fs_devices = fs_devices->seed;
	}
	return ret;
}

#define CORRUPT(reason, eb, root, slot)				\
	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
	       "root=%llu, slot=%d\n", reason,			\
505
	       btrfs_header_bytenr(eb),	root->objectid, slot)
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567

static noinline int check_leaf(struct btrfs_root *root,
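/*
 * Sanity check a leaf: verify that item offsets and sizes are consistent
 * with each other and stay inside the leaf data area, and that the keys are
 * in the right order.  Returns 0 if the leaf looks sane and -EIO if it is
 * corrupt.
 */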
			       struct extent_buffer *leaf)
{
	struct btrfs_key key;
	struct btrfs_key leaf_key;
	u32 nritems = btrfs_header_nritems(leaf);
	int slot;

	if (nritems == 0)
		return 0;

	/* Check the 0 item */
	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
	    BTRFS_LEAF_DATA_SIZE(root)) {
		CORRUPT("invalid item offset size pair", leaf, root, 0);
		return -EIO;
	}

	/*
	 * Check to make sure each item's keys are in the correct order and their
	 * offsets make sense.  We only have to loop through nritems-1 because
	 * we check the current slot against the next slot, which verifies the
	 * next slot's offset+size makes sense and that the current's slot
	 * offset is correct.
	 */
	for (slot = 0; slot < nritems - 1; slot++) {
		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &key, slot + 1);

		/* Make sure the keys are in the right order */
		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
			CORRUPT("bad key order", leaf, root, slot);
			return -EIO;
		}

		/*
		 * Make sure the offset and ends are right, remember that the
		 * item data starts at the end of the leaf and grows towards the
		 * front.
		 */
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			CORRUPT("slot offset bad", leaf, root, slot);
			return -EIO;
		}

		/*
		 * Check to make sure that we don't point outside of the leaf,
	 * just in case all the items are consistent with each other, but
		 * all point outside of the leaf.
		 */
		if (btrfs_item_end_nr(leaf, slot) >
		    BTRFS_LEAF_DATA_SIZE(root)) {
			CORRUPT("slot end outside of leaf", leaf, root, slot);
			return -EIO;
		}
	}

	return 0;
}

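/*
 * Read end_io hook for btree pages: verifies the header bytenr, fsid, level
 * and checksum of a freshly read extent buffer and marks it uptodate if
 * everything checks out.  Corrupt leaves are flagged so the other copies
 * are not read needlessly.
 */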
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
{
	struct extent_io_tree *tree;
	u64 found_start;
	int found_level;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	int ret = 0;
	int reads_done;

	if (!page->private)
		goto out;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	eb = (struct extent_buffer *)page->private;

	/* the pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all these other checks
	 */
	extent_buffer_get(eb);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
		goto err;

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		printk_ratelimited(KERN_INFO "btrfs bad tree block start "
			       "%llu %llu\n",
			       found_start, eb->start);
		ret = -EIO;
		goto err;
	}
	if (check_tree_block_fsid(root, eb)) {
		printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
			       eb->start);
		ret = -EIO;
		goto err;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_info(root->fs_info, "bad tree block level %d\n",
			   (int)btrfs_header_level(eb));
		ret = -EIO;
		goto err;
	}

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				       eb, found_level);

	ret = csum_tree_block(root, eb, 1);
	if (ret) {
		ret = -EIO;
		goto err;
	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
	if (found_level == 0 && check_leaf(root, eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

	if (!ret)
		set_extent_buffer_uptodate(eb);
err:
	if (reads_done &&
	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(root, eb, eb->start, ret);

	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);
out:
	return ret;
}

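/*
 * Called when a btree page read fails: record the IO error and the failed
 * mirror on the extent buffer so the retry logic can pick another copy.
 */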
static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;

	eb = (struct extent_buffer *)page->private;
	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = failed_mirror;
	atomic_dec(&eb->io_pages);
	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(root, eb, eb->start, -EIO);
	return -EIO;	/* we fixed nothing */
}

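/*
 * bio completion handler: defer the final end_io processing to the
 * appropriate btrfs workqueue so that checksum verification and metadata
 * updates run in task context instead of interrupt context.
 */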
static void end_workqueue_bio(struct bio *bio, int err)
{
	struct end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;

	fs_info = end_io_wq->info;
	end_io_wq->error = err;
	end_io_wq->work.func = end_workqueue_fn;
	end_io_wq->work.flags = 0;

	if (bio->bi_rw & REQ_WRITE) {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
					   &end_io_wq->work);
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			btrfs_queue_worker(&fs_info->endio_freespace_worker,
					   &end_io_wq->work);
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			btrfs_queue_worker(&fs_info->endio_raid56_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_write_workers,
					   &end_io_wq->work);
	} else {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			btrfs_queue_worker(&fs_info->endio_raid56_workers,
					   &end_io_wq->work);
		else if (end_io_wq->metadata)
			btrfs_queue_worker(&fs_info->endio_meta_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_workers,
					   &end_io_wq->work);
	}
}

/*
 * For the metadata arg you want
 *
 * 0 - if data
 * 1 - if normal metadata
 * 2 - if writing to the free space cache area
 * 3 - raid parity work
 */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata)
{
	struct end_io_wq *end_io_wq;
	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
	if (!end_io_wq)
		return -ENOMEM;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->error = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;

unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
	unsigned long limit = min_t(unsigned long,
				    info->workers.max_workers,
				    info->fs_devices->open_devices);
	return 256 * limit;
}

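/*
 * Worker callbacks for async bio submission: run_one_async_start does the
 * checksumming, run_one_async_done submits the bio (or completes it with an
 * error) and wakes throttled submitters, and run_one_async_free releases
 * the async_submit_bio.
 */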
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	int ret;

	async = container_of(work, struct async_submit_bio, work);
	ret = async->submit_bio_start(async->inode, async->rw, async->bio,
				      async->mirror_num, async->bio_flags,
				      async->bio_offset);
	if (ret)
		async->error = ret;
}

static void run_one_async_done(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_submit_bio *async;
	int limit;

	async = container_of(work, struct async_submit_bio, work);
	fs_info = BTRFS_I(async->inode)->root->fs_info;

	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	/* If an error occurred we just want to clean up the bio and move on */
	if (async->error) {
		bio_endio(async->bio, async->error);
		return;
	}

	async->submit_bio_done(async->inode, async->rw, async->bio,
			       async->mirror_num, async->bio_flags,
			       async->bio_offset);
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct async_submit_bio, work);
	kfree(async);
}

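/*
 * Queue a bio for async submission through the worker threads.  The
 * submit_bio_start hook runs first (typically checksumming), then
 * submit_bio_done sends the bio down the IO stack.
 */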
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
			u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return -ENOMEM;

	async->inode = inode;
	async->rw = rw;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;
	async->submit_bio_done = submit_bio_done;

	async->work.func = run_one_async_start;
	async->work.ordered_func = run_one_async_done;
	async->work.ordered_free = run_one_async_free;

	async->work.flags = 0;
	async->bio_flags = bio_flags;
	async->bio_offset = bio_offset;

	async->error = 0;

	atomic_inc(&fs_info->nr_async_submits);

	if (rw & REQ_SYNC)
		btrfs_set_work_high_prio(&async->work);

	btrfs_queue_worker(&fs_info->workers, &async->work);

	while (atomic_read(&fs_info->async_submit_draining) &&
	      atomic_read(&fs_info->nr_async_submits)) {
		wait_event(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) == 0));
	}

	return 0;
}

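/*
 * Compute and store the checksum of every metadata page attached to a bio
 * before it is written out.
 */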
static int btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct btrfs_root *root;
	int i, ret = 0;

	bio_for_each_segment_all(bvec, bio, i) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		ret = csum_dirty_buffer(root, bvec->bv_page);
		if (ret)
			break;
	}

	return ret;
}

static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btree_csum_one_bio(bio);
}

static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	int ret;

	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
	if (ret)
		bio_endio(bio, ret);
	return ret;
}

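/*
 * Decide whether a metadata write should be checksummed asynchronously.
 * Tree-log writes, and hosts whose CPU provides hardware crc32c (SSE4.2),
 * checksum inline instead.
 */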
static int check_async_write(struct inode *inode, unsigned long bio_flags)
{
	if (bio_flags & EXTENT_BIO_TREE_LOG)
		return 0;
#ifdef CONFIG_X86
	if (cpu_has_xmm4_2)
		return 0;
#endif
	return 1;
}

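/*
 * Submission hook for btree IO: reads get an end_io workqueue for checksum
 * verification, synchronous writes are checksummed inline, and other writes
 * are handed to the async submission helpers.
 */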
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	int async = check_async_write(inode, bio_flags);
	int ret;

	if (!(rw & REQ_WRITE)) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
					  bio, 1);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
				    mirror_num, 0);
	} else if (!async) {
		ret = btree_csum_one_bio(bio);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
				    mirror_num, 0);
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
					  inode, rw, bio, mirror_num, 0,
					  bio_offset,
					  __btree_submit_bio_start,
					  __btree_submit_bio_done);
	}

	if (ret) {
out_w_error:
		bio_endio(bio, ret);
	}
	return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
			struct page *newpage, struct page *page,
			enum migrate_mode mode)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
	return migrate_page(mapping, newpage, page, mode);
}
#endif

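/*
 * Write back dirty btree pages.  For plain background writeback, skip the
 * work unless enough dirty metadata has accumulated.
 */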
static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct extent_io_tree *tree;
	struct btrfs_fs_info *fs_info;
	int ret;

	tree = &BTRFS_I(mapping->host)->io_tree;
	if (wbc->sync_mode == WB_SYNC_NONE) {

		if (wbc->for_kupdate)
			return 0;

		fs_info = BTRFS_I(mapping->host)->root->fs_info;
		/* this is a bit racy, but that's ok */
		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					     BTRFS_DIRTY_METADATA_THRESH);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

static int btree_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btree_get_extent, 0);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;

	return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
	if (PagePrivate(page)) {
		printk(KERN_WARNING "btrfs warning page private not zero "
		       "on page %llu\n", (unsigned long long)page_offset(page));
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
	struct extent_buffer *eb;

	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
#endif
	return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
	.readpage	= btree_readpage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
	.migratepage	= btree_migratepage,
#endif
	.set_page_dirty = btree_set_page_dirty,
};

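/*
 * Start a non-blocking read of a tree block so it is likely to be cached by
 * the time it is actually needed.
 */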
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
			 u64 parent_transid)
{
	struct extent_buffer *buf = NULL;
	struct inode *btree_inode = root->fs_info->btree_inode;
	int ret = 0;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return 0;
	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
				 buf, 0, WAIT_NONE, btree_get_extent, 0);
	free_extent_buffer(buf);
	return ret;
}

int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksi