memory-failure.c 47.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using not-yet-corrupted-but-suspicious pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously in respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added iff:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/vm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non linear complexity with the number. But since memory corruptions
 * are rare we hope to get away with this. This avoids impacting the core
 * VM.
 */
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"
62
63
64
65
66

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

67
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
68

69
70
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

71
/*
 * Knobs for the hwpoison injection filter. Each one is exported so that it
 * can be set from outside this file.
 *
 * hwpoison_filter_enable:       master switch checked by hwpoison_filter()
 * hwpoison_filter_dev_major/
 * hwpoison_filter_dev_minor:    device numbers to match, ~0U is a wildcard
 * hwpoison_filter_flags_mask/
 * hwpoison_filter_flags_value:  page flag bits to compare and their
 *                               required value
 */
u32 hwpoison_filter_enable;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
Wu Fengguang's avatar
Wu Fengguang committed
81
82
83
84
85
86
87
88
89
90
91

/*
 * Device part of the hwpoison filter.
 *
 * Returns 0 when the page passes (either both device numbers are the ~0U
 * wildcard, or the page's backing device matches every non-wildcard number)
 * and -EINVAL when the page must be filtered out.
 */
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *as;
	dev_t devno;

	/* Nothing to match against: let the page through. */
	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/* page_mapping() does not accept slab pages. */
	if (PageSlab(p))
		return -EINVAL;

	as = page_mapping(p);
	if (!as || !as->host)
		return -EINVAL;

	devno = as->host->i_sb->s_dev;

	if (hwpoison_filter_dev_major != ~0U &&
	    MAJOR(devno) != hwpoison_filter_dev_major)
		return -EINVAL;

	if (hwpoison_filter_dev_minor != ~0U &&
	    MINOR(devno) != hwpoison_filter_dev_minor)
		return -EINVAL;

	return 0;
}

Wu Fengguang's avatar
Wu Fengguang committed
112
113
114
115
116
117
118
119
120
121
122
123
/*
 * Page-flag part of the hwpoison filter.
 *
 * With an empty mask every page passes. Otherwise the page passes only if
 * its stable_page_flags(), restricted to the mask, equal the configured
 * value. Returns 0 on pass, -EINVAL otherwise.
 */
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) !=
	    hwpoison_filter_flags_value)
		return -EINVAL;

	return 0;
}

124
125
126
127
128
129
130
131
132
133
/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * share _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. At last, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
134
#ifdef CONFIG_MEMCG
/* Cgroup inode number to match; zero disables the memcg filter. */
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);

/*
 * Returns 0 when the memcg filter is off or the page's cgroup inode matches
 * hwpoison_filter_memcg, -EINVAL otherwise.
 */
static int hwpoison_filter_task(struct page *p)
{
	if (hwpoison_filter_memcg &&
	    page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
/* Without CONFIG_MEMCG there is nothing to filter on: always pass. */
static int hwpoison_filter_task(struct page *p)
{
	return 0;
}
#endif

Wu Fengguang's avatar
Wu Fengguang committed
151
152
int hwpoison_filter(struct page *p)
{
153
154
155
	if (!hwpoison_filter_enable)
		return 0;

Wu Fengguang's avatar
Wu Fengguang committed
156
157
158
	if (hwpoison_filter_dev(p))
		return -EINVAL;

Wu Fengguang's avatar
Wu Fengguang committed
159
160
161
	if (hwpoison_filter_flags(p))
		return -EINVAL;

162
163
164
	if (hwpoison_filter_task(p))
		return -EINVAL;

Wu Fengguang's avatar
Wu Fengguang committed
165
166
	return 0;
}
167
168
169
170
171
172
173
#else
/*
 * Variant built when neither CONFIG_HWPOISON_INJECT nor
 * CONFIG_HWPOISON_INJECT_MODULE is defined: no filtering, every page passes.
 */
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

Wu Fengguang's avatar
Wu Fengguang committed
174
175
EXPORT_SYMBOL_GPL(hwpoison_filter);

176
/*
177
178
179
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
180
 */
181
static int kill_proc(struct task_struct *t, unsigned long addr,
182
			unsigned long pfn, struct page *page, int flags)
183
184
185
186
{
	struct siginfo si;
	int ret;

187
188
	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
189
190
191
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
192
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
193

194
	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
195
		si.si_code = BUS_MCEERR_AR;
196
		ret = force_sig_info(SIGBUS, &si, current);
197
198
199
200
201
202
203
204
205
206
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
	}
207
	if (ret < 0)
208
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
209
			t->comm, t->pid, ret);
210
211
212
	return ret;
}

213
214
215
216
/*
 * When an unknown page type is encountered drain as many buffers as possible
 * in the hope to turn the page into a LRU or free page, which we can handle.
 */
217
void shake_page(struct page *p, int access)
{
	/* Huge pages are left alone. */
	if (PageHuge(p))
		return;

	/* Slab pages skip the LRU/per-cpu draining and go straight on. */
	if (PageSlab(p))
		goto shrink_slab;

	lru_add_drain_all();
	if (PageLRU(p))
		return;

	drain_all_pages(page_zone(p));
	if (PageLRU(p) || is_free_buddy_page(p))
		return;

shrink_slab:
	/*
	 * Only drop slab caches of the page's node (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

/* One entry per task queued for signalling; built by add_to_kill(). */
struct to_kill {
	struct list_head nd;		/* link in the caller's kill list */
	struct task_struct *tsk;	/* task to signal, reference held */
	unsigned long addr;		/* result of page_address_in_vma() */
	char addr_valid;		/* 0 when addr is -EFAULT, else 1 */
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.	We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
292
			pr_err("Memory failure: Out of memory while machine check handling\n");
293
294
295
296
297
298
299
300
301
302
303
304
305
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmaped. But it could be also a mremap. Since that's
	 * likely very rare kill anyways just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
306
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only signal anything when forcekill is set, otherwise just free the list
 * (this is used for clean pages which do not need killing).
 * Also when fail is set do a force kill (SIGKILL) because something went
 * wrong earlier.
 */
323
static void kill_procs(struct list_head *to_kill, int forcekill,
Minchan Kim's avatar
Minchan Kim committed
324
			  bool fail, struct page *page, unsigned long pfn,
325
			  int flags)
326
327
328
329
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
330
		if (forcekill) {
331
			/*
332
			 * In case something went wrong with munmapping
333
334
335
336
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
337
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
338
				       pfn, tk->tsk->comm, tk->tsk->pid);
339
340
341
342
343
344
345
346
347
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
348
			else if (kill_proc(tk->tsk, tk->addr,
349
					      pfn, page, flags) < 0)
350
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
351
				       pfn, tk->tsk->comm, tk->tsk->pid);
352
353
354
355
356
357
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

358
359
360
361
362
363
364
365
366
/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
367
{
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill," and otherwise returns NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
386
	if (!tsk->mm)
387
		return NULL;
388
	if (force_early)
389
390
391
392
393
394
395
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
396
397
398
399
400
401
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402
			      struct to_kill **tkc, int force_early)
403
404
405
406
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
407
	pgoff_t pgoff;
408

409
	av = page_lock_anon_vma_read(page);
410
	if (av == NULL)	/* Not actually mapped anymore */
411
412
		return;

413
	pgoff = page_to_pgoff(page);
414
	read_lock(&tasklist_lock);
415
	for_each_process (tsk) {
416
		struct anon_vma_chain *vmac;
417
		struct task_struct *t = task_early_kill(tsk, force_early);
418

419
		if (!t)
420
			continue;
421
422
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
423
			vma = vmac->vma;
424
425
			if (!page_mapped_in_vma(page, vma))
				continue;
426
427
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
428
429
430
		}
	}
	read_unlock(&tasklist_lock);
431
	page_unlock_anon_vma_read(av);
432
433
434
435
436
437
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
438
			      struct to_kill **tkc, int force_early)
439
440
441
442
443
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

444
	i_mmap_lock_read(mapping);
445
	read_lock(&tasklist_lock);
446
	for_each_process(tsk) {
447
		pgoff_t pgoff = page_to_pgoff(page);
448
		struct task_struct *t = task_early_kill(tsk, force_early);
449

450
		if (!t)
451
			continue;
452
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
453
454
455
456
457
458
459
460
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped it in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
461
462
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
463
464
465
		}
	}
	read_unlock(&tasklist_lock);
466
	i_mmap_unlock_read(mapping);
467
468
469
470
471
472
473
474
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliable.
 */
475
476
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
477
478
479
480
481
482
483
484
485
486
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
487
		collect_procs_anon(page, tokill, &tk, force_early);
488
	else
489
		collect_procs_file(page, tokill, &tk, force_early);
490
491
492
493
	kfree(tk);
}

static const char *action_name[] = {
494
495
496
497
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
498
499
500
};

static const char * const action_page_types[] = {
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
521
522
};

523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	/* Could not take the page off the LRU: report failure. */
	if (isolate_lru_page(p))
		return -EIO;

	/*
	 * Clear sensible page flags, so that the buddy system won't
	 * complain when the page is unpoison-and-freed.
	 */
	ClearPageActive(p);
	ClearPageUnevictable(p);

	/*
	 * Poisoned page might never drop its ref count to 0 so we have
	 * to uncharge it manually from its memcg.
	 */
	mem_cgroup_uncharge(p);

	/* drop the page count elevated by isolate_lru_page() */
	put_page(p);

	return 0;
}

554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
/*
 * Remove the poisoned page @p from @mapping.
 *
 * Uses the mapping's error_remove_page() operation when there is one and
 * falls back to invalidate_inode_page() otherwise. Returns MF_RECOVERED on
 * success and MF_FAILED (after logging the reason) on any failure.
 */
static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int err;

	if (!mapping->a_ops->error_remove_page) {
		/*
		 * If the file system doesn't support it just invalidate
		 * This fails on dirty or anything with private pages
		 */
		if (invalidate_inode_page(p))
			return MF_RECOVERED;

		pr_info("Memory failure: %#lx: Failed to invalidate\n",
			pfn);
		return MF_FAILED;
	}

	err = mapping->a_ops->error_remove_page(mapping, p);
	if (err != 0) {
		pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
			pfn, err);
		return MF_FAILED;
	}

	if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) {
		pr_info("Memory failure: %#lx: failed to release buffers\n",
			pfn);
		return MF_FAILED;
	}

	return MF_RECOVERED;
}

587
588
589
590
591
592
593
/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
594
	return MF_IGNORED;
595
596
597
598
599
600
601
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
602
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
603
	return MF_FAILED;
604
605
606
607
608
609
610
611
612
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

613
614
	delete_from_lru_cache(p);

615
616
617
618
619
	/*
	 * For anonymous pages we're done the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
620
		return MF_RECOVERED;
621
622
623
624
625
626
627
628
629
630
631
632
633

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch"
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been teared down in the meanwhile
		 */
634
		return MF_FAILED;
635
636
637
638
639
640
641
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
642
	return truncate_error_page(p, pfn, mapping);
643
644
645
}

/*
646
 * Dirty pagecache page
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_AIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
Lucas De Marchi's avatar
Lucas De Marchi committed
679
		 * and the page is dropped between then the error
680
681
682
683
684
685
686
687
688
689
690
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
691
		mapping_set_error(mapping, -EIO);
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache(ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

722
	if (!delete_from_lru_cache(p))
723
		return MF_DELAYED;
724
	else
725
		return MF_FAILED;
726
727
728
729
730
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);
731

732
	if (!delete_from_lru_cache(p))
733
		return MF_RECOVERED;
734
	else
735
		return MF_FAILED;
736
737
738
739
740
}

/*
 * Huge pages. Needs work.
 * Issues:
741
742
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
743
744
745
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
746
	int res = 0;
747
	struct page *hpage = compound_head(p);
748
	struct address_space *mapping;
749
750
751
752

	if (!PageHuge(hpage))
		return MF_DELAYED;

753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error anonymous
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
768
	}
769
770

	return res;
771
772
773
774
775
776
777
778
779
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
Lucas De Marchi's avatar
Lucas De Marchi committed
780
 * in its live cycle, so all accesses have to be extremely careful.
781
782
783
784
785
786
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
787
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
788
789
790
791
792
793
794
795
796
797
798
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
799
	enum mf_action_page_type type;
800
801
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
802
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
803
804
805
806
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */
807
808
809
810
811
812

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
813
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
814

815
	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
816

817
818
	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
819

820
821
	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
822

823
824
	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
825

826
827
	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
828
829
830
831

	/*
	 * Catchall entry: must be at end.
	 */
832
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
833
834
};

835
836
837
838
839
840
841
842
843
844
#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved

845
846
847
848
/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
849
850
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
851
{
852
853
	trace_memory_failure_event(pfn, type, result);

854
	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
855
		pfn, action_page_types[type], action_name[result]);
856
857
858
}

static int page_action(struct page_state *ps, struct page *p,
859
			unsigned long pfn)
860
861
{
	int result;
862
	int count;
863
864

	result = ps->action(p, pfn);
865

866
	count = page_count(p) - 1;
867
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
868
		count--;
869
	if (count > 0) {
870
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
871
		       pfn, action_page_types[ps->type], count);
872
		result = MF_FAILED;
873
	}
874
	action_result(pfn, ps->type, result);
875
876
877
878
879
880

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

881
	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
882
883
}

884
885
886
887
888
889
890
891
892
893
894
/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *hpage = compound_head(page);

	/*
	 * Non anonymous thp exists only in allocation/free time. We
	 * can't handle such a case correctly, so let's give it up.
	 * This should be better than triggering BUG_ON when kernel
	 * tries to touch the "partially handled" page.
	 */
	if (!PageHuge(hpage) && PageTransHuge(hpage) && !PageAnon(hpage)) {
		pr_err("Memory failure: %#lx: non anonymous thp\n",
			page_to_pfn(page));
		return 0;
	}

	if (!get_page_unless_zero(hpage))
		return 0;

	/* Success only if the head is still the same after taking the ref. */
	if (hpage == compound_head(page))
		return 1;

	pr_info("Memory failure: %#lx cannot catch tail\n",
		page_to_pfn(page));
	put_page(hpage);

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

922
923
924
925
/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
Minchan Kim's avatar
Minchan Kim committed
926
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
927
				  int flags, struct page **hpagep)
928
{
929
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
930
931
	struct address_space *mapping;
	LIST_HEAD(tokill);
Minchan Kim's avatar
Minchan Kim committed
932
	bool unmap_success;
933
	int kill = 1, forcekill;
934
	struct page *hpage = *hpagep;
935
	bool mlocked = PageMlocked(hpage);
936

937
938
939
940
941
	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
Minchan Kim's avatar
Minchan Kim committed
942
		return true;
943
	if (!(PageLRU(hpage) || PageHuge(p)))
Minchan Kim's avatar
Minchan Kim committed
944
		return true;
945
946
947
948
949

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
950
	if (!page_mapped(hpage))
Minchan Kim's avatar
Minchan Kim committed
951
		return true;
Wu Fengguang's avatar
Wu Fengguang committed
952

953
	if (PageKsm(p)) {
954
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
Minchan Kim's avatar
Minchan Kim committed
955
		return false;
956
	}
957
958

	if (PageSwapCache(p)) {
959
960
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
961
962
963
964
965
966
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
967
968
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
969
	 */
970
	mapping = page_mapping(hpage);
971
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
972
973
974
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
975
976
977
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
978
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
979
980
981
982
983
984
985
986
987
988
989
990
991
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
992
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
993

Minchan Kim's avatar
Minchan Kim committed
994
995
	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
996
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
997
		       pfn, page_mapcount(hpage));
998

999
1000
1001
1002
1003
1004
1005
	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

1006
1007
1008
1009
	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not.  Only kill when the page
1010
1011
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
1012
1013
1014
1015
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
1016
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1017
	kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
Wu Fengguang's avatar
Wu Fengguang committed
1018

Minchan Kim's avatar
Minchan Kim committed
1019
	return unmap_success;
1020
1021
}

1022
1023
static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
1024
1025
{
	struct page_state *ps;
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

1045
static int memory_failure_hugetlb(unsigned long pfn, int flags)
1046
{
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return 0;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_hwpoison_page(head);
		return 0;
	}

1090
	if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
1091
1092
1093
1094
1095
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

1096
	res = identify_page_state(pfn, p, page_flags);
1097
1098
1099
1100
1101
out:
	unlock_page(head);
	return res;
}

1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks hold.
 */
1119
int memory_failure(unsigned long pfn, int flags)
1120
1121
{
	struct page *p;
1122
	struct page *hpage;
1123
	struct page *orig_head;
1124
	int res;
1125
	unsigned long page_flags;
1126
1127

	if (!sysctl_memory_failure_recovery)
1128
		panic("Memory failure on page %lx", pfn);
1129
1130

	if (!pfn_valid(pfn)) {
1131
1132
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
1133
		return -ENXIO;
1134
1135
1136
	}

	p = pfn_to_page(pfn);
1137
	if (PageHuge(p))
1138
		return memory_failure_hugetlb(pfn, flags);
1139
	if (TestSetPageHWPoison(p)) {
1140
1141
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
1142
1143
1144
		return 0;
	}

1145
1146
	arch_unmap_kpfn(pfn);

1147
	orig_head = hpage = compound_head(p);
1148
	num_poisoned_pages_inc();
1149
1150
1151
1152
1153

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
1154
	 * 2) it's part of a non-compound high order page.
1155
1156
1157
1158
1159
1160
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
1161
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1162
		if (is_free_buddy_page(p)) {
1163
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1164
1165
			return 0;
		} else {
1166
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1167
1168
			return -EBUSY;
		}
1169
1170
	}

1171
	if (PageTransHuge(hpage)) {
1172
1173
1174
1175
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
1176
1177
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
1178
			else
1179
1180
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
1181
			if (TestClearPageHWPoison(p))
1182
				num_poisoned_pages_dec();
1183
			put_hwpoison_page(p);
1184
1185
			return -EBUSY;
		}
1186
		unlock_page(p);
1187
1188
1189
1190
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

1191
1192
1193
	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
1194
	 * - to avoid races with __SetPageLocked()
1195
1196
1197
1198
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
1199
1200
1201
1202
1203
1204
1205
1206
	shake_page(p, 0);
	/* shake_page could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
1207
1208
	}

1209
	lock_page(p);
1210

1211
1212
1213
1214
	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
1215
	if (PageCompound(p) && compound_head(p) != orig_head) {
1216
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1217
1218
1219
1220
		res = -EBUSY;
		goto out;
	}

1221
1222
1223
1224
1225
1226
1227
	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
	 * correctly, we save a copy of the page flags at this time.
	 */
1228
1229
1230
1231
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;
1232

1233
1234
1235
1236
	/*
	 * unpoison always clear PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
1237
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1238
		num_poisoned_pages_dec();
1239
1240
		unlock_page(p);
		put_hwpoison_page(p);
1241
		return 0;
1242
	}
Wu Fengguang's avatar
Wu Fengguang committed
1243
1244
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
1245
			num_poisoned_pages_dec();
1246
1247
		unlock_page(p);
		put_hwpoison_page(p);
Wu Fengguang's avatar
Wu Fengguang committed
1248
1249
		return 0;
	}
1250

1251
	if (!PageTransTail(p) && !PageLRU(p))
1252
1253
		goto identify_page_state;

1254
1255
1256
1257
	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
1258
1259
1260
1261
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
1262
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1263
1264
1265
	 *
	 * When the raw error page is thp tail page, hpage points to the raw
	 * page after thp split.
1266
	 */
1267
	if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1268
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
Wu Fengguang's avatar
Wu Fengguang committed
1269
1270
1271
		res = -EBUSY;
		goto out;
	}
1272
1273
1274
1275

	/*
	 * Torn down by someone else?
	 */
1276
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1277
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1278
		res = -EBUSY;
1279
1280
1281
		goto out;
	}

1282
identify_page_state: