/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>		/* test_thread_flag(), ...	*/
#include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
#include <linux/extable.h>		/* search_exception_tables	*/
#include <linux/bootmem.h>		/* max_low_pfn			*/
#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
#include <linux/perf_event.h>		/* perf_sw_event		*/
#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
#include <linux/prefetch.h>		/* prefetchw			*/
#include <linux/context_tracking.h>	/* exception_enter(), ...	*/
#include <linux/uaccess.h>		/* faulthandler_disabled()	*/

#include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
#include <asm/traps.h>			/* dotraplinkage, ...		*/
#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
#include <asm/vsyscall.h>		/* emulate_vsyscall		*/
#include <asm/vm86.h>			/* struct vm86			*/
#include <asm/mmu_context.h>		/* vma_pkey()			*/

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Page fault error code bits:
 *
 *   bit 0 ==	 0: no page found	1: protection fault
 *   bit 1 ==	 0: read access		1: write access
 *   bit 2 ==	 0: kernel-mode access	1: user-mode access
 *   bit 3 ==				1: use of reserved bit detected
 *   bit 4 ==				1: fault was an instruction fetch
 *   bit 5 ==				1: protection keys block access
 */
enum x86_pf_error_code {

	PF_PROT		=		1 << 0,
	PF_WRITE	=		1 << 1,
	PF_USER		=		1 << 2,
	PF_RSVD		=		1 << 3,
	PF_INSTR	=		1 << 4,
	PF_PK		=		1 << 5,
};
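
/*
 * Worked example (illustrative): a hardware error code of 0x7 decodes
 * with the bits above as PF_PROT | PF_WRITE | PF_USER, i.e. a
 * user-mode write that hit a present page but was denied by its
 * protection bits.
 */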

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}
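
/*
 * Illustrative walk-through: for the 32-bit byte sequence 3E 0F 18 06
 * (prefetchnta ds:[esi]), the scan in is_prefetch() first accepts
 * 0x3E as a segment-override prefix via case 0x30 above
 * ((instr_lo & 7) == 0x6), then hits case 0x00 for the 0x0F opcode,
 * reads the 0x18 second byte and sets *prefetch.
 */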

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}

/*
 * A protection key fault means that the PKRU value did not allow
 * access to some PTE.  Userspace can figure out what PKRU was
 * from the XSAVE state, and this function fills out a field in
 * siginfo so userspace can discover which protection key was set
 * on the PTE.
 *
 * If we get here, we know that the hardware signaled a PF_PK
 * fault and that there was a VMA once we got in the fault
 * handler.  It does *not* guarantee that the VMA we find here
 * was the one that we faulted on.
 *
 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
 * 2. T1   : set PKRU to deny access to pkey=4, touches page
 * 3. T1   : faults...
 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
 * 5. T1   : enters fault handler, takes mmap_sem, etc...
 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
 *	     faulted on a pte with its pkey=4.
 */
static void fill_sig_info_pkey(int si_code, siginfo_t *info,
		struct vm_area_struct *vma)
{
	/* This is effectively an #ifdef */
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return;

	/* Fault not from Protection Keys: nothing to do */
	if (si_code != SEGV_PKUERR)
		return;
	/*
	 * force_sig_info_fault() is called from a number of
	 * contexts, some of which have a VMA and some of which
	 * do not.  The PF_PK handling happens after we have a
	 * valid VMA, so we should never reach this without a
	 * valid VMA.
	 */
	if (!vma) {
		WARN_ONCE(1, "PKU fault with no VMA passed in");
		info->si_pkey = 0;
		return;
	}
	/*
	 * si_pkey should be thought of as a strong hint, but not
	 * absolutely guaranteed to be 100% accurate because of
	 * the race explained above.
	 */
	info->si_pkey = vma_pkey(vma);
}
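
/*
 * Illustrative userspace sketch (assumes a libc that exposes the
 * si_pkey siginfo field): a SIGSEGV handler installed with SA_SIGINFO
 * can read the hinted key roughly like this:
 *
 *	void segv_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == SEGV_PKUERR)
 *			fprintf(stderr, "denied by pkey %d\n",
 *				(int)si->si_pkey);
 *	}
 */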

static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk, struct vm_area_struct *vma,
		     int fault)
{
	unsigned lsb = 0;
	siginfo_t info;

	info.si_signo	= si_signo;
	info.si_errno	= 0;
	info.si_code	= si_code;
	info.si_addr	= (void __user *)address;
	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	info.si_addr_lsb = lsb;

	fill_sig_info_pkey(si_code, &info, vma);

	force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;
			pmd_t *ret;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			ret = vmalloc_sync_one(page_address(page), address);
			spin_unlock(pgt_lock);

			if (!ret)
				break;
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_huge(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}
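
/*
 * Example (illustrative): a vm86 fault at 0xA3000 sets bit
 * (0xA3000 - 0xA0000) >> PAGE_SHIFT == 3 in screen_bitmap, marking
 * page 3 of the VGA window as touched.
 */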

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	p4d_t *p4d, *p4d_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush:
	 */
	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;

	if (pgd_none(*pgd)) {
		set_pgd(pgd, *pgd_ref);
		arch_flush_lazy_mmu_mode();
	} else if (CONFIG_PGTABLE_LEVELS > 4) {
		/*
		 * With folded p4d, pgd_none() is always false, so the pgd may
		 * point to an empty page table entry and pgd_page_vaddr()
		 * will return garbage.
		 *
		 * We will do the correct sanity check on the p4d level.
		 */
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	}

	/* With 4-level paging, copying happens on the p4d level. */
	p4d = p4d_offset(pgd, address);
	p4d_ref = p4d_offset(pgd_ref, address);
	if (p4d_none(*p4d_ref))
		return -1;

	if (p4d_none(*p4d)) {
		set_p4d(p4d, *p4d_ref);
		arch_flush_lazy_mmu_mode();
	} else {
		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
	}

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared:
	 */

	pud = pud_offset(p4d, address);
	pud_ref = pud_offset(p4d_ref, address);
	if (pud_none(*pud_ref))
		return -1;

	if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
		BUG();

	if (pud_huge(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;

	if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
		BUG();

	if (pmd_huge(*pmd))
		return 0;

	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;

	pte = pte_offset_kernel(pmd, address);

	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that:
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
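	/*
	 * Selector bit 2 is the TI bit, so (regs->cs & (1<<2)) means a
	 * code segment in the LDT, which per the comment above always
	 * runs in compatibility mode.
	 */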
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_has_bug(X86_BUG_F00F)) {
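		/*
		 * Each IDT entry is 8 bytes on 32-bit, so shifting the
		 * offset into the IDT by 3 recovers the vector; the
		 * erratum turns vector 6 (#UD) into this page fault.
		 */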
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
"unable to execute userspace code (SMEP?) (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
	}

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2		= address;
	tsk->thread.trap_nr	= X86_TRAP_PF;
	tsk->thread.error_code	= error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;
	/* No context means no VMA to pass down */
	struct vm_area_struct *vma = NULL;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			tsk->thread.trap_nr = X86_TRAP_PF;
			tsk->thread.error_code = error_code | PF_USER;
			tsk->thread.cr2 = address;

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_info_fault(signal, si_code, address,
					     tsk, vma, 0);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		register void *__sp asm("rsp");
		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : "+r" (__sp)
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch, fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2		= address;
	tsk->thread.trap_nr	= X86_TRAP_PF;
	tsk->thread.error_code	= error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, struct vm_area_struct *vma,
		       int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

#ifdef CONFIG_X86_64
		/*
		 * Instruction fetch faults in the vsyscall page might need
		 * emulation.
		 */
		if (unlikely((error_code & PF_INSTR) &&
			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
			if (emulate_vsyscall(regs, address))
				return;
		}
#endif

		/*
		 * To avoid leaking information about the kernel page table
		 * layout, pretend that user-mode accesses to kernel addresses
		 * are always protection faults.
		 */
		if (address >= TASK_SIZE_MAX)
			error_code |= PF_PROT;

		if (likely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		tsk->thread.cr2		= address;
		tsk->thread.error_code	= error_code;
		tsk->thread.trap_nr	= X86_TRAP_PF;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address, SIGSEGV, si_code);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address, struct vm_area_struct *vma)
{
	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address,  struct vm_area_struct *vma, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
}