/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
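
/*
 * For example, installing a mapping for a pseudo-physical frame boils
 * down to substituting the machine frame first.  A rough sketch (the
 * real helpers are pte_pfn_to_mfn()/pte_mfn_to_pfn() and set_pte_mfn()
 * below; 'pfn' and 'vaddr' are just illustrative locals):
 *
 *	unsigned long mfn = pfn_to_mfn(pfn);
 *	set_pte_vaddr(vaddr, mfn_pte(mfn, PAGE_KERNEL));
 */
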
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>

#include <trace/events/xen.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>
#include <asm/smp.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);

#ifdef CONFIG_X86_32
/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#endif
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
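
/*
 * For instance, a check like the one done elsewhere in this file to see
 * whether a remote vcpu is currently using a given mm's pagetable reads
 * the latter (sketch; 'cpu' is the remote vcpu's id):
 *
 *	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 *		...
 */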


/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/*
	 * if the PFN is in the linear mapped vaddr range, we can just use
	 * the (quick) virt_to_machine() p2m lookup
	 */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* otherwise we have to do a (slower) full page-table walk */

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/* ptep might be kmapped when using 32-bit HIGHPTE */
	u->ptr = virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);
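
/*
 * A typical caller maps a frame owned by another domain, along the lines
 * of (sketch only; 'foreign_mfn' and 'domid' stand for the remote machine
 * frame and its owning domain):
 *
 *	xen_set_domain_pte(ptep, mfn_pte(foreign_mfn, PAGE_KERNEL), domid);
 */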

static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;
}

static void xen_extend_mmuext_op(const struct mmuext_op *op)
{
	struct multicall_space mcs;
	struct mmuext_op *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));

	if (mcs.mc != NULL) {
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *op;
}

static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	trace_xen_mmu_set_pmd(ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update u;

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
		return false;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
	u.val = pte_val_ma(pteval);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	return true;
}

static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
	if (!xen_batched_set_pte(ptep, pteval)) {
		/*
		 * Could call native_set_pte() here and trap and
		 * emulate the PTE write but with 32-bit guests this
		 * needs two traps (one for each of the two 32-bit
		 * words in the PTE) so do one hypercall directly
		 * instead.
		 */
		struct mmu_update u;

		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
		u.val = pte_val_ma(pteval);
		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
	}
}

static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	trace_xen_mmu_set_pte(ptep, pteval);
	__xen_set_pte(ptep, pteval);
}

static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
	__xen_set_pte(ptep, pteval);
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
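
/*
 * These two implement the start/commit protocol the generic mm code uses
 * when changing protections, roughly (sketch; 'newprot' is the new
 * protection to apply):
 *
 *	pte = xen_ptep_modify_prot_start(mm, addr, ptep);
 *	pte = pte_modify(pte, newprot);
 *	xen_ptep_modify_prot_commit(mm, addr, ptep, pte);
 */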

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		unsigned long pfn = mfn_to_pfn(mfn);

		pteval_t flags = val & PTE_FLAGS_MASK;
		if (unlikely(pfn == ~0))
			val = flags & ~_PAGE_PRESENT;
		else
			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn;

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			mfn = get_phys_to_machine(pfn);
		else
			mfn = pfn;
		/*
		 * If there's no mfn for the pfn, then just create an
		 * empty non-present pte.  Unfortunately this loses
		 * information about the original pfn, so
		 * pte_mfn_to_pfn is asymmetric.
		 */
		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
			mfn = 0;
			flags = 0;
		} else {
			/*
			 * Paramount to do this test _after_ the
			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
			 * IDENTITY_FRAME_BIT resolves to true.
			 */
			mfn &= ~FOREIGN_FRAME_BIT;
			if (mfn & IDENTITY_FRAME_BIT) {
				mfn &= ~IDENTITY_FRAME_BIT;
				flags |= _PAGE_IOMAP;
			}
		}
		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t iomap_pte(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;

		/* We assume the pte frame number is a MFN, so
		   just use it as-is. */
		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

__visible pteval_t xen_pte_val(pte_t pte)
{
	pteval_t pteval = pte.pte;
#if 0
	/* If this is a WC pte, convert back from Xen WC to Linux WC */
	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
		WARN_ON(!pat_enabled);
		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
	}
#endif
	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
		return pteval;

	return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

/*
 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 * are reserved for now, to correspond to the Intel-reserved PAT
 * types.
 *
 * We expect Linux's PAT set as follows:
 *
 * Idx  PTE flags        Linux    Xen    Default
 * 0                     WB       WB     WB
 * 1            PWT      WC       WT     WT
 * 2        PCD          UC-      UC-    UC-
 * 3        PCD PWT      UC       UC     UC
 * 4    PAT              WB       WC     WB
 * 5    PAT     PWT      WC       WP     WT
 * 6    PAT PCD          UC-      rsv    UC-
 * 7    PAT PCD PWT      UC       rsv    UC
 */

void xen_set_pat(u64 pat)
{
	/* We expect Linux to use a PAT setting of
	 * UC UC- WC WB (ignoring the PAT flag) */
	WARN_ON(pat != 0x0007010600070106ull);
}

__visible pte_t xen_make_pte(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);
#if 0
	/* If Linux is trying to set a WC pte, then map to the Xen WC.
	 * If _PAGE_PAT is set, then it probably means it is really
	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
	 * things work out OK...
	 *
	 * (We should never see kernel mappings with _PAGE_PSE set,
	 * but we could see hugetlbfs mappings, I think.).
	 */
	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
	}
#endif
	/*
	 * Unprivileged domains are allowed to do IOMAPpings for
	 * PCI passthrough, but not map ISA space.  The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
	if (unlikely(pte & _PAGE_IOMAP) &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		pte = iomap_pte(pte);
	} else {
		pte &= ~_PAGE_IOMAP;
		pte = pte_pfn_to_mfn(pte);
	}

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pud(pud_t *ptr, pud_t val)
{
	trace_xen_mmu_set_pud(ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}

#ifdef CONFIG_X86_PAE
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	trace_xen_mmu_set_pte_atomic(ptep, pte);
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	trace_xen_mmu_pte_clear(mm, addr, ptep);
	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
		native_pte_clear(mm, addr, ptep);
}

static void xen_pmd_clear(pmd_t *pmdp)
{
	trace_xen_mmu_pmd_clear(pmdp);
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if PAGETABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

__visible pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_pgd, intended for in early boot before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 */
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	trace_xen_mmu_set_pgd(ptr, user_ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTE_PTLOCKS
	ptl = ptlock_ptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = level;
	op.arg1.mfn = pfn_to_mfn(pfn);

	xen_extend_mmuext_op(&op);
}

static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_pin(mm, pgd);

	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */