// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

static int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);

static struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
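/*
 * nx_huge_pages controls KVM's iTLB multihit mitigation: when it is enabled,
 * huge pages are mapped non-executable and are split into 4k pages when the
 * guest fetches instructions from them.  nx_huge_pages_recovery_ratio
 * roughly controls the fraction (1/ratio) of those split pages that the
 * recovery worker periodically zaps so they can become huge again.
 */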

static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static int max_page_level __read_mostly;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM		8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 54

/*
 * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
 * Access Tracking SPTEs.
 */
#define SPTE_SPECIAL_MASK (3ULL << 52)
#define SPTE_AD_ENABLED_MASK (0ULL << 52)
#define SPTE_AD_DISABLED_MASK (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)
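/*
 * Bits 52-53 therefore distinguish four kinds of SPTE: 0 = A/D bits enabled,
 * 1 = A/D bits disabled, 2 = A/D bits enabled but dirty logging done via
 * write protection, 3 = MMIO SPTE.
 */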

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK			0x1ull
#define PT64_EPT_EXECUTABLE_MASK		0x4ull

#include <trace/events/kvm.h>

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
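/*
 * SPTE_HOST_WRITEABLE records that the host mapping of the gfn was writable
 * when the SPTE was created; SPTE_MMU_WRITEABLE records that KVM itself
 * permits the SPTE to be made writable.  Roughly, only when both bits are
 * set can the W bit be restored locklessly in the fast page fault path
 * (see spte_can_locklessly_be_made_writable()).
 */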

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 *
 * For handle_mmio_page_fault only:
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 */
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
					 (_root), (_addr));                \
	     shadow_walk_okay(&(_walker));			           \
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_mmio_access_mask;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
static u64 __read_mostly shadow_acc_track_mask;

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes. We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.
 */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
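/*
 * For example, with a 5-bit mask covering physical address bits 41-45
 * (i.e. x86_cache_bits == 46), make_mmio_spte() copies bits 41-45 of the
 * guest physical address up into spte bits 46-50, and get_mmio_spte_gfn()
 * shifts them back down and ORs them with the untouched lower GFN bits.
 */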

/*
 * The number of non-reserved physical address bits irrespective of features
 * that repurpose legal bits, e.g. MKTME.
 */
static u8 __read_mostly shadow_phys_bits;

static void mmu_spte_set(u64 *sptep, u64 spte);
static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

#define CREATE_TRACE_POINTS
#include "mmutrace.h"


static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops.tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
		ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{
	BUG_ON((u64)(unsigned)access_mask != access_mask);
	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
	shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static bool is_mmio_spte(u64 spte)
{
	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
	/*
	 * When using the EPT page-modification log, the GPAs in the log
	 * would come from L2 rather than L1.  Therefore, we need to rely
	 * on write protection to record dirty pages.  This also bypasses
	 * PML, since writes now result in a vmexit.
	 */
	return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
}

static inline bool spte_ad_need_write_protect(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}

static bool is_nx_huge_page_enabled(void)
{
	return READ_ONCE(nx_huge_pages);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

/*
 * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */
#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(17, 0)

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)

#define MMIO_SPTE_GEN_HIGH_START	PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END		62
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
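/*
 * For example, a generation value of 0x201 (bits 0 and 9 set) is encoded as
 * spte bit 3 (low half) plus spte bit 54 (high half); decoding shifts the two
 * halves back down and ORs them together.
 */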

static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
	BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);

	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= ((gen >> 9) << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
	gen |= ((spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START) << 9;
	return gen;
}

static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
	u64 mask = generation_mmio_spte_mask(gen);
	u64 gpa = gfn << PAGE_SHIFT;

	access &= shadow_mmio_access_mask;
	mask |= shadow_mmio_value | access;
	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
		<< shadow_nonpresent_or_rsvd_mask_len;

	return mask;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 mask = make_mmio_spte(vcpu, gfn, access);
	unsigned int gen = get_mmio_spte_generation(mask);

	access = mask & ACC_ALL;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned int access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static u8 kvm_get_shadow_phys_bits(void)
{
	/*
	 * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
	 * in CPU detection code, but the processor treats those reduced bits as
	 * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
	 * the physical address bits reported by CPUID.
	 */
	if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
		return cpuid_eax(0x80000008) & 0xff;

	/*
	 * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
	 * custom CPUID.  Proceed with whatever the kernel found since these features
	 * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
	 */
	return boot_cpu_data.x86_phys_bits;
}

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;

	shadow_phys_bits = kvm_get_shadow_phys_bits();

	/*
	 * If the CPU has 46 or less physical address bits, then set an
	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
	 * assumed that the CPU is not vulnerable to L1TF.
	 *
	 * Some Intel CPUs address the L1 cache using more PA bits than are
	 * reported by CPUID. Use the PA width of the L1 cache when possible
	 * to achieve more effective mitigation, e.g. if system RAM overlaps
	 * the most significant bits of legal physical address space.
	 */
	shadow_nonpresent_or_rsvd_mask = 0;
	low_phys_bits = boot_cpu_data.x86_phys_bits;
	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
	    !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
			  52 - shadow_nonpresent_or_rsvd_mask_len)) {
		low_phys_bits = boot_cpu_data.x86_cache_bits
			- shadow_nonpresent_or_rsvd_mask_len;
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
	}

	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return (pte != 0) && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PG_LEVEL_4K)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first and only then set the present bit, so the CPU
	 * cannot fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first to avoid the vCPU fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of using a lightweight way to get the spte on x86_32 guests comes
 * from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_can_locklessly_be_made_writable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always atomically update the spte if it can be updated
	 * outside of mmu-lock: this ensures the dirty bit is not lost
	 * and gives us a stable is_writable_pte() so that a needed
	 * TLB flush is not missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

static bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update only the state bits; the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs. Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached on a CPU's TLB; the return value indicates this
 * case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * Updating the spte outside of mmu-lock is safe because we always
	 * update it atomically; see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	      !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and tracks the
 * state bits; it is used to clear the last level sptep.
 * Returns non-zero if the PTE was previously valid.
 */
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the pages used by the MMU, so
	 * before a page is reclaimed it must be unmapped from the MMU first.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return 1;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear spte without caring the state bits of sptep,
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

static u64 mark_spte_for_access_track(u64 spte)
{
	if (spte_ad_enabled(spte))
		return spte & ~shadow_accessed_mask;

	if (is_access_track_spte(spte))
		return spte;

	/*
	 * Making an Access Tracking PTE will result in removal of write access
	 * from the PTE. So, verify that we will be able to restore the write
	 * access in the fast page fault path later on.
	 */
	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
		  !spte_can_locklessly_be_made_writable(spte),
		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "kvm: Access Tracking saved bit locations are not zero\n");

	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
		shadow_acc_track_saved_bits_shift;
	spte &= ~shadow_acc_track_mask;

	return spte;
}

/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
			 & shadow_acc_track_saved_bits_mask;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
		      shadow_acc_track_saved_bits_shift);
	new_spte |= saved_bits;

	return new_spte;
}
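/*
 * For example, with EPT the saved bits are R (bit 0) and X (bit 2): while the
 * PTE is non-present for access tracking they are parked at bits 54 and 56,
 * and restore_acc_track_spte() moves them back down into place.
 */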

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	/*
	 * Prevent page table teardown by making any free-er wait during
	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
	 */
	local_irq_disable();

	/*
	 * Make sure a following spte read is not reordered ahead of the write
	 * to vcpu->mode.
	 */
	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * Make sure the write to vcpu->mode is not reordered in front of
	 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
	 */
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}
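/*
 * The memory caches below are topped up while it is still safe to sleep, so
 * that the fault handling and shadow page table code can later consume
 * objects from them without having to allocate under the mmu_lock spinlock.
 */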

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
	return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!page)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory