// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

static int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);

static struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");

/*
 * When set to true, this variable enables Two-Dimensional-Paging, where
 * the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static int max_page_level __read_mostly;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM		8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 54

/*
 * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
 * Access Tracking SPTEs.
 */
#define SPTE_SPECIAL_MASK (3ULL << 52)
#define SPTE_AD_ENABLED_MASK (0ULL << 52)
#define SPTE_AD_DISABLED_MASK (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK			0x1ull
#define PT64_EPT_EXECUTABLE_MASK		0x4ull

#include <trace/events/kvm.h>

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 *
 * For handle_mmio_page_fault only:
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 */
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
					 (_root), (_addr));                \
	     shadow_walk_okay(&(_walker));			           \
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_mmio_access_mask;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
static u64 __read_mostly shadow_acc_track_mask;

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes. We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.
 */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

/*
 * The number of non-reserved physical address bits irrespective of features
 * that repurpose legal bits, e.g. MKTME.
 */
static u8 __read_mostly shadow_phys_bits;

static void mmu_spte_set(u64 *sptep, u64 spte);
static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

#define CREATE_TRACE_POINTS
#include "mmutrace.h"


static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops.tlb_remote_flush_with_range;
}

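/*
 * Flush remote TLBs for the given gfn range via the backend's ranged flush
 * hook when one is available; fall back to a full remote TLB flush otherwise.
 */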
static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
		ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
{
	BUG_ON((u64)(unsigned)access_mask != access_mask);
	BUG_ON((mmio_mask & mmio_value) != mmio_value);
	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
	shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

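/*
 * An spte is an MMIO spte when the bits selected by shadow_mmio_mask match
 * shadow_mmio_value, as configured by kvm_mmu_set_mmio_spte_mask() above.
 */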
static bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_value;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
	/*
	 * When using the EPT page-modification log, the GPAs in the log
	 * would come from L2 rather than L1.  Therefore, we need to rely
	 * on write protection to record dirty pages.  This also bypasses
	 * PML, since writes now result in a vmexit.
	 */
	return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
}

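/*
 * The SPTE_SPECIAL_MASK bits encode how accessed/dirty bits are handled for
 * this spte.  MMIO sptes reuse those bits, so they must never be passed to
 * these helpers (hence the MMU_WARN_ON checks).
 */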
static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
}

static inline bool spte_ad_need_write_protect(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}

static bool is_nx_huge_page_enabled(void)
{
	return READ_ONCE(nx_huge_pages);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

/*
 * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */
#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(17, 0)

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)

#define MMIO_SPTE_GEN_HIGH_START	PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END		62
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)

static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
	BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);

	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
	return gen;
}

static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
	u64 mask = generation_mmio_spte_mask(gen);
	u64 gpa = gfn << PAGE_SHIFT;

	access &= shadow_mmio_access_mask;
	mask |= shadow_mmio_value | access;
	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
		<< shadow_nonpresent_or_rsvd_mask_len;

	return mask;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 mask = make_mmio_spte(vcpu, gfn, access);
	unsigned int gen = get_mmio_spte_generation(mask);

	access = mask & ACC_ALL;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

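/* Recover the gfn stored in an MMIO spte, undoing the L1TF bit split above. */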
static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

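/* Install an MMIO spte if the pfn does not map a real memslot. */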
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned int access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

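/*
 * An MMIO spte is only usable if the generation embedded in it matches the
 * current memslots generation; otherwise the caller must recreate it.
 */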
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static u8 kvm_get_shadow_phys_bits(void)
{
	/*
	 * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
	 * in CPU detection code, but the processor treats those reduced bits as
	 * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
	 * the physical address bits reported by CPUID.
	 */
	if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
		return cpuid_eax(0x80000008) & 0xff;

	/*
	 * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
	 * custom CPUID.  Proceed with whatever the kernel found since these features
	 * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
	 */
	return boot_cpu_data.x86_phys_bits;
}

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_mmio_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;

	shadow_phys_bits = kvm_get_shadow_phys_bits();

	/*
	 * If the CPU has 46 or less physical address bits, then set an
	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
	 * assumed that the CPU is not vulnerable to L1TF.
	 *
	 * Some Intel CPUs address the L1 cache using more PA bits than are
	 * reported by CPUID. Use the PA width of the L1 cache when possible
	 * to achieve more effective mitigation, e.g. if system RAM overlaps
	 * the most significant bits of legal physical address space.
	 */
	shadow_nonpresent_or_rsvd_mask = 0;
	low_phys_bits = boot_cpu_data.x86_cache_bits;
	if (boot_cpu_data.x86_cache_bits <
	    52 - shadow_nonpresent_or_rsvd_mask_len) {
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(boot_cpu_data.x86_cache_bits -
				  shadow_nonpresent_or_rsvd_mask_len,
				  boot_cpu_data.x86_cache_bits - 1);
		low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
	} else
		WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));

	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return (pte != 0) && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
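/*
 * On 32-bit hosts a 64-bit spte cannot be read or written atomically, so it
 * is split into two 32-bit halves that are updated with explicit ordering.
 */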
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

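/*
 * Count present->non-present transitions so that __get_spte_lockless() can
 * detect a racing clear while it reads the two halves of the spte.
 */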
static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp =  page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first, then set the present bit, so the CPU cannot
	 * fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear the
	 * present bit first to avoid the vCPU fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of using a lightweight way to read the spte on x86_32 guests
 * comes from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running out of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_can_locklessly_be_made_writable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always atomically update the spte if it can be updated out of
	 * mmu-lock: this ensures the dirty bit is not lost and gives us a
	 * stable is_writable_pte() so that a needed TLB flush is not missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

static bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits, it means the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs. Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached on a CPU's TLB, the return value indicates this
 * case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * An spte updated out of mmu-lock is safe, since we always
	 * atomically update it; see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	      !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and tracks the
 * state bits; it is used to clear the last-level sptep.
 * Returns non-zero if the PTE was previously valid.
 */
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the page used by the
	 * KVM MMU, so before the page is reclaimed it must be
	 * unmapped from the MMU first.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return 1;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of the sptep;
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

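/*
 * Convert the spte into an access-tracked spte: with A/D bits just clear the
 * accessed bit, otherwise save the R/X bits into the "saved bits" area and
 * clear shadow_acc_track_mask so that the next access faults.
 */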
static u64 mark_spte_for_access_track(u64 spte)
{
	if (spte_ad_enabled(spte))
		return spte & ~shadow_accessed_mask;

	if (is_access_track_spte(spte))
		return spte;

	/*
	 * Making an Access Tracking PTE will result in removal of write access
	 * from the PTE. So, verify that we will be able to restore the write
	 * access in the fast page fault path later on.
	 */
	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
		  !spte_can_locklessly_be_made_writable(spte),
		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "kvm: Access Tracking saved bit locations are not zero\n");

	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
		shadow_acc_track_saved_bits_shift;
	spte &= ~shadow_acc_track_mask;

	return spte;
}

/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
			 & shadow_acc_track_saved_bits_mask;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
		      shadow_acc_track_saved_bits_shift);
	new_spte |= saved_bits;

	return new_spte;
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

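/*
 * Walking the shadow page tables locklessly: IRQs are disabled and
 * vcpu->mode is set to READING_SHADOW_PAGE_TABLES so that a concurrent
 * free-er waits until walk_shadow_page_lockless_end() is reached.
 */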
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	/*
	 * Prevent page table teardown by making any free-er wait during
	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
	 */
	local_irq_disable();

	/*
	 * Make sure a following spte read is not reordered ahead of the write
	 * to vcpu->mode.
	 */
	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * Make sure the write to vcpu->mode is not reordered in front of
	 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
	 */
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}

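/*
 * Pre-fill @cache with at least @min objects from @base_cache so that later
 * MMU code can take objects without having to call the allocator.
 */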
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
	return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!page)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)