/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>

#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);

#define emul_to_vcpu(ctxt) \
	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)

/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);

struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32  __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64  __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

/* lapic timer advance (tscdeadline mode only) in nanoseconds */
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);

static bool __read_mostly backwards_tsc_observed = false;

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;

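/*
 * Per-vcpu and per-VM statistics exported through debugfs; the VM_STAT()
 * and VCPU_STAT() macros above encode the offset of each counter.
 */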
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "req_event", VCPU_STAT(req_event) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ "max_mmu_page_hash_collisions",
		VM_STAT(max_mmu_page_hash_collisions) },
	{ NULL }
};

u64 __read_mostly host_xcr0;

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

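/*
 * User-return MSR handling: kvm_set_shared_msr() lazily loads a guest
 * value into one of the MSRs declared via kvm_define_shared_msr() and
 * registers a user_return_notifier; kvm_on_user_return() restores the
 * saved host values once the CPU returns to userspace.
 */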
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;
	unsigned long flags;

	/*
	 * Disabling irqs at this point since the following code could be
	 * interrupted and executed through kvm_arch_hardware_disable()
	 */
	local_irq_save(flags);
	if (locals->registered) {
		locals->registered = false;
		user_return_notifier_unregister(urn);
	}
	local_irq_restore(flags);
	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
}

static void shared_msr_update(unsigned slot, u32 msr)
{
	u64 value;
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	/* shared_msrs_global is only read here, and nobody should be
	 * modifying it at this time, so no lock is needed. */
	if (slot >= shared_msrs_global.nr) {
		printk(KERN_ERR "kvm: invalid MSR slot!");
		return;
	}
	rdmsrl_safe(msr, &value);
	smsr->values[slot].host = value;
	smsr->values[slot].curr = value;
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
	shared_msrs_global.msrs[slot] = msr;
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
	int err;

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
		return 0;
	smsr->values[slot].curr = value;
	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
	if (err)
		return 1;

	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

static void drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

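/*
 * Validate a write to MSR_IA32_APICBASE: reserved bits must be clear and
 * the xAPIC/x2APIC enable bits may only change through architecturally
 * allowed transitions (e.g. x2APIC cannot be entered directly from the
 * fully disabled state, nor left directly for xAPIC mode).
 */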
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u64 old_state = vcpu->arch.apic_base &
		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
	u64 new_state = msr_info->data &
		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);

	if (!msr_info->host_initiated &&
	    ((msr_info->data & reserved_bits) != 0 ||
	     new_state == X2APIC_ENABLE ||
	     (new_state == MSR_IA32_APICBASE_ENABLE &&
	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
	      old_state == 0)))
		return 1;

	kvm_lapic_set_base(vcpu, msr_info->data);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

asmlinkage __visible void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

#define EXCPT_FAULT		0
#define EXCPT_TRAP		1
#define EXCPT_ABORT		2
#define EXCPT_INTERRUPT		3

static int exception_type(int vector)
{
	unsigned int mask;

	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
		return EXCPT_INTERRUPT;

	mask = 1 << vector;

	/* #DB is trap, as instruction watchpoints are handled elsewhere */
	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
		return EXCPT_TRAP;

	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
		return EXCPT_ABORT;

	/* Reserved exceptions will result in fault */
	return EXCPT_FAULT;
}

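/*
 * Queue an exception, merging it with one that is already pending when
 * necessary: two contributory exceptions, or a page fault followed by a
 * non-benign exception, turn into #DF, and any exception raised while #DF
 * is pending escalates to a triple fault.
 */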
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	if (!vcpu->arch.exception.pending) {
	queue:
		if (has_error && !is_protmode(vcpu))
			has_error = false;
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.reinject = reinject;
		return;
	}

	/* check how the pending exception combines with the new one */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/* generate double fault per SDM Table 5-5 */
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = true;
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
	} else
		/* replace previous exception with a new one in the hope
		   that instruction re-execution will regenerate the lost
		   exception */
		goto queue;
}

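/*
 * kvm_queue_exception*() raise a new exception, the *_requeue_* variants
 * re-inject an exception that was already being delivered
 * (reinject == true), and the *_e variants carry an error code.
 */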
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		return kvm_skip_emulated_instruction(vcpu);

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;
	vcpu->arch.cr2 = fault->address;
	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
	else
		vcpu->arch.mmu.inject_page_fault(vcpu, fault);

	return fault->nested_page_fault;
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

/*
 * This function is used to read from the physical memory of the currently
 * running guest. The difference from kvm_vcpu_read_guest_page is that this
 * function can read from guest physical or from the guest's guest physical
 * memory.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	struct x86_exception exception;
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa     = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				      offset * sizeof(u64), sizeof(pdpte),
				      PFERR_USER_MASK|PFERR_WRITE_MASK);
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & PT_PRESENT_MASK) &&
		    (pdpte[i] &
		     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	bool changed = true;
	int offset;
	gfn_t gfn;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 kvm_read_cr3(vcpu)))
			return 1;
	}

	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return 1;

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
	}

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

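/*
 * kvm_load_guest_xcr0() installs the guest's XCR0 when CR4.OSXSAVE is
 * enabled; kvm_put_guest_xcr0() restores host_xcr0 afterwards if the
 * guest value was loaded.
 */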
static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
			!vcpu->guest_xcr0_loaded) {
		/* kvm_set_xcr() also depends on this */
		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
		vcpu->guest_xcr0_loaded = 1;
	}
}

static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
	if (vcpu->guest_xcr0_loaded) {
		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
		vcpu->guest_xcr0_loaded = 0;
	}
}

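/*
 * Validate a guest XSETBV: only XCR0 is supported, x87 state must always
 * be enabled, dependent feature bits (SSE/YMM, BNDREGS/BNDCSR, the
 * AVX-512 components) must be set or cleared together, and only bits in
 * guest_supported_xcr0 (plus FP) may be set.
 */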
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see fx_init).
	 */
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}
	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid(vcpu);
	return 0;
}

int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, index, xcr)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;

	if (cr4 & CR4_RESERVED_BITS)
		return 1;

	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
		return 1;

	if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
		return 1;

	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
		return 1;

	if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				   kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has_pcid(vcpu))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	if (kvm_x86_ops->set_cr4(vcpu, cr4))
		return 1;

	if (((cr4 ^ old_cr4) & pdptr_bits) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		kvm_update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

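/*
 * kvm_set_cr3() takes a fast path when CR3 is unchanged and the cached
 * PDPTEs are still valid: the roots are synced and a TLB flush is
 * requested instead of loading a new root via kvm_mmu_new_cr3().
 */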
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
#ifdef CONFIG_X86_64
	cr3 &= ~CR3_PCID_INVD;
#endif

	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS)
			return 1;
	} else if (is_pae(vcpu) && is_paging(vcpu) &&
		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
		return 1;

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	kvm_mmu_new_cr3(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

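/*
 * Debug register bookkeeping: while userspace debugging with hardware
 * breakpoints (KVM_GUESTDBG_USE_HW_BP) is active, the guest's DR values
 * are shadowed and only the effective/guest_debug copies reach hardware.
 */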
static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
	}
}

static void kvm_update_dr6(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
}

static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	kvm_x86_ops->set_dr7(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}

static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has_rtm(vcpu))
		fixed |= DR6_RTM;
	return fixed;
}

static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[dr] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		kvm_update_dr6(vcpu);
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	if (__kvm_set_dr(vcpu, dr, val)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[dr];
		break;
	case 4:
		/* fall through */
	case 6:
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
			*val = vcpu->arch.dr6;
		else
			*val = kvm_x86_ops->get_dr6(vcpu);
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	u64 data;
	int err;

	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
	if (err)
		return err;
	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
	return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
 * may depend on host virtualization features rather than host cpu features.
 */

static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
};

static unsigned num_msrs_to_save;

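/*
 * MSRs implemented by KVM itself (kvmclock, Hyper-V synthetic MSRs, ...);
 * as noted above, these are filtered on host virtualization features
 * rather than on raw host CPU capabilities.
 */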
static u32 emulated_msrs[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_SCONTROL,
	HV_X64_MSR_STIMER0_CONFIG,
	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSCDEADLINE,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_MCG_EXT_CTL,
	MSR_IA32_SMBASE,
	MSR_PLATFORM_INFO,
	MSR_MISC_FEATURES_ENABLES,
};

static unsigned num_emulated_msrs;

bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)