/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>

#include <asm/virtext.h>
#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id svm_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_SVM),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);

#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define SVM_FEATURE_NPT            (1 <<  0)
#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_NRIP           (1 <<  3)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

#define SVM_AVIC_DOORBELL	0xc001011b

#define NESTED_EXIT_HOST	0	/* Exit handled on host level */
#define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
#define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

#define TSC_RATIO_RSVD          0xffffff0000000000ULL
#define TSC_RATIO_MIN		0x0000000000000001ULL
#define TSC_RATIO_MAX		0x000000ffffffffffULL

#define AVIC_HPA_MASK	~((0xFFFULL << 52) | 0xFFF)

/*
 * 0xff is broadcast, so the max index allowed for physical APIC ID
 * table is 0xfe.  APIC IDs above 0xff are reserved.
 */
#define AVIC_MAX_PHYSICAL_ID_COUNT	255

#define AVIC_UNACCEL_ACCESS_WRITE_MASK		1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF

/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS		8
#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)

#define AVIC_VM_ID_BITS			24
#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(x, y)		(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
						(y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)
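
/*
 * For example, with vm_id = 0x2 and vcpu_id = 0x5 (arbitrary values chosen
 * purely for illustration):
 *
 *	AVIC_GATAG(0x2, 0x5)		== 0x205
 *	AVIC_GATAG_TO_VMID(0x205)	== 0x2
 *	AVIC_GATAG_TO_VCPUID(0x205)	== 0x5
 *
 * i.e. the VM ID lives in bits 31:8 of the tag and the vCPU ID in bits 7:0.
 * avic_ga_log_notifier() below relies on this round trip to recover the
 * target VM and vCPU from the tag reported in the IOMMU GA log.
 */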

static bool erratum_383_found __read_mostly;

static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
	MSR_FS_BASE,
#endif
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_TSC_AUX,
};

#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)

struct kvm_vcpu;

struct nested_state {
	struct vmcb *hsave;
	u64 hsave_msr;
	u64 vm_cr_msr;
	u64 vmcb;

	/* These are the merged vectors */
	u32 *msrpm;

	/* gpa pointers to the real vectors */
	u64 vmcb_msrpm;
	u64 vmcb_iopm;

	/* A VMEXIT is required but not yet emulated */
	bool exit_required;

	/* cache for intercepts of the guest */
	u32 intercept_cr;
	u32 intercept_dr;
	u32 intercept_exceptions;
	u64 intercept;

	/* Nested Paging related state */
	u64 nested_cr3;
};

#define MSRPM_OFFSETS	16
static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to a higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

struct vcpu_svm {
	struct kvm_vcpu vcpu;
	struct vmcb *vmcb;
	unsigned long vmcb_pa;
	struct svm_cpu_data *svm_data;
	uint64_t asid_generation;
	uint64_t sysenter_esp;
	uint64_t sysenter_eip;
	uint64_t tsc_aux;

	u64 next_rip;

	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
	struct {
		u16 fs;
		u16 gs;
		u16 ldt;
		u64 gs_base;
	} host;

	u32 *msrpm;

	ulong nmi_iret_rip;

	struct nested_state nested;

	bool nmi_singlestep;

	unsigned int3_injected;
	unsigned long int3_rip;
	u32 apf_reason;

	/* cached guest cpuid flags for faster access */
	bool nrips_enabled	: 1;

	u32 ldr_reg;
	struct page *avic_backing_page;
	u64 *avic_physical_id_cache;
	bool avic_is_running;

	/*
	 * Per-vcpu list of struct amd_svm_iommu_ir:
	 * This is used mainly to store interrupt remapping information used
	 * when updating the vcpu affinity. This avoids the need to scan for
	 * IRTE and try to match ga_tag in the IOMMU driver.
	 */
	struct list_head ir_list;
	spinlock_t ir_list_lock;
};

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK		(1 << 31)

#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK	(0xFFULL)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	(0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK		(1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK		(1ULL << 63)

static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT	0x0100000000ULL

#define MSR_INVALID			0xffffffffU

static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_INVALID,				.always = false },
};

/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
static bool npt_enabled;
#endif

/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable / disable AVIC */
static int avic;
#ifdef CONFIG_X86_LOCAL_APIC
module_param(avic, int, S_IRUGO);
#endif

/* AVIC VM ID bit masks and lock */
static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
static DEFINE_SPINLOCK(avic_vm_id_lock);

static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);

static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				      bool has_error_code, u32 error_code);

enum {
	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
			    pause filter count */
	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
	VMCB_ASID,	 /* ASID */
	VMCB_INTR,	 /* int_ctl, int_vector */
	VMCB_NPT,        /* npt_en, nCR3, gPAT */
	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
	VMCB_DR,         /* DR6, DR7 */
	VMCB_DT,         /* GDT, IDT */
	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
	VMCB_CR2,        /* CR2 only */
	VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
	VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
			  * AVIC PHYSICAL_TABLE pointer,
			  * AVIC LOGICAL_TABLE pointer
			  */
	VMCB_DIRTY_MAX,
};

/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))

#define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL

static inline void mark_all_dirty(struct vmcb *vmcb)
{
	vmcb->control.clean = 0;
}

static inline void mark_all_clean(struct vmcb *vmcb)
{
	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
			       & ~VMCB_ALWAYS_DIRTY_MASK;
}

static inline void mark_dirty(struct vmcb *vmcb, int bit)
{
	vmcb->control.clean &= ~(1 << bit);
}
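
/*
 * Usage sketch: the clean bits tell the CPU which VMCB field groups are
 * unchanged since the last VMRUN and may be skipped when reloading guest
 * state.  Code that touches a group must therefore clear the matching bit
 * again; see for instance svm_set_efer() below, which updates save.efer
 * and then calls mark_dirty(vmcb, VMCB_CR).  mark_all_dirty() is the
 * conservative fallback when it is not known which fields changed.
 */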

static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_svm, vcpu);
}

static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
	svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
	mark_dirty(svm->vmcb, VMCB_AVIC);
}

static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 *entry = svm->avic_physical_id_cache;

	if (!entry)
		return false;

	return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
}

static void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h;
	struct nested_state *g;

	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
	g = &svm->nested;

	c->intercept_cr = h->intercept_cr | g->intercept_cr;
	c->intercept_dr = h->intercept_dr | g->intercept_dr;
	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
	c->intercept = h->intercept | g->intercept;
}
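
/*
 * Rough illustration of the merge above: while the vCPU is in guest mode
 * the effective intercept masks are the OR of what the host (L0) wants,
 * taken from nested.hsave->control, and what the nested hypervisor (L1)
 * requested, cached in svm->nested.  So if L1 intercepts CR0 writes and
 * L0 does not, the merged intercept_cr still has that bit set; the
 * resulting #VMEXIT is taken by L0 first and nested_svm_exit_handled()
 * decides whether it must be reflected to L1.
 */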

static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu))
		return svm->nested.hsave;
	else
		return svm->vmcb;
}

static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr |= (1U << bit);

	recalc_intercepts(svm);
}

static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr &= ~(1U << bit);

	recalc_intercepts(svm);
}

static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	return vmcb->control.intercept_cr & (1U << bit);
}

static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
		| (1 << INTERCEPT_DR1_READ)
		| (1 << INTERCEPT_DR2_READ)
		| (1 << INTERCEPT_DR3_READ)
		| (1 << INTERCEPT_DR4_READ)
		| (1 << INTERCEPT_DR5_READ)
		| (1 << INTERCEPT_DR6_READ)
		| (1 << INTERCEPT_DR7_READ)
		| (1 << INTERCEPT_DR0_WRITE)
		| (1 << INTERCEPT_DR1_WRITE)
		| (1 << INTERCEPT_DR2_WRITE)
		| (1 << INTERCEPT_DR3_WRITE)
		| (1 << INTERCEPT_DR4_WRITE)
		| (1 << INTERCEPT_DR5_WRITE)
		| (1 << INTERCEPT_DR6_WRITE)
		| (1 << INTERCEPT_DR7_WRITE);

	recalc_intercepts(svm);
}

static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_dr = 0;

	recalc_intercepts(svm);
}

static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_exceptions |= (1U << bit);

	recalc_intercepts(svm);
}

static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_exceptions &= ~(1U << bit);

	recalc_intercepts(svm);
}

static inline void set_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept |= (1ULL << bit);

	recalc_intercepts(svm);
}

static inline void clr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept &= ~(1ULL << bit);

	recalc_intercepts(svm);
}

static inline void enable_gif(struct vcpu_svm *svm)
{
	svm->vcpu.arch.hflags |= HF_GIF_MASK;
}

static inline void disable_gif(struct vcpu_svm *svm)
{
	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}

static inline bool gif_set(struct vcpu_svm *svm)
{
	return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

struct svm_cpu_data {
	int cpu;

	u64 asid_generation;
	u32 max_asid;
	u32 next_asid;
	struct kvm_ldttss_desc *tss_desc;

	struct page *save_area;
};

static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

struct svm_init_data {
	int cpu;
	int r;
};

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

static u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}
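
/*
 * Worked example (values only for illustration): MSR_STAR is 0xc0000081,
 * which falls into the second range (base 0xc0000000).  The byte offset
 * into the MSRPM is therefore (0x81 / 4) + 1 * MSRS_RANGE_SIZE =
 * 32 + 2048 = 2080, and the returned u32 offset is 2080 / 4 = 520.
 * set_msr_interception() below then flips the read/write bit pair for
 * the MSR inside that u32.
 */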

#define MAX_INST_SIZE 15

static inline void clgi(void)
{
	asm volatile (__ex(SVM_CLGI));
}

static inline void stgi(void)
{
	asm volatile (__ex(SVM_STGI));
}

static inline void invlpga(unsigned long addr, u32 asid)
{
	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return PT64_ROOT_LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	vcpu->arch.efer = efer;
	if (!npt_enabled && !(efer & EFER_LMA))
		efer &= ~EFER_LME;

	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
				EMULATE_DONE)
			printk(KERN_DEBUG "%s: NOP\n", __func__);
		return;
	}
	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
		       __func__, kvm_rip_read(vcpu), svm->next_rip);

	kvm_rip_write(vcpu, svm->next_rip);
	svm_set_interrupt_shadow(vcpu, 0);
}

static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
				bool has_error_code, u32 error_code,
				bool reinject)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * If we are within a nested VM we'd better #VMEXIT and let the guest
	 * handle the exception
	 */
	if (!reinject &&
	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
		return;

	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		skip_emulated_instruction(&svm->vcpu);
		rip = kvm_rip_read(&svm->vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	return 1;
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
	}

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());

	if (!sd)
		return;

	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
	__free_page(sd->save_area);
	kfree(sd);
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd;
	int r;

	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
		return -ENOMEM;
	sd->cpu = cpu;
	sd->save_area = alloc_page(GFP_KERNEL);
	r = -ENOMEM;
	if (!sd->save_area)
		goto err_1;

	per_cpu(svm_data, cpu) = sd;

	return 0;

err_1:
	kfree(sd);
	return r;

}

static bool valid_msr_intercept(u32 index)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == index)
			return true;

	return false;
}

static void set_msr_interception(u32 *msrpm, unsigned msr,
				 int read, int write)
{
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers, extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;
}
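
/*
 * Continuing the MSR_STAR (0xc0000081) example from svm_msrpm_offset():
 * msr & 0x0f is 1, so bit_read is 2 and bit_write is 3 within msrpm[520].
 * Passing read = 1 and write = 1 clears both bits, i.e. the guest accesses
 * the MSR without intercept; passing 0 sets the bits and the accesses trap
 * again.
 */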

static void svm_vcpu_init_msrpm(u32 *msrpm)
{
	int i;

	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;

		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
	}
}

static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

static void svm_enable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

	svm->vmcb->control.lbr_ctl = 1;
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

	svm->vmcb->control.lbr_ctl = 0;
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

/* Note:
 * This hash table is used to map VM_ID to a struct kvm_arch
 * when handling an AMD IOMMU GALOG notification, to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);

/* Note:
 * This function is called from the IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
static int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_arch *ka = NULL;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
		struct kvm *kvm = container_of(ka, struct kvm, arch);
		struct kvm_arch *vm_data = &kvm->arch;

		if (vm_data->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
