From 42bdf991f4cad9678ee2b98c5c2e9299a3f986ef Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 15 Apr 2013 23:30:13 -0300 Subject: [PATCH 1/6] KVM: x86: fix maintenance of guest/host xcr0 state Emulation of xcr0 writes zero guest_xcr0_loaded variable so that subsequent VM-entry reloads CPU's xcr0 with guests xcr0 value. However, this is incorrect because guest_xcr0_loaded variable is read to decide whether to reload hosts xcr0. In case the vcpu thread is scheduled out after the guest_xcr0_loaded = 0 assignment, and scheduler decides to preload FPU: switch_to { __switch_to __math_state_restore restore_fpu_checking fpu_restore_checking if (use_xsave()) fpu_xrstor_checking xrstor64 with CPU's xcr0 == guests xcr0 Fix by properly restoring hosts xcr0 during emulation of xcr0 writes. Analyzed-by: Ulrich Obergfell Signed-off-by: Marcelo Tosatti Reviewed-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05a8b1a2300d..094b5d96ab14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -555,6 +555,25 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0; @@ -571,8 +590,8 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) return 1; if (xcr0 & ~host_xcr0) return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; - vcpu->guest_xcr0_loaded = 0; return 0; } @@ -5614,25 +5633,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} - -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } -} - static void process_nmi(struct kvm_vcpu *vcpu) { unsigned limit = 2; From 7dac16c379a876e256bc7349cd80007e7f9f2b59 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 8 May 2013 10:57:29 +0800 Subject: [PATCH 2/6] KVM: Fix kvm_irqfd_init initialization In commit a0f155e96 'KVM: Initialize irqfd from kvm_init()', when kvm_init() is called the second time (e.g kvm-amd.ko and kvm-intel.ko), kvm_arch_init() will fail with -EEXIST, then kvm_irqfd_exit() will be called on the error handling path. This way, the kvm_irqfd system will not be ready. This patch fix the following: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] _raw_spin_lock+0xe/0x30 PGD 0 Oops: 0002 [#1] SMP Modules linked in: vhost_net CPU 6 Pid: 4257, comm: qemu-system-x86 Not tainted 3.9.0-rc3+ #757 Dell Inc. OptiPlex 790/0V5HMK RIP: 0010:[] [] _raw_spin_lock+0xe/0x30 RSP: 0018:ffff880221721cc8 EFLAGS: 00010046 RAX: 0000000000000100 RBX: ffff88022dcc003f RCX: ffff880221734950 RDX: ffff8802208f6ca8 RSI: 000000007fffffff RDI: 0000000000000000 RBP: ffff880221721cc8 R08: 0000000000000002 R09: 0000000000000002 R10: 00007f7fd01087e0 R11: 0000000000000246 R12: ffff8802208f6ca8 R13: 0000000000000080 R14: ffff880223e2a900 R15: 0000000000000000 FS: 00007f7fd38488e0(0000) GS:ffff88022dcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000022309f000 CR4: 00000000000427e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process qemu-system-x86 (pid: 4257, threadinfo ffff880221720000, task ffff880222bd5640) Stack: ffff880221721d08 ffffffff810ac5c5 ffff88022431dc00 0000000000000086 0000000000000080 ffff880223e2a900 ffff8802208f6ca8 0000000000000000 ffff880221721d48 ffffffff810ac8fe 0000000000000000 ffff880221734000 Call Trace: [] __queue_work+0x45/0x2d0 [] queue_work_on+0x8e/0xa0 [] queue_work+0x19/0x20 [] irqfd_deactivate+0x4b/0x60 [] kvm_irqfd+0x39d/0x580 [] kvm_vm_ioctl+0x207/0x5b0 [] ? update_curr+0xf5/0x180 [] do_vfs_ioctl+0x98/0x550 [] ? finish_task_switch+0x4e/0xe0 [] ? __schedule+0x2ea/0x710 [] sys_ioctl+0x57/0x90 [] ? trace_hardirqs_on_thunk+0x3a/0x3c [] system_call_fastpath+0x16/0x1b Code: c1 ea 08 38 c2 74 0f 66 0f 1f 44 00 00 f3 90 0f b6 03 38 c2 75 f7 48 83 c4 08 5b c9 c3 55 48 89 e5 66 66 66 66 90 b8 00 01 00 00 66 0f c1 07 89 c2 66 c1 ea 08 38 c2 74 0c 0f 1f 00 f3 90 0f RIP [] _raw_spin_lock+0xe/0x30 RSP CR2: 0000000000000000 ---[ end trace 13fb1e4b6e5ab21f ]--- Signed-off-by: Asias He Acked-by: Cornelia Huck Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 45f09362ee7b..8a1889ccb883 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3105,13 +3105,21 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - r = kvm_irqfd_init(); - if (r) - goto out_irqfd; r = kvm_arch_init(opaque); if (r) goto out_fail; + /* + * kvm_arch_init makes sure there's at most one caller + * for architectures that support multiple implementations, + * like intel and amd on x86. + * kvm_arch_init must be called before kvm_irqfd_init to avoid creating + * conflicts in case kvm is already setup for another implementation. + */ + r = kvm_irqfd_init(); + if (r) + goto out_irqfd; + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; @@ -3186,10 +3194,10 @@ out_free_1: out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: - kvm_arch_exit(); -out_fail: kvm_irqfd_exit(); out_irqfd: + kvm_arch_exit(); +out_fail: return r; } EXPORT_SYMBOL_GPL(kvm_init); From 8d76c49e9ffeee839bc0b7a3278a23f99101263e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 8 May 2013 18:38:44 +0300 Subject: [PATCH 3/6] KVM: VMX: fix halt emulation while emulating invalid guest sate The invalid guest state emulation loop does not check halt_request which causes 100% cpu loop while guest is in halt and in invalid state, but more serious issue is that this leaves halt_request set, so random instruction emulated by vm86 #GP exit can be interpreted as halt which causes guest hang. Fix both problems by handling halt_request in emulation loop. Reported-by: Tomas Papan Tested-by: Tomas Papan Reviewed-by: Paolo Bonzini CC: stable@vger.kernel.org Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 25a791ed21c8..260a91939555 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5434,6 +5434,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 0; } + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + ret = kvm_emulate_halt(vcpu); + goto out; + } + if (signal_pending(current)) goto out; if (need_resched()) From a035d5c64d08a8ac12d81b596e7fa6d95a73c347 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2013 11:32:49 +0200 Subject: [PATCH 4/6] KVM: emulator: emulate AAM This is used by SGABIOS, KVM breaks with emulate_invalid_guest_state=1. AAM needs the source operand to be unsigned; do the same in AAD as well for consistency, even though it does not affect the result. Reported-by: Jun'ichi Nomura Cc: stable@vger.kernel.org # 3.9 Signed-off-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8e517bba6a7c..55322a7c113d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2996,6 +2996,28 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_aam(struct x86_emulate_ctxt *ctxt) +{ + u8 al, ah; + + if (ctxt->src.val == 0) + return emulate_de(ctxt); + + al = ctxt->dst.val & 0xff; + ah = al / ctxt->src.val; + al %= ctxt->src.val; + + ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8); + + /* Set PF, ZF, SF */ + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + fastop(ctxt, em_or); + + return X86EMUL_CONTINUE; +} + static int em_aad(struct x86_emulate_ctxt *ctxt) { u8 al = ctxt->dst.val & 0xff; @@ -3936,7 +3958,8 @@ static const struct opcode opcode_table[256] = { /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), - N, I(DstAcc | SrcImmByte | No64, em_aad), N, N, + I(DstAcc | SrcImmUByte | No64, em_aam), + I(DstAcc | SrcImmUByte | No64, em_aad), N, N, /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ From 7fa57952d70f5737513d8319395e471d107e4e0d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2013 11:32:50 +0200 Subject: [PATCH 5/6] KVM: emulator: emulate XLAT This is used by SGABIOS, KVM breaks with emulate_invalid_guest_state=1. It is just a MOV in disguise, with a funny source address. Reported-by: Jun'ichi Nomura Cc: stable@vger.kernel.org # 3.9 Signed-off-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 55322a7c113d..a06a550c2db8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -60,6 +60,7 @@ #define OpGS 25ull /* GS */ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ +#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -99,6 +100,7 @@ #define SrcImmUByte (OpImmUByte << SrcShift) #define SrcImmU (OpImmU << SrcShift) #define SrcSI (OpSI << SrcShift) +#define SrcXLat (OpXLat << SrcShift) #define SrcImmFAddr (OpImmFAddr << SrcShift) #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) @@ -3959,7 +3961,8 @@ static const struct opcode opcode_table[256] = { G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), I(DstAcc | SrcImmUByte | No64, em_aam), - I(DstAcc | SrcImmUByte | No64, em_aad), N, N, + I(DstAcc | SrcImmUByte | No64, em_aad), N, + I(DstAcc | SrcXLat | ByteOp, em_mov), /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ @@ -4221,6 +4224,16 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->val = 0; op->count = 1; break; + case OpXLat: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, + reg_read(ctxt, VCPU_REGS_RBX) + + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); + op->addr.mem.seg = seg_override(ctxt); + op->val = 0; + break; case OpImmFAddr: op->type = OP_IMM; op->addr.mem.ea = ctxt->_eip; From 326f578f7e1443bac2333712dd130a261ec15288 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2013 11:32:51 +0200 Subject: [PATCH 6/6] KVM: emulator: emulate SALC This is an almost-undocumented instruction available in 32-bit mode. I say "almost" undocumented because AMD documents it in their opcode maps just to say that it is unavailable in 64-bit mode (sections "A.2.1 One-Byte Opcodes" and "B.3 Invalid and Reassigned Instructions in 64-Bit Mode"). It is roughly equivalent to "sbb %al, %al" except it does not set the flags. Use fastop to emulate it, but do not use the opcode directly because it would fail if the host is 64-bit! Reported-by: Jun'ichi Nomura Cc: stable@vger.kernel.org # 3.9 Signed-off-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a06a550c2db8..8db0010ed150 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -535,6 +535,9 @@ FOP_SETCC(setle) FOP_SETCC(setnle) FOP_END; +FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET +FOP_END; + #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ @@ -3961,7 +3964,8 @@ static const struct opcode opcode_table[256] = { G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), I(DstAcc | SrcImmUByte | No64, em_aam), - I(DstAcc | SrcImmUByte | No64, em_aad), N, + I(DstAcc | SrcImmUByte | No64, em_aad), + F(DstAcc | ByteOp | No64, em_salc), I(DstAcc | SrcXLat | ByteOp, em_mov), /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,