From f20d2752980c144c82649eb18746ef0c29f508dd Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Tue, 20 May 2008 13:13:50 +0200 Subject: [PATCH 01/19] KVM: ia64: fix zero extending for mmio ld1/2/4 emulation in KVM Only copy in the data actually requested by the instruction emulation and zero pad the destination register first. This avoids the problem where emulated mmio access got garbled data from ld2.acq instructions in the vga console driver. Signed-off-by: Jes Sorensen Acked-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/mmio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index 351bf70da46..7f1a858bc69 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -159,7 +159,8 @@ static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest, if (p->u.ioreq.state == STATE_IORESP_READY) { if (dir == IOREQ_READ) - *dest = p->u.ioreq.data; + /* it's necessary to ensure zero extending */ + *dest = p->u.ioreq.data & (~0UL >> (64-(s*8))); } else panic_vm(vcpu); out: From 33e3885de25148e00595c4dd808d6eb15db2edcf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 May 2008 15:34:25 +0300 Subject: [PATCH 02/19] KVM: x86 emulator: fix hypercall return value on AMD The hypercall instructions on Intel and AMD are different. KVM allows the guest to choose one or the other (the default is Intel), and if the guest chooses incorrectly, KVM will patch it at runtime to select the correct instruction. This allows live migration between Intel and AMD machines. This patching occurs in the x86 emulator. The current code also executes the hypercall. Unfortunately, the tail end of the x86 emulator code also executes, overwriting the return value of the hypercall with the original contents of rax (which happens to be the hypercall number). Fix not by executing the hypercall in the emulator context; instead let the guest reissue the patched instruction and execute the hypercall via the normal path. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 8a96320ab07..932f216d890 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1727,7 +1727,8 @@ twobyte_insn: if (rc) goto done; - kvm_emulate_hypercall(ctxt->vcpu); + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->vcpu->arch.rip; /* Disable writeback. */ c->dst.type = OP_NONE; break; From b8cee18cc75d7b9dbe6c6526dfae9ab49e84fa95 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 21 May 2008 13:37:16 +0200 Subject: [PATCH 03/19] KVM: s390: use yield instead of schedule to implement diag 0x44 diag 0x44 is the common way on s390 to yield the cpu to the hypervisor. It is called by the guest in cpu_relax and in the spinlock code to yield to other guest cpus. This semantic is similar to yield. Lets replace the call to schedule with yield to make sure that current is really yielding. Signed-off-by: Christian Borntraeger Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- arch/s390/kvm/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index f639a152869..a0775e1f08d 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -20,7 +20,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); vcpu->stat.diagnose_44++; vcpu_put(vcpu); - schedule(); + yield(); vcpu_load(vcpu); return 0; } From 74b6b522ec83f9c44fc7743f2adcb24664aa8f45 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 21 May 2008 13:37:29 +0200 Subject: [PATCH 04/19] KVM: s390: fix locking order problem in enable_sie There are potential locking problem in enable_sie. We take the task_lock and the mmap_sem. As exit_mm uses the same locks vice versa, this triggers a lockdep warning. The second problem is that dup_mm and mmput might sleep, so we must not hold the task_lock at that moment. The solution is to dup the mm unconditional and use the task_lock before and afterwards to check if we can use the new mm. dup_mm and mmput are called outside the task_lock, but we run update_mm while holding the task_lock, protection us against ptrace. Signed-off-by: Christian Borntraeger Signed-off-by: Carsten Otte Acked-by: Martin Schwidefsky Signed-off-by: Avi Kivity --- arch/s390/mm/pgtable.c | 44 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5c1aea97cd1..3d98ba82ea6 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -254,36 +254,46 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) int s390_enable_sie(void) { struct task_struct *tsk = current; - struct mm_struct *mm; - int rc; + struct mm_struct *mm, *old_mm; - task_lock(tsk); - - rc = 0; + /* Do we have pgstes? if yes, we are done */ if (tsk->mm->context.pgstes) - goto unlock; + return 0; - rc = -EINVAL; + /* lets check if we are allowed to replace the mm */ + task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || - tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) - goto unlock; + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + task_unlock(tsk); + return -EINVAL; + } + task_unlock(tsk); - tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ + /* we copy the mm with pgstes enabled */ + tsk->mm->context.pgstes = 1; mm = dup_mm(tsk); tsk->mm->context.pgstes = 0; - - rc = -ENOMEM; if (!mm) - goto unlock; - mmput(tsk->mm); + return -ENOMEM; + + /* Now lets check again if somebody attached ptrace etc */ + task_lock(tsk); + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + mmput(mm); + task_unlock(tsk); + return -EINVAL; + } + + /* ok, we are alone. No ptrace, no threads, etc. */ + old_mm = tsk->mm; tsk->mm = tsk->active_mm = mm; preempt_disable(); update_mm(mm, tsk); cpu_set(smp_processor_id(), mm->cpu_vm_mask); preempt_enable(); - rc = 0; -unlock: task_unlock(tsk); - return rc; + mmput(old_mm); + return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); From 71cde5879f10b639506bc0b9f29a89f58b42a17e Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 21 May 2008 13:37:34 +0200 Subject: [PATCH 05/19] KVM: s390: handle machine checks when guest is running The low-level interrupt handler on s390 checks for _TIF_WORK_INT and exits the guest context, if work is pending. TIF_WORK_INT is defined as_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING. Currently the sie loop checks for signals and reschedule, but it does not check for machine checks. That means that we exit the guest context if a machine check is pending, but we do not handle the machine check. Signed-off-by: Christian Borntraeger CC: Heiko Carstens Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 5 +++++ drivers/s390/s390mach.c | 1 + 2 files changed, 6 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0ac36a649eb..40e4f2de732 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -423,6 +423,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; /* not implemented yet */ } +extern void s390_handle_mcck(void); + static void __vcpu_run(struct kvm_vcpu *vcpu) { memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16); @@ -430,6 +432,9 @@ static void __vcpu_run(struct kvm_vcpu *vcpu) if (need_resched()) schedule(); + if (test_thread_flag(TIF_MCCK_PENDING)) + s390_handle_mcck(); + vcpu->arch.sie_block->icptcode = 0; local_irq_disable(); kvm_guest_enter(); diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 5080f343ad7..5bfbe765983 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -207,6 +207,7 @@ s390_handle_mcck(void) do_exit(SIGSEGV); } } +EXPORT_SYMBOL_GPL(s390_handle_mcck); /* * returns 0 if all registers could be validated From 0ff318674503ce3787ef62d84f4d948db204b268 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 21 May 2008 13:37:37 +0200 Subject: [PATCH 06/19] KVM: s390: fix interrupt delivery The current code delivers pending interrupts before it checks for need_resched. On a busy host, this can lead to a longer interrupt latency if the interrupt is injected while the process is scheduled away. This patch moves delivering the interrupt _after_ schedule(), which makes more sense. Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 40e4f2de732..ded27c7777c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -435,6 +435,8 @@ static void __vcpu_run(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); + kvm_s390_deliver_pending_interrupts(vcpu); + vcpu->arch.sie_block->icptcode = 0; local_irq_disable(); kvm_guest_enter(); @@ -480,7 +482,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) might_sleep(); do { - kvm_s390_deliver_pending_interrupts(vcpu); __vcpu_run(vcpu); rc = kvm_handle_sie_intercept(vcpu); } while (!signal_pending(current) && !rc); From 1f0d0f094df9a570dfc26d5eb825986b7e165e1d Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 21 May 2008 13:37:40 +0200 Subject: [PATCH 07/19] KVM: s390: Send program check on access error If the guest accesses non-existing memory, the sie64a function returns -EFAULT. We must check the return value and send a program check to the guest if the sie instruction faulted, otherwise the guest will loop at the faulting code. Signed-off-by: Christian Borntraeger Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ded27c7777c..6558b09ff57 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -443,7 +443,10 @@ static void __vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); - sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs); + if (sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs)) { + VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); + kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + } VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); local_irq_disable(); From e52b2af541bcb299212a63cfa3e3231618a415be Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 21 May 2008 13:37:44 +0200 Subject: [PATCH 08/19] KVM: s390: Fix race condition in kvm_s390_handle_wait The call to add_timer was issued before local_int.lock was taken and before timer_due was set to 0. If the timer expires before the lock is being taken, the timer function will set timer_due to 1 and exit before the vcpu falls asleep. Depending on other external events, the vcpu might sleep forever. This fix pulls setting timer_due to the beginning of the function before add_timer, which ensures correct behavior. Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- arch/s390/kvm/interrupt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fcd1ed8015c..84a7fed4cd4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -339,6 +339,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) if (kvm_cpu_has_interrupt(vcpu)) return 0; + __set_cpu_idle(vcpu); + spin_lock_bh(&vcpu->arch.local_int.lock); + vcpu->arch.local_int.timer_due = 0; + spin_unlock_bh(&vcpu->arch.local_int.lock); + if (psw_interrupts_disabled(vcpu)) { VCPU_EVENT(vcpu, 3, "%s", "disabled wait"); __unset_cpu_idle(vcpu); @@ -366,8 +371,6 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) no_timer: spin_lock_bh(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); - __set_cpu_idle(vcpu); - vcpu->arch.local_int.timer_due = 0; add_wait_queue(&vcpu->arch.local_int.wq, &wait); while (list_empty(&vcpu->arch.local_int.list) && list_empty(&vcpu->arch.local_int.float_int->list) && From ce263d70e509287ee761f9bba519342f57b121ca Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Wed, 21 May 2008 18:22:51 -0500 Subject: [PATCH 09/19] KVM: ppc: Remove duplicate function This was left behind from some code movement. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke_guest.c | 33 --------------------------------- include/asm-powerpc/kvm_ppc.h | 1 + 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c index 712d89a28c4..9c8ad850c6e 100644 --- a/arch/powerpc/kvm/booke_guest.c +++ b/arch/powerpc/kvm/booke_guest.c @@ -227,39 +227,6 @@ void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu) } } -static int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) -{ - enum emulation_result er; - int r; - - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - /* Future optimization: only reload non-volatiles if they were - * actually modified. */ - r = RESUME_GUEST_NV; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - /* We must reload nonvolatiles because "update" load/store - * instructions modify register state. */ - /* Future optimization: only reload non-volatiles if they were - * actually modified. */ - r = RESUME_HOST_NV; - break; - case EMULATE_FAIL: - /* XXX Deliver Program interrupt to guest. */ - printk(KERN_EMERG "%s: emulation failed (%08x)\n", __func__, - vcpu->arch.last_inst); - r = RESUME_HOST; - break; - default: - BUG(); - } - - return r; -} - /** * kvmppc_handle_exit * diff --git a/include/asm-powerpc/kvm_ppc.h b/include/asm-powerpc/kvm_ppc.h index b35a7e3ef97..5a21115228a 100644 --- a/include/asm-powerpc/kvm_ppc.h +++ b/include/asm-powerpc/kvm_ppc.h @@ -57,6 +57,7 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, u32 flags); From ac3cd34e4eb9e3dccaec8e586c073ba2660b322f Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Wed, 21 May 2008 18:22:52 -0500 Subject: [PATCH 10/19] KVM: ppc: add lwzx/stwz emulation Somehow these load/store instructions got missed before, but weren't used by the guest so didn't break anything. Signed-off-by: Hollis Blanchard Signed-off-by: Christian Ehrhardt Signed-off-by: Avi Kivity --- arch/powerpc/kvm/emulate.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index a03fe0c8069..00009746128 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -246,6 +246,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case 31: switch (get_xop(inst)) { + case 23: /* lwzx */ + rt = get_rt(inst); + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + break; + case 83: /* mfmsr */ rt = get_rt(inst); vcpu->arch.gpr[rt] = vcpu->arch.msr; @@ -267,6 +272,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); break; + case 151: /* stwx */ + rs = get_rs(inst); + emulated = kvmppc_handle_store(run, vcpu, + vcpu->arch.gpr[rs], + 4, 1); + break; + case 163: /* wrteei */ vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) | (inst & MSR_EE); From 52435b7c7a29f7dd7947c8c204494d7f52f14813 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Wed, 21 May 2008 18:22:53 -0500 Subject: [PATCH 11/19] KVM: ppc: Remove unmatched kunmap() call We're not calling kmap() now, so we shouldn't call kunmap() either. This has no practical effect in the non-highmem case, which is why it hasn't caused more obvious problems. Pointed out by Anthony Liguori. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index f5d7a5eab96..aa649c7db99 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -116,8 +116,6 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, struct tlbe *stlbe = &vcpu->arch.shadow_tlb[index]; struct page *page = vcpu->arch.shadow_pages[index]; - kunmap(vcpu->arch.shadow_pages[index]); - if (get_tlb_v(stlbe)) { if (kvmppc_44x_tlbe_is_writable(stlbe)) kvm_release_page_dirty(page); From 905fa4b9d6e2c9fd1c9ad84e3abe83021f498f53 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Wed, 21 May 2008 18:22:54 -0500 Subject: [PATCH 12/19] KVM: ppc: Use a read lock around MMU operations, and release it on error gfn_to_page() and kvm_release_page_clean() are called from other contexts with mmap_sem locked only for reading. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index aa649c7db99..1c48d6164bd 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -142,18 +142,19 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, stlbe = &vcpu->arch.shadow_tlb[victim]; /* Get reference to new page. */ - down_write(¤t->mm->mmap_sem); + down_read(¤t->mm->mmap_sem); new_page = gfn_to_page(vcpu->kvm, gfn); if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page!\n"); kvm_release_page_clean(new_page); + up_read(¤t->mm->mmap_sem); return; } hpaddr = page_to_phys(new_page); /* Drop reference to old page. */ kvmppc_44x_shadow_release(vcpu, victim); - up_write(¤t->mm->mmap_sem); + up_read(¤t->mm->mmap_sem); vcpu->arch.shadow_pages[victim] = new_page; From 9dcb40e1aa5bfe7d6ffc729f3c2b6c8f1392d2d3 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Wed, 21 May 2008 18:22:55 -0500 Subject: [PATCH 13/19] KVM: ppc: Report bad GFNs This code shouldn't be hit anyways, but when it is, it's useful to have a little more information about the failure. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 1c48d6164bd..75dff7cfa81 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -145,7 +145,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, down_read(¤t->mm->mmap_sem); new_page = gfn_to_page(vcpu->kvm, gfn); if (is_error_page(new_page)) { - printk(KERN_ERR "Couldn't get guest page!\n"); + printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); kvm_release_page_clean(new_page); up_read(¤t->mm->mmap_sem); return; From 2f5997140f22f68f6390c49941150d3fa8a95cb7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 May 2008 12:10:20 -0300 Subject: [PATCH 14/19] KVM: migrate PIT timer Migrate the PIT timer to the physical CPU which vcpu0 is scheduled on, similarly to what is done for the LAPIC timers, otherwise PIT interrupts will be delayed until an unrelated event causes an exit. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 14 +++++++++++++- arch/x86/kvm/irq.c | 6 ++++++ arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 2 +- 7 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7c077a9d977..f2f5d260874 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,7 +200,6 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - /* FIXME: handle case where the guest is in guest mode */ if (vcpu0 && waitqueue_active(&vcpu0->wq)) { vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(&vcpu0->wq); @@ -237,6 +236,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct hrtimer *timer; + + if (vcpu->vcpu_id != 0 || !pit) + return; + + timer = &pit->pit_state.pit_timer.timer; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} + static void destroy_pit_timer(struct kvm_kpit_timer *pt) { pr_debug("pit: execute del timer!\n"); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ce1f583459b..76d736b5f66 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); + +void __kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_pit_timer(vcpu); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 1802134b836..2a15be2275c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab22615eee8..6b0d5fa5bab 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe4db11989..96445f34151 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -608,7 +608,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21338bdb28f..00acf1301a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2758,7 +2758,7 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_timers(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 398978972b7..092b1b25291 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -297,7 +297,7 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) return (gpa_t)gfn << PAGE_SHIFT; } -static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); } From e693d71b46e64536581bf4884434fc1b8797e96f Mon Sep 17 00:00:00 2001 From: Eli Collins Date: Sun, 1 Jun 2008 20:24:40 -0700 Subject: [PATCH 15/19] KVM: VMX: Clear CR4.VMXE in hardware_disable Clear CR4.VMXE in hardware_disable. There's no reason to leave it set after doing a VMXOFF. VMware Workstation 6.5 checks CR4.VMXE as a proxy for whether the CPU is in VMX mode, so leaving VMXE set means we'll refuse to power on. With this change the user can power on after unloading the kvm-intel module. I tested on kvm-67 and kvm-69. Signed-off-by: Eli Collins Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 96445f34151..02efbe75f31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1036,6 +1036,7 @@ static void hardware_enable(void *garbage) static void hardware_disable(void *garbage) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, From 8d2d73b9a5c35f2c6abf427afba7888cfc4cc65d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 4 Jun 2008 18:42:24 +0300 Subject: [PATCH 16/19] KVM: MMU: reschedule during shadow teardown Shadows for large guests can take a long time to tear down, so reschedule occasionally to avoid softlockup warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7246b60afb9..c2fd6a4a58c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1858,6 +1858,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu->kvm, sp); + cond_resched(); } free_page((unsigned long)vcpu->arch.mmu.pae_root); } From ff4b9df877b30b8a371d706d3552999dee450738 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 5 Jun 2008 00:08:11 -0300 Subject: [PATCH 17/19] KVM: IOAPIC: only set remote_irr if interrupt was injected There's a bug in the IOAPIC code for level-triggered interrupts. Its relatively easy to trigger by sharing (virtio-blk + usbtablet was the testcase, initially reported by Gerd von Egidy). The "remote_irr" variable is used to indicate accepted but not yet acked interrupts. Its cleared from the EOI handler. Problem is that the EOI handler clears remote_irr unconditionally, even if it reinjected another pending interrupt. In that case, kvm_ioapic_set_irq() proceeds to ioapic_service() which sets remote_irr even if it failed to inject (since the IRR was high due to EOI reinjection). Since the TMR bit has been cleared by the first EOI, the second one fails to clear remote_irr. End result is interrupt line dead. Fix it by setting remote_irr only if a new pending interrupt has been generated (and the TMR bit for vector in question set). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 4232fd75dd2..98778cb69c6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -45,7 +45,7 @@ #else #define ioapic_debug(fmt, arg...) #endif -static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); +static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, unsigned long addr, @@ -89,8 +89,8 @@ static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) pent = &ioapic->redirtbl[idx]; if (!pent->fields.mask) { - ioapic_deliver(ioapic, idx); - if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) + int injected = ioapic_deliver(ioapic, idx); + if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) pent->fields.remote_irr = 1; } if (!pent->fields.trig_mode) @@ -133,7 +133,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } } -static void ioapic_inj_irq(struct kvm_ioapic *ioapic, +static int ioapic_inj_irq(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu, u8 vector, u8 trig_mode, u8 delivery_mode) { @@ -143,7 +143,7 @@ static void ioapic_inj_irq(struct kvm_ioapic *ioapic, ASSERT((delivery_mode == IOAPIC_FIXED) || (delivery_mode == IOAPIC_LOWEST_PRIORITY)); - kvm_apic_set_irq(vcpu, vector, trig_mode); + return kvm_apic_set_irq(vcpu, vector, trig_mode); } static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, @@ -186,7 +186,7 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, return mask; } -static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { u8 dest = ioapic->redirtbl[irq].fields.dest_id; u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; @@ -195,7 +195,7 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; struct kvm_vcpu *vcpu; - int vcpu_id; + int vcpu_id, r = 0; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", @@ -204,7 +204,7 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); if (!deliver_bitmask) { ioapic_debug("no target on destination\n"); - return; + return 0; } switch (delivery_mode) { @@ -216,7 +216,7 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) vcpu = ioapic->kvm->vcpus[0]; #endif if (vcpu != NULL) - ioapic_inj_irq(ioapic, vcpu, vector, + r = ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); else ioapic_debug("null lowest prio vcpu: " @@ -234,7 +234,7 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - ioapic_inj_irq(ioapic, vcpu, vector, + r = ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); } } @@ -246,6 +246,7 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) delivery_mode); break; } + return r; } void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) From ebb0e6264c7a65c51feb3575e9edb58eab0cf469 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:58 +0300 Subject: [PATCH 18/19] KVM: MMU: Fix printk() format string Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10288a..934c7b61939 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* mmio */ if (is_error_pfn(pfn)) { - pgprintk("gfn %x is mmio\n", walker.gfn); + pgprintk("gfn %lx is mmio\n", walker.gfn); kvm_release_pfn_clean(pfn); return 1; } From 3c9155106d589584f67b026ec444e69c4a68d7dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:13 +0300 Subject: [PATCH 19/19] KVM: MMU: Fix is_empty_shadow_page() check The check is only looking at one of two possible empty ptes. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2fd6a4a58c..ee3f53098f0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,7 +658,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (*pos != shadow_trap_nonpresent_pte) { + if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); return 0;