Features:

* Performance improvement to lower the amount of traps the hypervisor
    has to do 32-bit guests. Mainly for setting PTE entries and updating
    TLS descriptors.
  * MCE polling driver to collect hypervisor MCE buffer and present them to
    /dev/mcelog.
  * Physical CPU online/offline support. When an privileged guest is booted
    it is present with virtual CPUs, which might have an 1:1 to physical
    CPUs but usually don't. This provides mechanism to offline/online physical
    CPUs.
 Bug-fixes for:
  * Coverity found fixes in the console and ACPI processor driver.
  * PVonHVM kexec fixes along with some cleanups.
  * Pages that fall within E820 gaps and non-RAM regions (and had been
    released to hypervisor) would be populated back, but potentially in
    non-RAM regions.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.12 (GNU/Linux)
 
 iQEcBAABAgAGBQJQDWcvAAoJEFjIrFwIi8fJ6GAH/iFIkOC5wseD8qZ9nV4VI46t
 0GYvBFC4F91NvC7CNfoAySr84v+ZORIZzMcdyDF8H/tLO9MaOY/Mwn0S5ZSqmYMi
 rhskvK3InBaVkYtceOHugNGM7mB0c3STIm7OsjW6gbVzohmTN25rbQR+X5iWAtVA
 cTUtDyH3AU15mwuVT3U+VC4IulHpnNJz4pHoq3Sn61/UK1LYmhLXYd5fveA0D0B8
 lRZTAvNMsYDJDDmkWNrs8RczKkQ86DTSjfGawm0YG+Gf94GgD5yMHWbiHh2Gy93e
 u7sHK0RrKbP5BY/MV6vVJxkoV5NoWgCc0tcjBcYwdyvwzxDS75UhV6uoVHC3Ao8=
 =drt2
 -----END PGP SIGNATURE-----

Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen

Pull Xen update from Konrad Rzeszutek Wilk:
 "Features:
   * Performance improvement to lower the amount of traps the hypervisor
     has to do 32-bit guests.  Mainly for setting PTE entries and
     updating TLS descriptors.
   * MCE polling driver to collect hypervisor MCE buffer and present
     them to /dev/mcelog.
   * Physical CPU online/offline support.  When an privileged guest is
     booted it is present with virtual CPUs, which might have an 1:1 to
     physical CPUs but usually don't.  This provides mechanism to
     offline/online physical CPUs.
  Bug-fixes for:
   * Coverity found fixes in the console and ACPI processor driver.
   * PVonHVM kexec fixes along with some cleanups.
   * Pages that fall within E820 gaps and non-RAM regions (and had been
     released to hypervisor) would be populated back, but potentially in
     non-RAM regions."

* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: populate correct number of pages when across mem boundary (v2)
  xen PVonHVM: move shared_info to MMIO before kexec
  xen: simplify init_hvm_pv_info
  xen: remove cast from HYPERVISOR_shared_info assignment
  xen: enable platform-pci only in a Xen guest
  xen/pv-on-hvm kexec: shutdown watches from old kernel
  xen/x86: avoid updating TLS descriptors if they haven't changed
  xen/x86: add desc_equal() to compare GDT descriptors
  xen/mm: zero PTEs for non-present MFNs in the initial page table
  xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
  xen/hvc: Fix up checks when the info is allocated.
  xen/acpi: Fix potential memory leak.
  xen/mce: add .poll method for mcelog device driver
  xen/mce: schedule a workqueue to avoid sleep in atomic context
  xen/pcpu: Xen physical cpus online/offline sys interface
  xen/mce: Register native mce handler as vMCE bounce back point
  x86, MCE, AMD: Adjust initcall sequence for xen
  xen/mce: Add mcelog support for Xen platform
This commit is contained in:
Linus Torvalds 2012-07-24 13:14:03 -07:00
commit 62c4d9afa4
23 changed files with 1514 additions and 93 deletions

View File

@ -0,0 +1,20 @@
What: /sys/devices/system/xen_cpu/
Date: May 2012
Contact: Liu, Jinsong <jinsong.liu@intel.com>
Description:
A collection of global/individual Xen physical cpu attributes
Individual physical cpu attributes are contained in
subdirectories named by the Xen's logical cpu number, e.g.:
/sys/devices/system/xen_cpu/xen_cpu#/
What: /sys/devices/system/xen_cpu/xen_cpu#/online
Date: May 2012
Contact: Liu, Jinsong <jinsong.liu@intel.com>
Description:
Interface to online/offline Xen physical cpus
When running under Xen platform, it provide user interface
to online/offline physical cpus, except cpu0 due to several
logic restrictions and assumptions.

View File

@ -48,6 +48,7 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
#include <xen/interface/platform.h>
#include <xen/interface/xen-mca.h>
/*
* The hypercall asms have to meet several constraints:
@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
}
static inline int
HYPERVISOR_mca(struct xen_mc *mc_op)
{
mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
return _hypercall1(int, mca, mc_op);
}
static inline int
HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
{

View File

@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
int mce_disabled __read_mostly;
#define MISC_MCELOG_MINOR 227
#define SPINUNIT 100 /* 100ns */
atomic_t mce_entry;
@ -2346,7 +2344,7 @@ static __init int mcheck_init_device(void)
return err;
}
device_initcall(mcheck_init_device);
device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.

View File

@ -759,4 +759,24 @@ static __init int threshold_init_device(void)
return 0;
}
device_initcall(threshold_init_device);
/*
* there are 3 funcs which need to be _initcalled in a logic sequence:
* 1. xen_late_init_mcelog
* 2. mcheck_init_device
* 3. threshold_init_device
*
* xen_late_init_mcelog must register xen_mce_chrdev_device before
* native mce_chrdev_device registration if running under xen platform;
*
* mcheck_init_device should be inited before threshold_init_device to
* initialize mce_device, otherwise a NULL ptr dereference will cause panic.
*
* so we use following _initcalls
* 1. device_initcall(xen_late_init_mcelog);
* 2. device_initcall_sync(mcheck_init_device);
* 3. late_initcall(threshold_init_device);
*
* when running under xen, the initcall order is 1,2,3;
* on baremetal, we skip 1 and we do only 2 and 3.
*/
late_initcall(threshold_init_device);

View File

@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/syscore_ops.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
@ -38,6 +39,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
struct tls_descs {
struct desc_struct desc[3];
};
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
* changed. Since Xen writes different descriptors than the one
* passed in the update_descriptor hypercall we keep shadow copies to
* compare against.
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
(1 << X86_FEATURE_MCA) | /* disable MCA */
(1 << X86_FEATURE_MTRR) | /* disable MTRR */
~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG();
}
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
return d1->a == d2->a && d1->b == d2->b;
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
struct multicall_space mc = __xen_mc_entry(0);
struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
struct desc_struct *gdt;
xmaddr_t maddr;
struct multicall_space mc;
if (desc_equal(shadow, &t->tls_array[i]))
return;
*shadow = t->tls_array[i];
gdt = get_cpu_gdt_table(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
/*
* Look for known traps using IST, and substitute them
* appropriately. The debugger ones are the only ones we care
* about. Xen will handle faults like double_fault and
* machine_check, so we should never see them. Warn if
* about. Xen will handle faults like double_fault,
* so we should never see them. Warn if
* there's an unexpected IST-using fault handler.
*/
if (addr == (unsigned long)debug)
@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
return 0;
#ifdef CONFIG_X86_MCE
} else if (addr == (unsigned long)machine_check) {
return 0;
/*
* when xen hypervisor inject vMCE to guest,
* use native mce handler to handle it
*/
;
#endif
} else {
/* Some other trap using IST? */
@ -1437,17 +1470,142 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
static int init_hvm_pv_info(int *major, int *minor)
#ifdef CONFIG_XEN_PVHVM
/*
* The pfn containing the shared_info is located somewhere in RAM. This
* will cause trouble if the current kernel is doing a kexec boot into a
* new kernel. The new kernel (and its startup code) can not know where
* the pfn is, so it can not reserve the page. The hypervisor will
* continue to update the pfn, and as a result memory corruption occours
* in the new kernel.
*
* One way to work around this issue is to allocate a page in the
* xen-platform pci device's BAR memory range. But pci init is done very
* late and the shared_info page is already in use very early to read
* the pvclock. So moving the pfn from RAM to MMIO is racy because some
* code paths on other vcpus could access the pfn during the small
* window when the old pfn is moved to the new pfn. There is even a
* small window were the old pfn is not backed by a mfn, and during that
* time all reads return -1.
*
* Because it is not known upfront where the MMIO region is located it
* can not be used right from the start in xen_hvm_init_shared_info.
*
* To minimise trouble the move of the pfn is done shortly before kexec.
* This does not eliminate the race because all vcpus are still online
* when the syscore_ops will be called. But hopefully there is no work
* pending at this point in time. Also the syscore_op is run last which
* reduces the risk further.
*/
static struct shared_info *xen_hvm_shared_info;
static void xen_hvm_connect_shared_info(unsigned long pfn)
{
struct xen_add_to_physmap xatp;
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
xatp.gpfn = pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
}
static void xen_hvm_set_shared_info(struct shared_info *sip)
{
int cpu;
HYPERVISOR_shared_info = sip;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
* HVM.
* When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
* online but xen_hvm_set_shared_info is run at resume time too and
* in that case multiple vcpus might be online. */
for_each_online_cpu(cpu) {
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
}
/* Reconnect the shared_info pfn to a mfn */
void xen_hvm_resume_shared_info(void)
{
xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
}
#ifdef CONFIG_KEXEC
static struct shared_info *xen_hvm_shared_info_kexec;
static unsigned long xen_hvm_shared_info_pfn_kexec;
/* Remember a pfn in MMIO space for kexec reboot */
void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
{
xen_hvm_shared_info_kexec = sip;
xen_hvm_shared_info_pfn_kexec = pfn;
}
static void xen_hvm_syscore_shutdown(void)
{
struct xen_memory_reservation reservation = {
.domid = DOMID_SELF,
.nr_extents = 1,
};
unsigned long prev_pfn;
int rc;
if (!xen_hvm_shared_info_kexec)
return;
prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
set_xen_guest_handle(reservation.extent_start, &prev_pfn);
/* Move pfn to MMIO, disconnects previous pfn from mfn */
xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
/* Update pointers, following hypercall is also a memory barrier */
xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
/* Allocate new mfn for previous pfn */
do {
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
if (rc == 0)
msleep(123);
} while (rc == 0);
/* Make sure the previous pfn is really connected to a (new) mfn */
BUG_ON(rc != 1);
}
static struct syscore_ops xen_hvm_syscore_ops = {
.shutdown = xen_hvm_syscore_shutdown,
};
#endif
/* Use a pfn in RAM, may move to MMIO before kexec. */
static void __init xen_hvm_init_shared_info(void)
{
/* Remember pointer for resume */
xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
xen_hvm_set_shared_info(xen_hvm_shared_info);
}
static void __init init_hvm_pv_info(void)
{
int major, minor;
uint32_t eax, ebx, ecx, edx, pages, msr, base;
u64 pfn;
base = xen_cpuid_base();
cpuid(base + 1, &eax, &ebx, &ecx, &edx);
*major = eax >> 16;
*minor = eax & 0xffff;
printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
major = eax >> 16;
minor = eax & 0xffff;
printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
cpuid(base + 2, &pages, &msr, &ecx, &edx);
@ -1459,42 +1617,8 @@ static int init_hvm_pv_info(int *major, int *minor)
pv_info.name = "Xen HVM";
xen_domain_type = XEN_HVM_DOMAIN;
return 0;
}
void __ref xen_hvm_init_shared_info(void)
{
int cpu;
struct xen_add_to_physmap xatp;
static struct shared_info *shared_info_page = 0;
if (!shared_info_page)
shared_info_page = (struct shared_info *)
extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
* HVM.
* When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
* online but xen_hvm_init_shared_info is run at resume time too and
* in that case multiple vcpus might be online. */
for_each_online_cpu(cpu) {
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
}
#ifdef CONFIG_XEN_PVHVM
static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
static void __init xen_hvm_guest_init(void)
{
int r;
int major, minor;
r = init_hvm_pv_info(&major, &minor);
if (r < 0)
return;
init_hvm_pv_info();
xen_hvm_init_shared_info();
#ifdef CONFIG_KEXEC
register_syscore_ops(&xen_hvm_syscore_ops);
#endif
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;

View File

@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
if (!xen_batched_set_pte(ptep, pteval))
native_set_pte(ptep, pteval);
if (!xen_batched_set_pte(ptep, pteval)) {
/*
* Could call native_set_pte() here and trap and
* emulate the PTE write but with 32-bit guests this
* needs two traps (one for each of the two 32-bit
* words in the PTE) so do one hypercall directly
* instead.
*/
struct mmu_update u;
u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
u.val = pte_val_ma(pteval);
HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}
}
static void xen_set_pte(pte_t *ptep, pte_t pteval)
@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
}
#endif /* CONFIG_X86_64 */
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
/*
* Init-time set_pte while constructing initial pagetables, which
* doesn't allow RO page table pages to be remapped RW.
*
* If there is no MFN for this PFN then this page is initially
* ballooned out so clear the PTE (as in decrease_reservation() in
* drivers/xen/balloon.c).
*
* Many of these PTE updates are done on unpinned and writable pages
* and doing a hypercall for these is unnecessary and expensive. At
* this point it is not possible to tell if a page is pinned or not,
* so always write the PTE directly and rely on Xen trapping and
* emulating any updates as necessary.
*/
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
pte = mask_rw_pte(ptep, pte);
if (pte_mfn(pte) != INVALID_P2M_ENTRY)
pte = mask_rw_pte(ptep, pte);
else
pte = __pte_ma(0);
xen_set_pte(ptep, pte);
native_set_pte(ptep, pte);
}
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)

View File

@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
unsigned long credits = credits_left;
unsigned long s_pfn;
unsigned long e_pfn;
unsigned long pfns;
long capacity;
if (credits <= 0)
if (credits_left <= 0)
break;
if (entry->type != E820_RAM)
continue;
e_pfn = PFN_UP(entry->addr + entry->size);
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after the xen_start_info->nr_pages */
if (e_pfn <= max_pfn)
continue;
s_pfn = PFN_DOWN(entry->addr);
s_pfn = PFN_UP(entry->addr);
/* If the E820 falls within the nr_pages, we want to start
* at the nr_pages PFN.
* If that would mean going past the E820 entry, skip it
@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
capacity = e_pfn - max_pfn;
dest_pfn = max_pfn;
} else {
/* last_pfn MUST be within E820_RAM regions */
if (*last_pfn && e_pfn >= *last_pfn)
s_pfn = *last_pfn;
capacity = e_pfn - s_pfn;
dest_pfn = s_pfn;
}
/* If we had filled this E820_RAM entry, go to the next one. */
if (capacity <= 0)
continue;
if (credits > capacity)
credits = capacity;
if (credits_left < capacity)
capacity = credits_left;
pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
done += pfns;
credits_left -= pfns;
*last_pfn = (dest_pfn + pfns);
if (pfns < capacity)
break;
credits_left -= pfns;
}
return done;
}

View File

@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
xen_hvm_init_shared_info();
xen_hvm_resume_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {

View File

@ -41,7 +41,7 @@ void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
void xen_hvm_resume_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);

View File

@ -209,11 +209,10 @@ static int xen_hvm_console_init(void)
info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
if (!info)
return -ENOMEM;
}
/* already configured */
if (info->intf != NULL)
} else if (info->intf != NULL) {
/* already configured */
return 0;
}
/*
* If the toolstack (or the hypervisor) hasn't set these values, the
* default value is 0. Even though mfn = 0 and evtchn = 0 are
@ -259,12 +258,10 @@ static int xen_pv_console_init(void)
info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
if (!info)
return -ENOMEM;
}
/* already configured */
if (info->intf != NULL)
} else if (info->intf != NULL) {
/* already configured */
return 0;
}
info->evtchn = xen_start_info->console.domU.evtchn;
info->intf = mfn_to_virt(xen_start_info->console.domU.mfn);
info->vtermno = HVC_COOKIE;

View File

@ -196,4 +196,12 @@ config XEN_ACPI_PROCESSOR
called xen_acpi_processor If you do not know what to choose, select
M here. If the CPUFREQ drivers are built in, select Y here.
config XEN_MCE_LOG
bool "Xen platform mcelog"
depends on XEN_DOM0 && X86_64 && X86_MCE
default n
help
Allow kernel fetching MCE error from Xen platform and
converting it into Linux mcelog format for mcelog tools
endmenu

View File

@ -17,7 +17,9 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PVHVM) += platform-pci.o
obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pcpu.o
obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o
obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o

414
drivers/xen/mcelog.c Normal file
View File

@ -0,0 +1,414 @@
/******************************************************************************
* mcelog.c
* Driver for receiving and transferring machine check error infomation
*
* Copyright (c) 2012 Intel Corporation
* Author: Liu, Jinsong <jinsong.liu@intel.com>
* Author: Jiang, Yunhong <yunhong.jiang@intel.com>
* Author: Ke, Liping <liping.ke@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <xen/interface/xen.h>
#include <xen/events.h>
#include <xen/interface/vcpu.h>
#include <xen/xen.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#define XEN_MCELOG "xen_mcelog: "
static struct mc_info g_mi;
static struct mcinfo_logical_cpu *g_physinfo;
static uint32_t ncpus;
static DEFINE_MUTEX(mcelog_lock);
static struct xen_mce_log xen_mcelog = {
.signature = XEN_MCE_LOG_SIGNATURE,
.len = XEN_MCE_LOG_LEN,
.recordlen = sizeof(struct xen_mce),
};
static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock);
static int xen_mce_chrdev_open_count; /* #times opened */
static int xen_mce_chrdev_open_exclu; /* already open exclusive? */
static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait);
static int xen_mce_chrdev_open(struct inode *inode, struct file *file)
{
spin_lock(&xen_mce_chrdev_state_lock);
if (xen_mce_chrdev_open_exclu ||
(xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
spin_unlock(&xen_mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
xen_mce_chrdev_open_exclu = 1;
xen_mce_chrdev_open_count++;
spin_unlock(&xen_mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
static int xen_mce_chrdev_release(struct inode *inode, struct file *file)
{
spin_lock(&xen_mce_chrdev_state_lock);
xen_mce_chrdev_open_count--;
xen_mce_chrdev_open_exclu = 0;
spin_unlock(&xen_mce_chrdev_state_lock);
return 0;
}
static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf,
size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned num;
int i, err;
mutex_lock(&mcelog_lock);
num = xen_mcelog.next;
/* Only supports full reads right now */
err = -EINVAL;
if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce))
goto out;
err = 0;
for (i = 0; i < num; i++) {
struct xen_mce *m = &xen_mcelog.entry[i];
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
}
memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce));
xen_mcelog.next = 0;
if (err)
err = -EFAULT;
out:
mutex_unlock(&mcelog_lock);
return err ? err : buf - ubuf;
}
static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &xen_mce_chrdev_wait, wait);
if (xen_mcelog.next)
return POLLIN | POLLRDNORM;
return 0;
}
static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd,
unsigned long arg)
{
int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct xen_mce), p);
case MCE_GET_LOG_LEN:
return put_user(XEN_MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = xen_mcelog.flags;
} while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags);
return put_user(flags, p);
}
default:
return -ENOTTY;
}
}
static const struct file_operations xen_mce_chrdev_ops = {
.open = xen_mce_chrdev_open,
.release = xen_mce_chrdev_release,
.read = xen_mce_chrdev_read,
.poll = xen_mce_chrdev_poll,
.unlocked_ioctl = xen_mce_chrdev_ioctl,
.llseek = no_llseek,
};
static struct miscdevice xen_mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&xen_mce_chrdev_ops,
};
/*
* Caller should hold the mcelog_lock
*/
static void xen_mce_log(struct xen_mce *mce)
{
unsigned entry;
entry = xen_mcelog.next;
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= XEN_MCE_LOG_LEN) {
set_bit(XEN_MCE_OVERFLOW,
(unsigned long *)&xen_mcelog.flags);
return;
}
memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce));
xen_mcelog.next++;
}
static int convert_log(struct mc_info *mi)
{
struct mcinfo_common *mic;
struct mcinfo_global *mc_global;
struct mcinfo_bank *mc_bank;
struct xen_mce m;
uint32_t i;
mic = NULL;
x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
if (unlikely(!mic)) {
pr_warning(XEN_MCELOG "Failed to find global error info\n");
return -ENODEV;
}
memset(&m, 0, sizeof(struct xen_mce));
mc_global = (struct mcinfo_global *)mic;
m.mcgstatus = mc_global->mc_gstatus;
m.apicid = mc_global->mc_apicid;
for (i = 0; i < ncpus; i++)
if (g_physinfo[i].mc_apicid == m.apicid)
break;
if (unlikely(i == ncpus)) {
pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n",
m.apicid);
return -ENODEV;
}
m.socketid = g_physinfo[i].mc_chipid;
m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value;
mic = NULL;
x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
if (unlikely(!mic)) {
pr_warning(XEN_MCELOG "Fail to find bank error info\n");
return -ENODEV;
}
do {
if ((!mic) || (mic->size == 0) ||
(mic->type != MC_TYPE_GLOBAL &&
mic->type != MC_TYPE_BANK &&
mic->type != MC_TYPE_EXTENDED &&
mic->type != MC_TYPE_RECOVERY))
break;
if (mic->type == MC_TYPE_BANK) {
mc_bank = (struct mcinfo_bank *)mic;
m.misc = mc_bank->mc_misc;
m.status = mc_bank->mc_status;
m.addr = mc_bank->mc_addr;
m.tsc = mc_bank->mc_tsc;
m.bank = mc_bank->mc_bank;
m.finished = 1;
/*log this record*/
xen_mce_log(&m);
}
mic = x86_mcinfo_next(mic);
} while (1);
return 0;
}
static int mc_queue_handle(uint32_t flags)
{
struct xen_mc mc_op;
int ret = 0;
mc_op.cmd = XEN_MC_fetch;
mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi);
do {
mc_op.u.mc_fetch.flags = flags;
ret = HYPERVISOR_mca(&mc_op);
if (ret) {
pr_err(XEN_MCELOG "Failed to fetch %s error log\n",
(flags == XEN_MC_URGENT) ?
"urgnet" : "nonurgent");
break;
}
if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
break;
else {
ret = convert_log(&g_mi);
if (ret)
pr_warning(XEN_MCELOG
"Failed to convert this error log, "
"continue acking it anyway\n");
mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK;
ret = HYPERVISOR_mca(&mc_op);
if (ret) {
pr_err(XEN_MCELOG
"Failed to ack previous error log\n");
break;
}
}
} while (1);
return ret;
}
/* virq handler for machine check error info*/
static void xen_mce_work_fn(struct work_struct *work)
{
int err;
mutex_lock(&mcelog_lock);
/* urgent mc_info */
err = mc_queue_handle(XEN_MC_URGENT);
if (err)
pr_err(XEN_MCELOG
"Failed to handle urgent mc_info queue, "
"continue handling nonurgent mc_info queue anyway.\n");
/* nonurgent mc_info */
err = mc_queue_handle(XEN_MC_NONURGENT);
if (err)
pr_err(XEN_MCELOG
"Failed to handle nonurgent mc_info queue.\n");
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&xen_mce_chrdev_wait);
mutex_unlock(&mcelog_lock);
}
static DECLARE_WORK(xen_mce_work, xen_mce_work_fn);
static irqreturn_t xen_mce_interrupt(int irq, void *dev_id)
{
schedule_work(&xen_mce_work);
return IRQ_HANDLED;
}
static int bind_virq_for_mce(void)
{
int ret;
struct xen_mc mc_op;
memset(&mc_op, 0, sizeof(struct xen_mc));
/* Fetch physical CPU Numbers */
mc_op.cmd = XEN_MC_physcpuinfo;
mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
ret = HYPERVISOR_mca(&mc_op);
if (ret) {
pr_err(XEN_MCELOG "Failed to get CPU numbers\n");
return ret;
}
/* Fetch each CPU Physical Info for later reference*/
ncpus = mc_op.u.mc_physcpuinfo.ncpus;
g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu),
GFP_KERNEL);
if (!g_physinfo)
return -ENOMEM;
set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
ret = HYPERVISOR_mca(&mc_op);
if (ret) {
pr_err(XEN_MCELOG "Failed to get CPU info\n");
kfree(g_physinfo);
return ret;
}
ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
xen_mce_interrupt, 0, "mce", NULL);
if (ret < 0) {
pr_err(XEN_MCELOG "Failed to bind virq\n");
kfree(g_physinfo);
return ret;
}
return 0;
}
static int __init xen_late_init_mcelog(void)
{
/* Only DOM0 is responsible for MCE logging */
if (xen_initial_domain()) {
/* register character device /dev/mcelog for xen mcelog */
if (misc_register(&xen_mce_chrdev_device))
return -ENODEV;
return bind_virq_for_mce();
}
return -ENODEV;
}
device_initcall(xen_late_init_mcelog);

371
drivers/xen/pcpu.c Normal file
View File

@ -0,0 +1,371 @@
/******************************************************************************
* pcpu.c
* Management physical cpu in dom0, get pcpu info and provide sys interface
*
* Copyright (c) 2012 Intel Corporation
* Author: Liu, Jinsong <jinsong.liu@intel.com>
* Author: Jiang, Yunhong <yunhong.jiang@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/cpu.h>
#include <linux/stat.h>
#include <linux/capability.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/interface/platform.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#define XEN_PCPU "xen_cpu: "
/*
* @cpu_id: Xen physical cpu logic number
* @flags: Xen physical cpu status flag
* - XEN_PCPU_FLAGS_ONLINE: cpu is online
* - XEN_PCPU_FLAGS_INVALID: cpu is not present
*/
struct pcpu {
struct list_head list;
struct device dev;
uint32_t cpu_id;
uint32_t flags;
};
static struct bus_type xen_pcpu_subsys = {
.name = "xen_cpu",
.dev_name = "xen_cpu",
};
static DEFINE_MUTEX(xen_pcpu_lock);
static LIST_HEAD(xen_pcpus);
static int xen_pcpu_down(uint32_t cpu_id)
{
struct xen_platform_op op = {
.cmd = XENPF_cpu_offline,
.interface_version = XENPF_INTERFACE_VERSION,
.u.cpu_ol.cpuid = cpu_id,
};
return HYPERVISOR_dom0_op(&op);
}
static int xen_pcpu_up(uint32_t cpu_id)
{
struct xen_platform_op op = {
.cmd = XENPF_cpu_online,
.interface_version = XENPF_INTERFACE_VERSION,
.u.cpu_ol.cpuid = cpu_id,
};
return HYPERVISOR_dom0_op(&op);
}
static ssize_t show_online(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct pcpu *cpu = container_of(dev, struct pcpu, dev);
return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
}
static ssize_t __ref store_online(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
unsigned long long val;
ssize_t ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (kstrtoull(buf, 0, &val) < 0)
return -EINVAL;
switch (val) {
case 0:
ret = xen_pcpu_down(pcpu->cpu_id);
break;
case 1:
ret = xen_pcpu_up(pcpu->cpu_id);
break;
default:
ret = -EINVAL;
}
if (ret >= 0)
ret = count;
return ret;
}
static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
static bool xen_pcpu_online(uint32_t flags)
{
return !!(flags & XEN_PCPU_FLAGS_ONLINE);
}
static void pcpu_online_status(struct xenpf_pcpuinfo *info,
struct pcpu *pcpu)
{
if (xen_pcpu_online(info->flags) &&
!xen_pcpu_online(pcpu->flags)) {
/* the pcpu is onlined */
pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
} else if (!xen_pcpu_online(info->flags) &&
xen_pcpu_online(pcpu->flags)) {
/* The pcpu is offlined */
pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
}
}
static struct pcpu *get_pcpu(uint32_t cpu_id)
{
struct pcpu *pcpu;
list_for_each_entry(pcpu, &xen_pcpus, list) {
if (pcpu->cpu_id == cpu_id)
return pcpu;
}
return NULL;
}
static void pcpu_release(struct device *dev)
{
struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
list_del(&pcpu->list);
kfree(pcpu);
}
static void unregister_and_remove_pcpu(struct pcpu *pcpu)
{
struct device *dev;
if (!pcpu)
return;
dev = &pcpu->dev;
if (dev->id)
device_remove_file(dev, &dev_attr_online);
/* pcpu remove would be implicitly done */
device_unregister(dev);
}
static int register_pcpu(struct pcpu *pcpu)
{
struct device *dev;
int err = -EINVAL;
if (!pcpu)
return err;
dev = &pcpu->dev;
dev->bus = &xen_pcpu_subsys;
dev->id = pcpu->cpu_id;
dev->release = pcpu_release;
err = device_register(dev);
if (err) {
pcpu_release(dev);
return err;
}
/*
* Xen never offline cpu0 due to several restrictions
* and assumptions. This basically doesn't add a sys control
* to user, one cannot attempt to offline BSP.
*/
if (dev->id) {
err = device_create_file(dev, &dev_attr_online);
if (err) {
device_unregister(dev);
return err;
}
}
return 0;
}
static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
{
struct pcpu *pcpu;
int err;
if (info->flags & XEN_PCPU_FLAGS_INVALID)
return ERR_PTR(-ENODEV);
pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
if (!pcpu)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&pcpu->list);
pcpu->cpu_id = info->xen_cpuid;
pcpu->flags = info->flags;
/* Need hold on xen_pcpu_lock before pcpu list manipulations */
list_add_tail(&pcpu->list, &xen_pcpus);
err = register_pcpu(pcpu);
if (err) {
pr_warning(XEN_PCPU "Failed to register pcpu%u\n",
info->xen_cpuid);
return ERR_PTR(-ENOENT);
}
return pcpu;
}
/*
* Caller should hold the xen_pcpu_lock
*/
static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu)
{
int ret;
struct pcpu *pcpu = NULL;
struct xenpf_pcpuinfo *info;
struct xen_platform_op op = {
.cmd = XENPF_get_cpuinfo,
.interface_version = XENPF_INTERFACE_VERSION,
.u.pcpu_info.xen_cpuid = cpu,
};
ret = HYPERVISOR_dom0_op(&op);
if (ret)
return ret;
info = &op.u.pcpu_info;
if (max_cpu)
*max_cpu = info->max_present;
pcpu = get_pcpu(cpu);
/*
* Only those at cpu present map has its sys interface.
*/
if (info->flags & XEN_PCPU_FLAGS_INVALID) {
if (pcpu)
unregister_and_remove_pcpu(pcpu);
return 0;
}
if (!pcpu) {
pcpu = create_and_register_pcpu(info);
if (IS_ERR_OR_NULL(pcpu))
return -ENODEV;
} else
pcpu_online_status(info, pcpu);
return 0;
}
/*
* Sync dom0's pcpu information with xen hypervisor's
*/
static int xen_sync_pcpus(void)
{
/*
* Boot cpu always have cpu_id 0 in xen
*/
uint32_t cpu = 0, max_cpu = 0;
int err = 0;
struct pcpu *pcpu, *tmp;
mutex_lock(&xen_pcpu_lock);
while (!err && (cpu <= max_cpu)) {
err = sync_pcpu(cpu, &max_cpu);
cpu++;
}
if (err)
list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list)
unregister_and_remove_pcpu(pcpu);
mutex_unlock(&xen_pcpu_lock);
return err;
}
static void xen_pcpu_work_fn(struct work_struct *work)
{
xen_sync_pcpus();
}
static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn);
static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
{
schedule_work(&xen_pcpu_work);
return IRQ_HANDLED;
}
static int __init xen_pcpu_init(void)
{
int irq, ret;
if (!xen_initial_domain())
return -ENODEV;
irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
xen_pcpu_interrupt, 0,
"xen-pcpu", NULL);
if (irq < 0) {
pr_warning(XEN_PCPU "Failed to bind pcpu virq\n");
return irq;
}
ret = subsys_system_register(&xen_pcpu_subsys, NULL);
if (ret) {
pr_warning(XEN_PCPU "Failed to register pcpu subsys\n");
goto err1;
}
ret = xen_sync_pcpus();
if (ret) {
pr_warning(XEN_PCPU "Failed to sync pcpu info\n");
goto err2;
}
return 0;
err2:
bus_unregister(&xen_pcpu_subsys);
err1:
unbind_from_irqhandler(irq, NULL);
return ret;
}
arch_initcall(xen_pcpu_init);

View File

@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev)
return 0;
}
static void __devinit prepare_shared_info(void)
{
#ifdef CONFIG_KEXEC
unsigned long addr;
struct shared_info *hvm_shared_info;
addr = alloc_xen_mmio(PAGE_SIZE);
hvm_shared_info = ioremap(addr, PAGE_SIZE);
memset(hvm_shared_info, 0, PAGE_SIZE);
xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT);
#endif
}
static int __devinit platform_pci_init(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
@ -109,6 +122,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
long mmio_addr, mmio_len;
unsigned int max_nr_gframes;
if (!xen_domain())
return -ENODEV;
i = pci_enable_device(pdev);
if (i)
return i;
@ -135,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
platform_mmio = mmio_addr;
platform_mmiolen = mmio_len;
prepare_shared_info();
if (!xen_have_vector_callback) {
ret = xen_allocate_irq(pdev);
if (ret) {

View File

@ -520,15 +520,18 @@ static int __init xen_acpi_processor_init(void)
if (!pr_backup) {
pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
if (pr_backup)
memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
}
(void)upload_pm_data(_pr);
}
rc = check_acpi_ids(pr_backup);
if (rc)
goto err_unregister;
kfree(pr_backup);
pr_backup = NULL;
if (rc)
goto err_unregister;
return 0;
err_unregister:

View File

@ -618,6 +618,23 @@ static struct xenbus_watch *find_watch(const char *token)
return NULL;
}
static void xs_reset_watches(void)
{
int err, supported = 0;
if (!xen_hvm_domain())
return;
err = xenbus_scanf(XBT_NIL, "control",
"platform-feature-xs_reset_watches", "%d", &supported);
if (err != 1 || !supported)
return;
err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
if (err && err != -EEXIST)
printk(KERN_WARNING "xs_reset_watches failed: %d\n", err);
}
/* Register callback to watch this node. */
int register_xenbus_watch(struct xenbus_watch *watch)
{
@ -900,5 +917,8 @@ int xs_init(void)
if (IS_ERR(task))
return PTR_ERR(task);
/* shutdown watches for kexec boot */
xs_reset_watches();
return 0;
}

View File

@ -35,6 +35,7 @@
#define MPT_MINOR 220
#define MPT2SAS_MINOR 221
#define UINPUT_MINOR 223
#define MISC_MCELOG_MINOR 227
#define HPET_MINOR 228
#define FUSE_MINOR 229
#define KVM_MINOR 232

View File

@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq);
void xen_irq_resume(void);
void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn);
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq);
void xen_set_irq_pending(int irq);

View File

@ -29,7 +29,8 @@ enum xsd_sockmsg_type
XS_IS_DOMAIN_INTRODUCED,
XS_RESUME,
XS_SET_TARGET,
XS_RESTRICT
XS_RESTRICT,
XS_RESET_WATCHES,
};
#define XS_WRITE_NONE "NONE"

View File

@ -314,6 +314,13 @@ struct xenpf_pcpuinfo {
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo);
#define XENPF_cpu_online 56
#define XENPF_cpu_offline 57
struct xenpf_cpu_ol {
uint32_t cpuid;
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol);
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@ -330,6 +337,7 @@ struct xen_platform_op {
struct xenpf_getidletime getidletime;
struct xenpf_set_processor_pminfo set_pminfo;
struct xenpf_pcpuinfo pcpu_info;
struct xenpf_cpu_ol cpu_ol;
uint8_t pad[128];
} u;
};

View File

@ -0,0 +1,385 @@
/******************************************************************************
* arch-x86/mca.h
* Guest OS machine check interface to x86 Xen.
*
* Contributed by Advanced Micro Devices, Inc.
* Author: Christoph Egger <Christoph.Egger@amd.com>
*
* Updated by Intel Corporation
* Author: Liu, Jinsong <jinsong.liu@intel.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
#define __XEN_PUBLIC_ARCH_X86_MCA_H__
/* Hypercall */
#define __HYPERVISOR_mca __HYPERVISOR_arch_0
#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */
#define XEN_MC_NONURGENT 0x1
/* IN: Dom0 calls hypercall to retrieve urgent error log entry */
#define XEN_MC_URGENT 0x2
/* IN: Dom0 acknowledges previosly-fetched error log entry */
#define XEN_MC_ACK 0x4
/* OUT: All is ok */
#define XEN_MC_OK 0x0
/* OUT: Domain could not fetch data. */
#define XEN_MC_FETCHFAILED 0x1
/* OUT: There was no machine check data to fetch. */
#define XEN_MC_NODATA 0x2
#ifndef __ASSEMBLY__
/* vIRQ injected to Dom0 */
#define VIRQ_MCA VIRQ_ARCH_0
/*
* mc_info entry types
* mca machine check info are recorded in mc_info entries.
* when fetch mca info, it can use MC_TYPE_... to distinguish
* different mca info.
*/
#define MC_TYPE_GLOBAL 0
#define MC_TYPE_BANK 1
#define MC_TYPE_EXTENDED 2
#define MC_TYPE_RECOVERY 3
struct mcinfo_common {
uint16_t type; /* structure type */
uint16_t size; /* size of this struct in bytes */
};
#define MC_FLAG_CORRECTABLE (1 << 0)
#define MC_FLAG_UNCORRECTABLE (1 << 1)
#define MC_FLAG_RECOVERABLE (1 << 2)
#define MC_FLAG_POLLED (1 << 3)
#define MC_FLAG_RESET (1 << 4)
#define MC_FLAG_CMCI (1 << 5)
#define MC_FLAG_MCE (1 << 6)
/* contains x86 global mc information */
struct mcinfo_global {
struct mcinfo_common common;
uint16_t mc_domid; /* running domain at the time in error */
uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
uint32_t mc_socketid; /* physical socket of the physical core */
uint16_t mc_coreid; /* physical impacted core */
uint16_t mc_core_threadid; /* core thread of physical core */
uint32_t mc_apicid;
uint32_t mc_flags;
uint64_t mc_gstatus; /* global status */
};
/* contains x86 bank mc information */
struct mcinfo_bank {
struct mcinfo_common common;
uint16_t mc_bank; /* bank nr */
uint16_t mc_domid; /* domain referenced by mc_addr if valid */
uint64_t mc_status; /* bank status */
uint64_t mc_addr; /* bank address */
uint64_t mc_misc;
uint64_t mc_ctrl2;
uint64_t mc_tsc;
};
struct mcinfo_msr {
uint64_t reg; /* MSR */
uint64_t value; /* MSR value */
};
/* contains mc information from other or additional mc MSRs */
struct mcinfo_extended {
struct mcinfo_common common;
uint32_t mc_msrs; /* Number of msr with valid values. */
/*
* Currently Intel extended MSR (32/64) include all gp registers
* and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
* useful at present. So expand this array to 16/32 to leave room.
*/
struct mcinfo_msr mc_msr[sizeof(void *) * 4];
};
/* Recovery Action flags. Giving recovery result information to DOM0 */
/* Xen takes successful recovery action, the error is recovered */
#define REC_ACTION_RECOVERED (0x1 << 0)
/* No action is performed by XEN */
#define REC_ACTION_NONE (0x1 << 1)
/* It's possible DOM0 might take action ownership in some case */
#define REC_ACTION_NEED_RESET (0x1 << 2)
/*
* Different Recovery Action types, if the action is performed successfully,
* REC_ACTION_RECOVERED flag will be returned.
*/
/* Page Offline Action */
#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
/* CPU offline Action */
#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
/* L3 cache disable Action */
#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
/*
* Below interface used between XEN/DOM0 for passing XEN's recovery action
* information to DOM0.
*/
struct page_offline_action {
/* Params for passing the offlined page number to DOM0 */
uint64_t mfn;
uint64_t status;
};
struct cpu_offline_action {
/* Params for passing the identity of the offlined CPU to DOM0 */
uint32_t mc_socketid;
uint16_t mc_coreid;
uint16_t mc_core_threadid;
};
#define MAX_UNION_SIZE 16
struct mcinfo_recovery {
struct mcinfo_common common;
uint16_t mc_bank; /* bank nr */
uint8_t action_flags;
uint8_t action_types;
union {
struct page_offline_action page_retire;
struct cpu_offline_action cpu_offline;
uint8_t pad[MAX_UNION_SIZE];
} action_info;
};
#define MCINFO_MAXSIZE 768
struct mc_info {
/* Number of mcinfo_* entries in mi_data */
uint32_t mi_nentries;
uint32_t flags;
uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
};
DEFINE_GUEST_HANDLE_STRUCT(mc_info);
#define __MC_MSR_ARRAYSIZE 8
#define __MC_MSR_MCGCAP 0
#define __MC_NMSRS 1
#define MC_NCAPS 7
struct mcinfo_logical_cpu {
uint32_t mc_cpunr;
uint32_t mc_chipid;
uint16_t mc_coreid;
uint16_t mc_threadid;
uint32_t mc_apicid;
uint32_t mc_clusterid;
uint32_t mc_ncores;
uint32_t mc_ncores_active;
uint32_t mc_nthreads;
uint32_t mc_cpuid_level;
uint32_t mc_family;
uint32_t mc_vendor;
uint32_t mc_model;
uint32_t mc_step;
char mc_vendorid[16];
char mc_brandid[64];
uint32_t mc_cpu_caps[MC_NCAPS];
uint32_t mc_cache_size;
uint32_t mc_cache_alignment;
uint32_t mc_nmsrvals;
struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
};
DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
/*
* Prototype:
* uint32_t x86_mcinfo_nentries(struct mc_info *mi);
*/
#define x86_mcinfo_nentries(_mi) \
((_mi)->mi_nentries)
/*
* Prototype:
* struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
*/
#define x86_mcinfo_first(_mi) \
((struct mcinfo_common *)(_mi)->mi_data)
/*
* Prototype:
* struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
*/
#define x86_mcinfo_next(_mic) \
((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
/*
* Prototype:
* void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
*/
static inline void x86_mcinfo_lookup(struct mcinfo_common **ret,
struct mc_info *mi, uint16_t type)
{
uint32_t i;
struct mcinfo_common *mic;
bool found = 0;
if (!ret || !mi)
return;
mic = x86_mcinfo_first(mi);
for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
if (mic->type == type) {
found = 1;
break;
}
mic = x86_mcinfo_next(mic);
}
*ret = found ? mic : NULL;
}
/*
* Fetch machine check data from hypervisor.
*/
#define XEN_MC_fetch 1
struct xen_mc_fetch {
/*
* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
* XEN_MC_ACK if ack'king an earlier fetch
* OUT: XEN_MC_OK, XEN_MC_FETCHAILED, XEN_MC_NODATA
*/
uint32_t flags;
uint32_t _pad0;
/* OUT: id for ack, IN: id we are ack'ing */
uint64_t fetch_id;
/* OUT variables. */
GUEST_HANDLE(mc_info) data;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
/*
* This tells the hypervisor to notify a DomU about the machine check error
*/
#define XEN_MC_notifydomain 2
struct xen_mc_notifydomain {
/* IN variables */
uint16_t mc_domid; /* The unprivileged domain to notify */
uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify */
/* IN/OUT variables */
uint32_t flags;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
#define XEN_MC_physcpuinfo 3
struct xen_mc_physcpuinfo {
/* IN/OUT */
uint32_t ncpus;
uint32_t _pad0;
/* OUT */
GUEST_HANDLE(mcinfo_logical_cpu) info;
};
#define XEN_MC_msrinject 4
#define MC_MSRINJ_MAXMSRS 8
struct xen_mc_msrinject {
/* IN */
uint32_t mcinj_cpunr; /* target processor id */
uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */
uint32_t mcinj_count; /* 0 .. count-1 in array are valid */
uint32_t _pad0;
struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
};
/* Flags for mcinj_flags above; bits 16-31 are reserved */
#define MC_MSRINJ_F_INTERPOSE 0x1
#define XEN_MC_mceinject 5
struct xen_mc_mceinject {
unsigned int mceinj_cpunr; /* target processor id */
};
struct xen_mc {
uint32_t cmd;
uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
union {
struct xen_mc_fetch mc_fetch;
struct xen_mc_notifydomain mc_notifydomain;
struct xen_mc_physcpuinfo mc_physcpuinfo;
struct xen_mc_msrinject mc_msrinject;
struct xen_mc_mceinject mc_mceinject;
} u;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_mc);
/* Fields are zero when not available */
struct xen_mce {
__u64 status;
__u64 misc;
__u64 addr;
__u64 mcgstatus;
__u64 ip;
__u64 tsc; /* cpu time stamp counter */
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
__u16 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
__u8 cpu; /* cpu number; obsolete; use extcpu now */
__u8 finished; /* entry is valid */
__u32 extcpu; /* linux cpu number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
};
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
* debugging tools. Each entry is only valid when its finished flag
* is set.
*/
#define XEN_MCE_LOG_LEN 32
struct xen_mce_log {
char signature[12]; /* "MACHINECHECK" */
unsigned len; /* = XEN_MCE_LOG_LEN */
unsigned next;
unsigned flags;
unsigned recordlen; /* length of struct xen_mce */
struct xen_mce entry[XEN_MCE_LOG_LEN];
};
#define XEN_MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
#define XEN_MCE_LOG_SIGNATURE "MACHINECHECK"
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
#endif /* __ASSEMBLY__ */
#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */

View File

@ -80,6 +80,7 @@
#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16