KVM: Allow host IRQ sharing for assigned PCI 2.3 devices

PCI 2.3 allows to generically disable IRQ sources at device level. This
enables us to share legacy IRQs of such devices with other host devices
when passing them to a guest.

The new IRQ sharing feature introduced here is optional, user space has
to request it explicitly. Moreover, user space can inform us about its
view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
interrupt and signaling it if the guest masked it via the virtualized
PCI config space.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
This commit is contained in:
Jan Kiszka 2012-02-28 14:19:54 +01:00 committed by Avi Kivity
parent 3e515705a1
commit 07700a94b0
5 changed files with 230 additions and 29 deletions

View file

@ -1169,6 +1169,14 @@ following flags are specified:
/* Depends on KVM_CAP_IOMMU */
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
/* The following two depend on KVM_CAP_PCI_2_3 */
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
isolation of the device. Usages not specifying this flag are deprecated.
@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it
should skip processing the bitmap and just invalidate everything. It must
be set to the number of set bits in the bitmap.
4.60 KVM_ASSIGN_SET_INTX_MASK
Capability: KVM_CAP_PCI_2_3
Architectures: x86
Type: vm ioctl
Parameters: struct kvm_assigned_pci_dev (in)
Returns: 0 on success, -1 on error
Allows userspace to mask PCI INTx interrupts from the assigned device. The
kernel will not deliver INTx interrupts to the guest between setting and
clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of
and emulation of PCI 2.3 INTx disable command register behavior.
This may be used for both PCI 2.3 devices supporting INTx disable natively and
older devices lacking this support. Userspace is responsible for emulating the
read value of the INTx disable bit in the guest visible PCI command register.
When modifying the INTx disable state, userspace should precede updating the
physical device command register by calling this ioctl to inform the kernel of
the new intended INTx mask state.
Note that the kernel uses the device INTx disable bit to internally manage the
device interrupt state for PCI 2.3 devices. Reads of this register may
therefore not match the expected value. Writes should always use the guest
intended INTx disable value rather than attempting to read-copy-update the
current physical device state. Races between user and kernel updates to the
INTx disable bit are handled lazily in the kernel. It's possible the device
may generate unintended interrupts, but they will not be injected into the
guest.
See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
evaluated.
4.62 KVM_CREATE_SPAPR_TCE
Capability: KVM_CAP_SPAPR_TCE

View file

@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:

View file

@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_TSC_DEADLINE_TIMER 72
#define KVM_CAP_S390_UCONTROL 73
#define KVM_CAP_SYNC_REGS 74
#define KVM_CAP_PCI_2_3 75
#ifdef KVM_CAP_IRQ_ROUTING
@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_TSC_CONTROL */
#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
/* Available with KVM_CAP_PCI_2_3 */
#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
struct kvm_assigned_pci_dev)
/*
* ioctls for vcpu fds
@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
struct kvm_assigned_pci_dev {
__u32 assigned_dev_id;

View file

@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
unsigned int entries_nr;
int host_irq;
bool host_irq_disabled;
bool pci_2_3;
struct msix_entry *host_msix_entries;
int guest_irq;
struct msix_entry *guest_msix_entries;
@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
struct pci_dev *dev;
struct kvm *kvm;
spinlock_t intx_lock;
struct mutex intx_mask_lock;
char irq_name[32];
struct pci_saved_state *pci_saved_state;
};

View file

@ -55,23 +55,67 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
return index;
}
static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
int ret;
spin_lock(&assigned_dev->intx_lock);
if (pci_check_and_mask_intx(assigned_dev->dev)) {
assigned_dev->host_irq_disabled = true;
ret = IRQ_WAKE_THREAD;
} else
ret = IRQ_NONE;
spin_unlock(&assigned_dev->intx_lock);
return ret;
}
static void
kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
int vector)
{
if (unlikely(assigned_dev->irq_requested_type &
KVM_DEV_IRQ_GUEST_INTX)) {
mutex_lock(&assigned_dev->intx_mask_lock);
if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
kvm_set_irq(assigned_dev->kvm,
assigned_dev->irq_source_id, vector, 1);
mutex_unlock(&assigned_dev->intx_mask_lock);
} else
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
vector, 1);
}
static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
spin_lock(&assigned_dev->intx_lock);
if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
spin_lock_irq(&assigned_dev->intx_lock);
disable_irq_nosync(irq);
assigned_dev->host_irq_disabled = true;
spin_unlock(&assigned_dev->intx_lock);
spin_unlock_irq(&assigned_dev->intx_lock);
}
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 1);
kvm_assigned_dev_raise_guest_irq(assigned_dev,
assigned_dev->guest_irq);
return IRQ_HANDLED;
}
#ifdef __KVM_HAVE_MSI
static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
kvm_assigned_dev_raise_guest_irq(assigned_dev,
assigned_dev->guest_irq);
return IRQ_HANDLED;
}
#endif
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
if (index >= 0) {
vector = assigned_dev->guest_msix_entries[index].vector;
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
vector, 1);
kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
}
return IRQ_HANDLED;
@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
/* The guest irq may be shared so this ack may be
* from another device.
*/
spin_lock(&dev->intx_lock);
if (dev->host_irq_disabled) {
enable_irq(dev->host_irq);
dev->host_irq_disabled = false;
mutex_lock(&dev->intx_mask_lock);
if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
bool reassert = false;
spin_lock_irq(&dev->intx_lock);
/*
* The guest IRQ may be shared so this ack can come from an
* IRQ for another guest device.
*/
if (dev->host_irq_disabled) {
if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
enable_irq(dev->host_irq);
else if (!pci_check_and_unmask_intx(dev->dev))
reassert = true;
dev->host_irq_disabled = reassert;
}
spin_unlock_irq(&dev->intx_lock);
if (reassert)
kvm_set_irq(dev->kvm, dev->irq_source_id,
dev->guest_irq, 1);
}
spin_unlock(&dev->intx_lock);
mutex_unlock(&dev->intx_mask_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
disable_irq(assigned_dev->host_irq);
if ((assigned_dev->irq_requested_type &
KVM_DEV_IRQ_HOST_INTX) &&
(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
spin_lock_irq(&assigned_dev->intx_lock);
pci_intx(assigned_dev->dev, false);
spin_unlock_irq(&assigned_dev->intx_lock);
synchronize_irq(assigned_dev->host_irq);
} else
disable_irq(assigned_dev->host_irq);
free_irq(assigned_dev->host_irq, assigned_dev);
@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
irq_handler_t irq_handler;
unsigned long flags;
dev->host_irq = dev->dev->irq;
/* Even though this is PCI, we don't want to use shared
* interrupts. Sharing host devices with guest-assigned devices
* on the same interrupt line is not a happy situation: there
* are going to be long delays in accepting, acking, etc.
/*
* We can only share the IRQ line with other host devices if we are
* able to disable the IRQ source at device-level - independently of
* the guest driver. Otherwise host devices may suffer from unbounded
* IRQ latencies when the guest keeps the line asserted.
*/
if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
IRQF_ONESHOT, dev->irq_name, dev))
if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
irq_handler = kvm_assigned_dev_intx;
flags = IRQF_SHARED;
} else {
irq_handler = NULL;
flags = IRQF_ONESHOT;
}
if (request_threaded_irq(dev->host_irq, irq_handler,
kvm_assigned_dev_thread_intx, flags,
dev->irq_name, dev))
return -EIO;
if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
spin_lock_irq(&dev->intx_lock);
pci_intx(dev->dev, true);
spin_unlock_irq(&dev->intx_lock);
}
return 0;
}
@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
0, dev->irq_name, dev)) {
if (request_threaded_irq(dev->host_irq, NULL,
kvm_assigned_dev_thread_msi, 0,
dev->irq_name, dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
dev->host_irq_disabled = false;
return 0;
}
#endif
@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
dev->host_irq_disabled = false;
return 0;
}
#endif
@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
default:
r = -EINVAL;
}
dev->host_irq_disabled = false;
if (!r)
dev->irq_requested_type |= host_irq_type;
@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
{
int r = -ENODEV;
struct kvm_assigned_dev_kernel *match;
unsigned long irq_type;
mutex_lock(&kvm->lock);
@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
if (!match)
goto out;
r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
KVM_DEV_IRQ_GUEST_MASK);
r = kvm_deassign_irq(kvm, match, irq_type);
out:
mutex_unlock(&kvm->lock);
return r;
@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
if (!match->pci_saved_state)
printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
__func__, dev_name(&dev->dev));
if (!pci_intx_mask_supported(dev))
assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->intx_lock);
mutex_init(&match->intx_mask_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@ -759,6 +853,55 @@ msix_entry_out:
}
#endif
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
mutex_lock(&kvm->lock);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_dev->assigned_dev_id);
if (!match) {
r = -ENODEV;
goto out;
}
mutex_lock(&match->intx_mask_lock);
match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
kvm_set_irq(match->kvm, match->irq_source_id,
match->guest_irq, 0);
/*
* Masking at hardware-level is performed on demand,
* i.e. when an IRQ actually arrives at the host.
*/
} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
/*
* Unmask the IRQ line if required. Unmasking at
* device level will be performed by user space.
*/
spin_lock_irq(&match->intx_lock);
if (match->host_irq_disabled) {
enable_irq(match->host_irq);
match->host_irq_disabled = false;
}
spin_unlock_irq(&match->intx_lock);
}
}
mutex_unlock(&match->intx_mask_lock);
out:
mutex_unlock(&kvm->lock);
return r;
}
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
case KVM_ASSIGN_SET_INTX_MASK: {
struct kvm_assigned_pci_dev assigned_dev;
r = -EFAULT;
if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
goto out;
r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
break;
}
default:
r = -ENOTTY;
break;
@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
out:
return r;
}