vfio: Add PCI device driver
Add PCI device support for VFIO. PCI devices expose regions for accessing config space, I/O port space, and MMIO areas of the device. PCI config access is virtualized in the kernel, allowing us to ensure the integrity of the system, by preventing various accesses while reducing duplicate support across various userspace drivers. I/O port supports read/write access while MMIO also supports mmap of sufficiently sized regions. Support for INTx, MSI, and MSI-X interrupts is provided using eventfds to userspace. Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
This commit is contained in:
parent
73fa0d10d0
commit
89e1f7d4c6
|
@ -12,3 +12,5 @@ menuconfig VFIO
|
|||
See Documentation/vfio.txt for more details.
|
||||
|
||||
If you don't know what to do here, say N.
|
||||
|
||||
source "drivers/vfio/pci/Kconfig"
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
config VFIO_PCI
|
||||
tristate "VFIO support for PCI devices"
|
||||
depends on VFIO && PCI && EVENTFD
|
||||
help
|
||||
Support for the PCI VFIO bus driver. This is required to make
|
||||
use of PCI drivers using the VFIO framework.
|
||||
|
||||
If you don't know what to do here, say N.
|
|
@ -0,0 +1,4 @@
|
|||
|
||||
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
|
||||
|
||||
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
|
|
@ -0,0 +1,579 @@
|
|||
/*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/eventfd.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/iommu.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/vfio.h>
|
||||
|
||||
#include "vfio_pci_private.h"
|
||||
|
||||
#define DRIVER_VERSION "0.2"
|
||||
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
|
||||
#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
|
||||
|
||||
static bool nointxmask;
|
||||
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
|
||||
MODULE_PARM_DESC(nointxmask,
|
||||
"Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
|
||||
|
||||
static int vfio_pci_enable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
int ret;
|
||||
u16 cmd;
|
||||
u8 msix_pos;
|
||||
|
||||
vdev->reset_works = (pci_reset_function(pdev) == 0);
|
||||
pci_save_state(pdev);
|
||||
vdev->pci_saved_state = pci_store_saved_state(pdev);
|
||||
if (!vdev->pci_saved_state)
|
||||
pr_debug("%s: Couldn't store %s saved state\n",
|
||||
__func__, dev_name(&pdev->dev));
|
||||
|
||||
ret = vfio_config_init(vdev);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (likely(!nointxmask))
|
||||
vdev->pci_2_3 = pci_intx_mask_supported(pdev);
|
||||
|
||||
pci_read_config_word(pdev, PCI_COMMAND, &cmd);
|
||||
if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
|
||||
cmd &= ~PCI_COMMAND_INTX_DISABLE;
|
||||
pci_write_config_word(pdev, PCI_COMMAND, cmd);
|
||||
}
|
||||
|
||||
msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
|
||||
if (msix_pos) {
|
||||
u16 flags;
|
||||
u32 table;
|
||||
|
||||
pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
|
||||
pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
|
||||
|
||||
vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
|
||||
vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
|
||||
vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
|
||||
} else
|
||||
vdev->msix_bar = 0xFF;
|
||||
|
||||
ret = pci_enable_device(pdev);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
return ret;
|
||||
|
||||
out:
|
||||
kfree(vdev->pci_saved_state);
|
||||
vdev->pci_saved_state = NULL;
|
||||
vfio_config_free(vdev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void vfio_pci_disable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
int bar;
|
||||
|
||||
pci_disable_device(vdev->pdev);
|
||||
|
||||
vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
|
||||
VFIO_IRQ_SET_ACTION_TRIGGER,
|
||||
vdev->irq_type, 0, 0, NULL);
|
||||
|
||||
vdev->virq_disabled = false;
|
||||
|
||||
vfio_config_free(vdev);
|
||||
|
||||
pci_reset_function(vdev->pdev);
|
||||
|
||||
if (pci_load_and_free_saved_state(vdev->pdev,
|
||||
&vdev->pci_saved_state) == 0)
|
||||
pci_restore_state(vdev->pdev);
|
||||
else
|
||||
pr_info("%s: Couldn't reload %s saved state\n",
|
||||
__func__, dev_name(&vdev->pdev->dev));
|
||||
|
||||
for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
|
||||
if (!vdev->barmap[bar])
|
||||
continue;
|
||||
pci_iounmap(vdev->pdev, vdev->barmap[bar]);
|
||||
pci_release_selected_regions(vdev->pdev, 1 << bar);
|
||||
vdev->barmap[bar] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void vfio_pci_release(void *device_data)
|
||||
{
|
||||
struct vfio_pci_device *vdev = device_data;
|
||||
|
||||
if (atomic_dec_and_test(&vdev->refcnt))
|
||||
vfio_pci_disable(vdev);
|
||||
|
||||
module_put(THIS_MODULE);
|
||||
}
|
||||
|
||||
static int vfio_pci_open(void *device_data)
|
||||
{
|
||||
struct vfio_pci_device *vdev = device_data;
|
||||
|
||||
if (!try_module_get(THIS_MODULE))
|
||||
return -ENODEV;
|
||||
|
||||
if (atomic_inc_return(&vdev->refcnt) == 1) {
|
||||
int ret = vfio_pci_enable(vdev);
|
||||
if (ret) {
|
||||
module_put(THIS_MODULE);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
|
||||
{
|
||||
if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
|
||||
u8 pin;
|
||||
pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
|
||||
if (pin)
|
||||
return 1;
|
||||
|
||||
} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
|
||||
u8 pos;
|
||||
u16 flags;
|
||||
|
||||
pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
|
||||
if (pos) {
|
||||
pci_read_config_word(vdev->pdev,
|
||||
pos + PCI_MSI_FLAGS, &flags);
|
||||
|
||||
return 1 << (flags & PCI_MSI_FLAGS_QMASK);
|
||||
}
|
||||
} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
|
||||
u8 pos;
|
||||
u16 flags;
|
||||
|
||||
pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
|
||||
if (pos) {
|
||||
pci_read_config_word(vdev->pdev,
|
||||
pos + PCI_MSIX_FLAGS, &flags);
|
||||
|
||||
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long vfio_pci_ioctl(void *device_data,
|
||||
unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
struct vfio_pci_device *vdev = device_data;
|
||||
unsigned long minsz;
|
||||
|
||||
if (cmd == VFIO_DEVICE_GET_INFO) {
|
||||
struct vfio_device_info info;
|
||||
|
||||
minsz = offsetofend(struct vfio_device_info, num_irqs);
|
||||
|
||||
if (copy_from_user(&info, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (info.argsz < minsz)
|
||||
return -EINVAL;
|
||||
|
||||
info.flags = VFIO_DEVICE_FLAGS_PCI;
|
||||
|
||||
if (vdev->reset_works)
|
||||
info.flags |= VFIO_DEVICE_FLAGS_RESET;
|
||||
|
||||
info.num_regions = VFIO_PCI_NUM_REGIONS;
|
||||
info.num_irqs = VFIO_PCI_NUM_IRQS;
|
||||
|
||||
return copy_to_user((void __user *)arg, &info, minsz);
|
||||
|
||||
} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
struct vfio_region_info info;
|
||||
|
||||
minsz = offsetofend(struct vfio_region_info, offset);
|
||||
|
||||
if (copy_from_user(&info, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (info.argsz < minsz)
|
||||
return -EINVAL;
|
||||
|
||||
switch (info.index) {
|
||||
case VFIO_PCI_CONFIG_REGION_INDEX:
|
||||
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
||||
info.size = pdev->cfg_size;
|
||||
info.flags = VFIO_REGION_INFO_FLAG_READ |
|
||||
VFIO_REGION_INFO_FLAG_WRITE;
|
||||
break;
|
||||
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
|
||||
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
||||
info.size = pci_resource_len(pdev, info.index);
|
||||
if (!info.size) {
|
||||
info.flags = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
info.flags = VFIO_REGION_INFO_FLAG_READ |
|
||||
VFIO_REGION_INFO_FLAG_WRITE;
|
||||
if (pci_resource_flags(pdev, info.index) &
|
||||
IORESOURCE_MEM && info.size >= PAGE_SIZE)
|
||||
info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
|
||||
break;
|
||||
case VFIO_PCI_ROM_REGION_INDEX:
|
||||
{
|
||||
void __iomem *io;
|
||||
size_t size;
|
||||
|
||||
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
||||
info.flags = 0;
|
||||
|
||||
/* Report the BAR size, not the ROM size */
|
||||
info.size = pci_resource_len(pdev, info.index);
|
||||
if (!info.size)
|
||||
break;
|
||||
|
||||
/* Is it really there? */
|
||||
io = pci_map_rom(pdev, &size);
|
||||
if (!io || !size) {
|
||||
info.size = 0;
|
||||
break;
|
||||
}
|
||||
pci_unmap_rom(pdev, io);
|
||||
|
||||
info.flags = VFIO_REGION_INFO_FLAG_READ;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return copy_to_user((void __user *)arg, &info, minsz);
|
||||
|
||||
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
|
||||
struct vfio_irq_info info;
|
||||
|
||||
minsz = offsetofend(struct vfio_irq_info, count);
|
||||
|
||||
if (copy_from_user(&info, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
|
||||
return -EINVAL;
|
||||
|
||||
info.flags = VFIO_IRQ_INFO_EVENTFD;
|
||||
|
||||
info.count = vfio_pci_get_irq_count(vdev, info.index);
|
||||
|
||||
if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
|
||||
info.flags |= (VFIO_IRQ_INFO_MASKABLE |
|
||||
VFIO_IRQ_INFO_AUTOMASKED);
|
||||
else
|
||||
info.flags |= VFIO_IRQ_INFO_NORESIZE;
|
||||
|
||||
return copy_to_user((void __user *)arg, &info, minsz);
|
||||
|
||||
} else if (cmd == VFIO_DEVICE_SET_IRQS) {
|
||||
struct vfio_irq_set hdr;
|
||||
u8 *data = NULL;
|
||||
int ret = 0;
|
||||
|
||||
minsz = offsetofend(struct vfio_irq_set, count);
|
||||
|
||||
if (copy_from_user(&hdr, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
|
||||
hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
|
||||
VFIO_IRQ_SET_ACTION_TYPE_MASK))
|
||||
return -EINVAL;
|
||||
|
||||
if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
|
||||
size_t size;
|
||||
|
||||
if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
|
||||
size = sizeof(uint8_t);
|
||||
else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
|
||||
size = sizeof(int32_t);
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
if (hdr.argsz - minsz < hdr.count * size ||
|
||||
hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
|
||||
return -EINVAL;
|
||||
|
||||
data = kmalloc(hdr.count * size, GFP_KERNEL);
|
||||
if (!data)
|
||||
return -ENOMEM;
|
||||
|
||||
if (copy_from_user(data, (void __user *)(arg + minsz),
|
||||
hdr.count * size)) {
|
||||
kfree(data);
|
||||
return -EFAULT;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_lock(&vdev->igate);
|
||||
|
||||
ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
|
||||
hdr.start, hdr.count, data);
|
||||
|
||||
mutex_unlock(&vdev->igate);
|
||||
kfree(data);
|
||||
|
||||
return ret;
|
||||
|
||||
} else if (cmd == VFIO_DEVICE_RESET)
|
||||
return vdev->reset_works ?
|
||||
pci_reset_function(vdev->pdev) : -EINVAL;
|
||||
|
||||
return -ENOTTY;
|
||||
}
|
||||
|
||||
/*
 * vfio_device_ops.read: route a read to the region encoded in *ppos.
 *
 * Config space and the ROM have dedicated indexes; every other index is a
 * BAR and is dispatched on its resource type (I/O port or memory).
 * Returns the helper's result, or -EINVAL for an unknown index or a BAR
 * that is neither I/O nor memory.
 */
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	unsigned long res_flags;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
	case VFIO_PCI_ROM_REGION_INDEX:
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
	default:
		break;
	}

	res_flags = pci_resource_flags(pdev, index);

	if (res_flags & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);

	if (res_flags & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);

	return -EINVAL;
}
|
||||
|
||||
/*
 * vfio_device_ops.write: route a write to the region encoded in *ppos.
 *
 * Mirrors vfio_pci_read() except that the ROM region is read-only and
 * always yields -EINVAL.  The const qualifier on buf is cast away because
 * the shared read/write helpers take a non-const user pointer.
 */
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	char __user *ubuf = (char __user *)buf;
	unsigned long res_flags;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_readwrite(vdev, ubuf, count, ppos, true);
	case VFIO_PCI_ROM_REGION_INDEX:
		return -EINVAL;
	default:
		break;
	}

	res_flags = pci_resource_flags(pdev, index);

	if (res_flags & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, ubuf, count, ppos, true);

	if (res_flags & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, ubuf, count, ppos, true);

	return -EINVAL;
}
|
||||
|
||||
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
|
||||
{
|
||||
struct vfio_pci_device *vdev = device_data;
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned int index;
|
||||
u64 phys_len, req_len, pgoff, req_start, phys;
|
||||
int ret;
|
||||
|
||||
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
|
||||
|
||||
if (vma->vm_end < vma->vm_start)
|
||||
return -EINVAL;
|
||||
if ((vma->vm_flags & VM_SHARED) == 0)
|
||||
return -EINVAL;
|
||||
if (index >= VFIO_PCI_ROM_REGION_INDEX)
|
||||
return -EINVAL;
|
||||
if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
|
||||
return -EINVAL;
|
||||
|
||||
phys_len = pci_resource_len(pdev, index);
|
||||
req_len = vma->vm_end - vma->vm_start;
|
||||
pgoff = vma->vm_pgoff &
|
||||
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
|
||||
req_start = pgoff << PAGE_SHIFT;
|
||||
|
||||
if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
|
||||
return -EINVAL;
|
||||
|
||||
if (index == vdev->msix_bar) {
|
||||
/*
|
||||
* Disallow mmaps overlapping the MSI-X table; users don't
|
||||
* get to touch this directly. We could find somewhere
|
||||
* else to map the overlap, but page granularity is only
|
||||
* a recommendation, not a requirement, so the user needs
|
||||
* to know which bits are real. Requiring them to mmap
|
||||
* around the table makes that clear.
|
||||
*/
|
||||
|
||||
/* If neither entirely above nor below, then it overlaps */
|
||||
if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
|
||||
req_start + req_len <= vdev->msix_offset))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Even though we don't make use of the barmap for the mmap,
|
||||
* we need to request the region and the barmap tracks that.
|
||||
*/
|
||||
if (!vdev->barmap[index]) {
|
||||
ret = pci_request_selected_regions(pdev,
|
||||
1 << index, "vfio-pci");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
vdev->barmap[index] = pci_iomap(pdev, index, 0);
|
||||
}
|
||||
|
||||
vma->vm_private_data = vdev;
|
||||
vma->vm_flags |= (VM_IO | VM_RESERVED);
|
||||
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
|
||||
|
||||
phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
|
||||
|
||||
return remap_pfn_range(vma, vma->vm_start, phys,
|
||||
req_len, vma->vm_page_prot);
|
||||
}
|
||||
|
||||
static const struct vfio_device_ops vfio_pci_ops = {
|
||||
.name = "vfio-pci",
|
||||
.open = vfio_pci_open,
|
||||
.release = vfio_pci_release,
|
||||
.ioctl = vfio_pci_ioctl,
|
||||
.read = vfio_pci_read,
|
||||
.write = vfio_pci_write,
|
||||
.mmap = vfio_pci_mmap,
|
||||
};
|
||||
|
||||
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||
{
|
||||
u8 type;
|
||||
struct vfio_pci_device *vdev;
|
||||
struct iommu_group *group;
|
||||
int ret;
|
||||
|
||||
pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
|
||||
if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
|
||||
return -EINVAL;
|
||||
|
||||
group = iommu_group_get(&pdev->dev);
|
||||
if (!group)
|
||||
return -EINVAL;
|
||||
|
||||
vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
|
||||
if (!vdev) {
|
||||
iommu_group_put(group);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
vdev->pdev = pdev;
|
||||
vdev->irq_type = VFIO_PCI_NUM_IRQS;
|
||||
mutex_init(&vdev->igate);
|
||||
spin_lock_init(&vdev->irqlock);
|
||||
atomic_set(&vdev->refcnt, 0);
|
||||
|
||||
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
|
||||
if (ret) {
|
||||
iommu_group_put(group);
|
||||
kfree(vdev);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void vfio_pci_remove(struct pci_dev *pdev)
|
||||
{
|
||||
struct vfio_pci_device *vdev;
|
||||
|
||||
vdev = vfio_del_group_dev(&pdev->dev);
|
||||
if (!vdev)
|
||||
return;
|
||||
|
||||
iommu_group_put(pdev->dev.iommu_group);
|
||||
kfree(vdev);
|
||||
}
|
||||
|
||||
static struct pci_driver vfio_pci_driver = {
|
||||
.name = "vfio-pci",
|
||||
.id_table = NULL, /* only dynamic ids */
|
||||
.probe = vfio_pci_probe,
|
||||
.remove = vfio_pci_remove,
|
||||
};
|
||||
|
||||
/*
 * Module exit: tear down in the reverse order of vfio_pci_init() --
 * driver first, then the virqfd cleanup workqueue, then the shared config
 * space permission data.
 */
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);

	vfio_pci_virqfd_exit();

	vfio_pci_uninit_perm_bits();
}
|
||||
|
||||
/*
 * Module init: set up the shared config space permission data, start the
 * virqfd cleanup workqueue and register the PCI driver.
 *
 * Returns 0 on success or the first error encountered; anything that was
 * already set up is torn down again in reverse order.
 */
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_virqfd;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	return 0;

out_driver:
	/* Driver registration failed: the workqueue exists and must go */
	vfio_pci_virqfd_exit();
out_virqfd:
	/* Workqueue creation failed: only the perm bits need undoing */
	vfio_pci_uninit_perm_bits();
	return ret;
}
|
||||
|
||||
module_init(vfio_pci_init);
|
||||
module_exit(vfio_pci_cleanup);
|
||||
|
||||
MODULE_VERSION(DRIVER_VERSION);
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR(DRIVER_AUTHOR);
|
||||
MODULE_DESCRIPTION(DRIVER_DESC);
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,740 @@
|
|||
/*
|
||||
* VFIO PCI interrupt handling
|
||||
*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/eventfd.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/vfio.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include "vfio_pci_private.h"
|
||||
|
||||
/*
|
||||
* IRQfd - generic
|
||||
*/
|
||||
struct virqfd {
|
||||
struct vfio_pci_device *vdev;
|
||||
struct eventfd_ctx *eventfd;
|
||||
int (*handler)(struct vfio_pci_device *, void *);
|
||||
void (*thread)(struct vfio_pci_device *, void *);
|
||||
void *data;
|
||||
struct work_struct inject;
|
||||
wait_queue_t wait;
|
||||
poll_table pt;
|
||||
struct work_struct shutdown;
|
||||
struct virqfd **pvirqfd;
|
||||
};
|
||||
|
||||
static struct workqueue_struct *vfio_irqfd_cleanup_wq;
|
||||
|
||||
int __init vfio_pci_virqfd_init(void)
|
||||
{
|
||||
vfio_irqfd_cleanup_wq =
|
||||
create_singlethread_workqueue("vfio-irqfd-cleanup");
|
||||
if (!vfio_irqfd_cleanup_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void vfio_pci_virqfd_exit(void)
|
||||
{
|
||||
destroy_workqueue(vfio_irqfd_cleanup_wq);
|
||||
}
|
||||
|
||||
static void virqfd_deactivate(struct virqfd *virqfd)
|
||||
{
|
||||
queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
|
||||
}
|
||||
|
||||
/*
 * Wait queue callback installed on the eventfd by virqfd_enable().
 *
 * On POLLIN the optional handler is invoked; if there is no handler, or it
 * returns non-zero, and a thread callback exists, the inject work is
 * scheduled.  On POLLHUP the virqfd is queued for teardown.  Always
 * returns 0.
 */
static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct virqfd *vq = container_of(wait, struct virqfd, wait);
	unsigned long events = (unsigned long)key;

	if (events & POLLIN) {
		/* An event has been signaled, call function */
		bool run_thread = !vq->handler ||
				  vq->handler(vq->vdev, vq->data);

		if (run_thread && vq->thread)
			schedule_work(&vq->inject);
	}

	/* The eventfd is closing, detach from VFIO */
	if (events & POLLHUP)
		virqfd_deactivate(vq);

	return 0;
}
|
||||
|
||||
static void virqfd_ptable_queue_proc(struct file *file,
|
||||
wait_queue_head_t *wqh, poll_table *pt)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
|
||||
add_wait_queue(wqh, &virqfd->wait);
|
||||
}
|
||||
|
||||
static void virqfd_shutdown(struct work_struct *work)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
|
||||
struct virqfd **pvirqfd = virqfd->pvirqfd;
|
||||
u64 cnt;
|
||||
|
||||
eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
|
||||
flush_work(&virqfd->inject);
|
||||
eventfd_ctx_put(virqfd->eventfd);
|
||||
|
||||
kfree(virqfd);
|
||||
*pvirqfd = NULL;
|
||||
}
|
||||
|
||||
static void virqfd_inject(struct work_struct *work)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(work, struct virqfd, inject);
|
||||
if (virqfd->thread)
|
||||
virqfd->thread(virqfd->vdev, virqfd->data);
|
||||
}
|
||||
|
||||
static int virqfd_enable(struct vfio_pci_device *vdev,
|
||||
int (*handler)(struct vfio_pci_device *, void *),
|
||||
void (*thread)(struct vfio_pci_device *, void *),
|
||||
void *data, struct virqfd **pvirqfd, int fd)
|
||||
{
|
||||
struct file *file = NULL;
|
||||
struct eventfd_ctx *ctx = NULL;
|
||||
struct virqfd *virqfd;
|
||||
int ret = 0;
|
||||
unsigned int events;
|
||||
|
||||
if (*pvirqfd)
|
||||
return -EBUSY;
|
||||
|
||||
virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
|
||||
if (!virqfd)
|
||||
return -ENOMEM;
|
||||
|
||||
virqfd->pvirqfd = pvirqfd;
|
||||
*pvirqfd = virqfd;
|
||||
virqfd->vdev = vdev;
|
||||
virqfd->handler = handler;
|
||||
virqfd->thread = thread;
|
||||
virqfd->data = data;
|
||||
|
||||
INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
|
||||
INIT_WORK(&virqfd->inject, virqfd_inject);
|
||||
|
||||
file = eventfd_fget(fd);
|
||||
if (IS_ERR(file)) {
|
||||
ret = PTR_ERR(file);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ctx = eventfd_ctx_fileget(file);
|
||||
if (IS_ERR(ctx)) {
|
||||
ret = PTR_ERR(ctx);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
virqfd->eventfd = ctx;
|
||||
|
||||
/*
|
||||
* Install our own custom wake-up handling so we are notified via
|
||||
* a callback whenever someone signals the underlying eventfd.
|
||||
*/
|
||||
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
|
||||
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
|
||||
|
||||
events = file->f_op->poll(file, &virqfd->pt);
|
||||
|
||||
/*
|
||||
* Check if there was an event already pending on the eventfd
|
||||
* before we registered and trigger it as if we didn't miss it.
|
||||
*/
|
||||
if (events & POLLIN) {
|
||||
if ((!handler || handler(vdev, data)) && thread)
|
||||
schedule_work(&virqfd->inject);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not drop the file until the irqfd is fully initialized,
|
||||
* otherwise we might race against the POLLHUP.
|
||||
*/
|
||||
fput(file);
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
if (ctx && !IS_ERR(ctx))
|
||||
eventfd_ctx_put(ctx);
|
||||
|
||||
if (file && !IS_ERR(file))
|
||||
fput(file);
|
||||
|
||||
kfree(virqfd);
|
||||
*pvirqfd = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void virqfd_disable(struct virqfd *virqfd)
|
||||
{
|
||||
if (!virqfd)
|
||||
return;
|
||||
|
||||
virqfd_deactivate(virqfd);
|
||||
|
||||
/* Block until we know all outstanding shutdown jobs have completed. */
|
||||
flush_workqueue(vfio_irqfd_cleanup_wq);
|
||||
}
|
||||
|
||||
/*
|
||||
* INTx
|
||||
*/
|
||||
static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
|
||||
{
|
||||
if (likely(is_intx(vdev) && !vdev->virq_disabled))
|
||||
eventfd_signal(vdev->ctx[0].trigger, 1);
|
||||
}
|
||||
|
||||
void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* Masking can come from interrupt, ioctl, or config space
|
||||
* via INTx disable. The latter means this can get called
|
||||
* even when not using intx delivery. In this case, just
|
||||
* try to have the physical bit follow the virtual bit.
|
||||
*/
|
||||
if (unlikely(!is_intx(vdev))) {
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 0);
|
||||
} else if (!vdev->ctx[0].masked) {
|
||||
/*
|
||||
* Can't use check_and_mask here because we always want to
|
||||
* mask, not just when something is pending.
|
||||
*/
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 0);
|
||||
else
|
||||
disable_irq_nosync(pdev->irq);
|
||||
|
||||
vdev->ctx[0].masked = true;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is triggered by an eventfd, we can't call eventfd_signal
|
||||
* or else we'll deadlock on the eventfd wait queue. Return >0 when
|
||||
* a signal is necessary, which can then be handled via a work queue
|
||||
* or directly depending on the caller.
|
||||
*/
|
||||
int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* Unmasking comes from ioctl or config, so again, have the
|
||||
* physical bit follow the virtual even when not using INTx.
|
||||
*/
|
||||
if (unlikely(!is_intx(vdev))) {
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 1);
|
||||
} else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
|
||||
/*
|
||||
* A pending interrupt here would immediately trigger,
|
||||
* but we can avoid that overhead by just re-sending
|
||||
* the interrupt to the user.
|
||||
*/
|
||||
if (vdev->pci_2_3) {
|
||||
if (!pci_check_and_unmask_intx(pdev))
|
||||
ret = 1;
|
||||
} else
|
||||
enable_irq(pdev->irq);
|
||||
|
||||
vdev->ctx[0].masked = (ret > 0);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
|
||||
{
|
||||
if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
}
|
||||
|
||||
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
|
||||
{
|
||||
struct vfio_pci_device *vdev = dev_id;
|
||||
unsigned long flags;
|
||||
int ret = IRQ_NONE;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
if (!vdev->pci_2_3) {
|
||||
disable_irq_nosync(vdev->pdev->irq);
|
||||
vdev->ctx[0].masked = true;
|
||||
ret = IRQ_HANDLED;
|
||||
} else if (!vdev->ctx[0].masked && /* may be shared */
|
||||
pci_check_and_mask_intx(vdev->pdev)) {
|
||||
vdev->ctx[0].masked = true;
|
||||
ret = IRQ_HANDLED;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
if (ret == IRQ_HANDLED)
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vfio_intx_enable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
if (!is_irq_none(vdev))
|
||||
return -EINVAL;
|
||||
|
||||
if (!vdev->pdev->irq)
|
||||
return -ENODEV;
|
||||
|
||||
vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
|
||||
if (!vdev->ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
vdev->num_ctx = 1;
|
||||
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long irqflags = IRQF_SHARED;
|
||||
struct eventfd_ctx *trigger;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
if (vdev->ctx[0].trigger) {
|
||||
free_irq(pdev->irq, vdev);
|
||||
kfree(vdev->ctx[0].name);
|
||||
eventfd_ctx_put(vdev->ctx[0].trigger);
|
||||
vdev->ctx[0].trigger = NULL;
|
||||
}
|
||||
|
||||
if (fd < 0) /* Disable only */
|
||||
return 0;
|
||||
|
||||
vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
|
||||
pci_name(pdev));
|
||||
if (!vdev->ctx[0].name)
|
||||
return -ENOMEM;
|
||||
|
||||
trigger = eventfd_ctx_fdget(fd);
|
||||
if (IS_ERR(trigger)) {
|
||||
kfree(vdev->ctx[0].name);
|
||||
return PTR_ERR(trigger);
|
||||
}
|
||||
|
||||
if (!vdev->pci_2_3)
|
||||
irqflags = 0;
|
||||
|
||||
ret = request_irq(pdev->irq, vfio_intx_handler,
|
||||
irqflags, vdev->ctx[0].name, vdev);
|
||||
if (ret) {
|
||||
kfree(vdev->ctx[0].name);
|
||||
eventfd_ctx_put(trigger);
|
||||
return ret;
|
||||
}
|
||||
|
||||
vdev->ctx[0].trigger = trigger;
|
||||
|
||||
/*
|
||||
* INTx disable will stick across the new irq setup,
|
||||
* disable_irq won't.
|
||||
*/
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled))
|
||||
disable_irq_nosync(pdev->irq);
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void vfio_intx_disable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
vfio_intx_set_signal(vdev, -1);
|
||||
virqfd_disable(vdev->ctx[0].unmask);
|
||||
virqfd_disable(vdev->ctx[0].mask);
|
||||
vdev->irq_type = VFIO_PCI_NUM_IRQS;
|
||||
vdev->num_ctx = 0;
|
||||
kfree(vdev->ctx);
|
||||
}
|
||||
|
||||
/*
|
||||
* MSI/MSI-X
|
||||
*/
|
||||
static irqreturn_t vfio_msihandler(int irq, void *arg)
|
||||
{
|
||||
struct eventfd_ctx *trigger = arg;
|
||||
|
||||
eventfd_signal(trigger, 1);
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/*
 * Switch the device into MSI or MSI-X mode with nvec vectors.
 *
 * Allocates one interrupt context per vector (plus the msix_entry table
 * for MSI-X) and enables the capability on the device.
 *
 * Returns 0 on success, -EINVAL if another interrupt mode is already
 * active, -ENOMEM on allocation failure, or whatever non-zero value
 * pci_enable_msix() / pci_enable_msi_block() returned.
 * NOTE(review): those helpers may return a positive value when fewer
 * vectors are available; that value is passed through unchanged here --
 * confirm callers cope with it.
 */
static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!is_irq_none(vdev))
		return -EINVAL;

	/* kcalloc() checks the count * size multiplication for overflow */
	vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
	if (!vdev->ctx)
		return -ENOMEM;

	if (msix) {
		int i;

		vdev->msix = kcalloc(nvec, sizeof(struct msix_entry),
				     GFP_KERNEL);
		if (!vdev->msix) {
			kfree(vdev->ctx);
			return -ENOMEM;
		}

		/* Request table entries 0..nvec-1 */
		for (i = 0; i < nvec; i++)
			vdev->msix[i].entry = i;

		ret = pci_enable_msix(pdev, vdev->msix, nvec);
		if (ret) {
			kfree(vdev->msix);
			kfree(vdev->ctx);
			return ret;
		}
	} else {
		ret = pci_enable_msi_block(pdev, nvec);
		if (ret) {
			kfree(vdev->ctx);
			return ret;
		}
	}

	vdev->num_ctx = nvec;
	vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
				VFIO_PCI_MSI_IRQ_INDEX;

	if (!msix) {
		/*
		 * Compute the virtual hardware field for max msi vectors -
		 * it is the log base 2 of the number of vectors.
		 */
		vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
	}

	return 0;
}
|
||||
|
||||
/*
 * Attach, replace or remove the eventfd trigger of one MSI/MSI-X vector.
 *
 * @vdev:   device owning the vector
 * @vector: index into vdev->ctx (and vdev->msix for MSI-X)
 * @fd:     eventfd to signal on interrupt, or a negative value to only
 *          remove the current trigger
 * @msix:   true for MSI-X, false for MSI
 *
 * Any existing trigger is released first (IRQ freed, name freed, eventfd
 * reference dropped).  If @fd is non-negative a new IRQ handler is then
 * requested with the eventfd context as its cookie.
 *
 * Returns 0 on success, -EINVAL if @vector is out of range, -ENOMEM if the
 * IRQ name cannot be allocated, or the error from eventfd_ctx_fdget() /
 * request_irq().
 */
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
				      int vector, int fd, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	char *name = msix ? "vfio-msix" : "vfio-msi";
	struct eventfd_ctx *trigger;
	int irq, ret;

	/* Validate before vdev->msix[] / vdev->ctx[] are indexed */
	if (vector < 0 || vector >= vdev->num_ctx)
		return -EINVAL;

	irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;

	if (vdev->ctx[vector].trigger) {
		free_irq(irq, vdev->ctx[vector].trigger);
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(vdev->ctx[vector].trigger);
		vdev->ctx[vector].trigger = NULL;
	}

	/* Negative fd: caller only wanted the old trigger removed */
	if (fd < 0)
		return 0;

	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
					   name, vector, pci_name(pdev));
	if (!vdev->ctx[vector].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[vector].name);
		return PTR_ERR(trigger);
	}

	ret = request_irq(irq, vfio_msihandler, 0,
			  vdev->ctx[vector].name, trigger);
	if (ret) {
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	/* Publish the trigger only once the handler is installed */
	vdev->ctx[vector].trigger = trigger;

	return 0;
}
|
||||
|
||||
/*
 * Apply vfio_msi_set_vector_signal() to the vectors [start, start + count).
 *
 * @vdev:  device owning the vectors
 * @start: first vector index
 * @count: number of vectors
 * @fds:   array of @count eventfds, or NULL to use -1 (trigger removal) for
 *         every vector
 * @msix:  true for MSI-X, false for MSI
 *
 * If any vector fails, every vector of the range that was already processed
 * is torn down again (fd -1) and the error is returned.
 *
 * Returns 0 on success, -EINVAL if the range does not fit inside
 * vdev->num_ctx, or the error of the failing vector.
 */
static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
			      unsigned count, int32_t *fds, bool msix)
{
	unsigned nctx = vdev->num_ctx;
	unsigned i, j;
	int ret = 0;

	/* Written so that start + count cannot wrap around */
	if (start > nctx || count > nctx - start)
		return -EINVAL;

	for (i = 0, j = start; i < count && !ret; i++, j++) {
		int fd = fds ? fds[i] : -1;

		ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
	}

	if (ret) {
		/*
		 * j is one past the failing vector.  Unwind down to start
		 * with an unsigned-safe countdown that also ends when
		 * start == 0.
		 */
		while (j > start)
			vfio_msi_set_vector_signal(vdev, --j, -1, msix);
	}

	return ret;
}
|
||||
|
||||
/*
 * Leave MSI or MSI-X mode.
 *
 * Removes the trigger of every vector, disables each context's unmask and
 * mask virqfds, disables MSI/MSI-X in the PCI core (freeing the msix_entry
 * table for MSI-X), then marks the device as having no interrupt type
 * selected and frees the context array.
 */
static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int vec;

	/* A NULL fd array makes set_block use fd -1 for every vector */
	vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);

	for (vec = 0; vec < vdev->num_ctx; vec++) {
		struct vfio_pci_irq_ctx *ctx = &vdev->ctx[vec];

		virqfd_disable(ctx->unmask);
		virqfd_disable(ctx->mask);
	}

	if (!msix) {
		pci_disable_msi(pdev);
	} else {
		pci_disable_msix(pdev);
		kfree(vdev->msix);
	}

	/* VFIO_PCI_NUM_IRQS is the "no interrupt type active" marker */
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	vdev->num_ctx = 0;
	kfree(vdev->ctx);
}
|
||||
|
||||
/*
|
||||
* IOCTL support
|
||||
*/
|
||||
/*
 * VFIO_IRQ_SET_ACTION_UNMASK handler for INTx.
 *
 * Only valid while INTx is the active interrupt type, and only for the
 * single sub-index 0 (@start == 0, @count == 1).  Depending on the data
 * type in @flags:
 *   DATA_NONE    - unmask immediately
 *   DATA_BOOL    - unmask if the byte at @data is non-zero
 *   DATA_EVENTFD - @data points to an int32_t fd; a non-negative fd is
 *                  registered through virqfd_enable() with the unmask
 *                  handler, a negative fd disables the current unmask virqfd
 *
 * Returns 0, -EINVAL for an invalid request, or virqfd_enable()'s result.
 */
static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_unmask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		if (*(uint8_t *)data)
			vfio_pci_intx_unmask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t fd = *(int32_t *)data;

		if (fd < 0) {
			virqfd_disable(vdev->ctx[0].unmask);
			return 0;
		}

		return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
				     vfio_send_intx_eventfd, NULL,
				     &vdev->ctx[0].unmask, fd);
	}

	return 0;
}
|
||||
|
||||
/*
 * VFIO_IRQ_SET_ACTION_MASK handler for INTx.
 *
 * Only valid while INTx is the active interrupt type, and only for the
 * single sub-index 0 (@start == 0, @count == 1).  DATA_NONE masks
 * immediately, DATA_BOOL masks if the byte at @data is non-zero, and
 * DATA_EVENTFD is not implemented yet.
 *
 * Returns 0, -EINVAL for an invalid request, or -ENOTTY for DATA_EVENTFD.
 */
static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
				  unsigned index, unsigned start,
				  unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_mask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		if (*(uint8_t *)data)
			vfio_pci_intx_mask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
		return -ENOTTY; /* XXX implement me */

	return 0;
}
|
||||
|
||||
/*
 * VFIO_IRQ_SET_ACTION_TRIGGER handler for INTx.
 *
 *   - @count == 0 with DATA_NONE while INTx is active disables INTx.
 *   - DATA_EVENTFD (@data points to an int32_t fd) sets the signalling
 *     eventfd; if no interrupt type is active yet, INTx is enabled first
 *     and disabled again should setting the signal fail.
 *   - DATA_NONE / DATA_BOOL while INTx is active inject an interrupt by
 *     calling vfio_send_intx_eventfd() (for DATA_BOOL only if the byte at
 *     @data is non-zero).
 *
 * Apart from the disable case the request must address sub-index 0 only
 * (@start == 0, @count == 1) and the device must be in INTx mode or have no
 * interrupt type selected.
 *
 * Returns 0 on success, -EINVAL for an invalid request, or the error from
 * vfio_intx_enable() / vfio_intx_set_signal().
 */
static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
				     unsigned index, unsigned start,
				     unsigned count, uint32_t flags, void *data)
{
	if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
		vfio_intx_disable(vdev);
		return 0;
	}

	if (start != 0 || count != 1)
		return -EINVAL;

	if (!is_intx(vdev) && !is_irq_none(vdev))
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t fd = *(int32_t *)data;
		int ret;

		if (!is_intx(vdev)) {
			/* First use: bring INTx up, roll back on failure */
			ret = vfio_intx_enable(vdev);
			if (ret)
				return ret;

			ret = vfio_intx_set_signal(vdev, fd);
			if (ret)
				vfio_intx_disable(vdev);

			return ret;
		}

		return vfio_intx_set_signal(vdev, fd);
	}

	/* Software-triggering needs INTx to be set up already */
	if (!is_intx(vdev))
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_send_intx_eventfd(vdev, NULL);
		return 0;
	}

	if ((flags & VFIO_IRQ_SET_DATA_BOOL) && *(uint8_t *)data)
		vfio_send_intx_eventfd(vdev, NULL);

	return 0;
}
|
||||
|
||||
/*
 * VFIO_IRQ_SET_ACTION_TRIGGER handler for MSI and MSI-X.
 *
 *   - @count == 0 with DATA_NONE while @index is the active type disables
 *     MSI/MSI-X.
 *   - DATA_EVENTFD (@data is an array of int32_t fds) sets the eventfds of
 *     vectors [start, start + count).  If no interrupt type is active yet,
 *     start + count vectors are enabled first and disabled again should
 *     setting the block fail.
 *   - DATA_NONE / DATA_BOOL signal the eventfd of every vector in the range
 *     that has a trigger (for DATA_BOOL only where the matching byte of
 *     @data is non-zero).
 *
 * Returns 0 on success, -EINVAL for an invalid request, or the error from
 * vfio_msi_enable() / vfio_msi_set_block().
 */
static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX);
	int vec;

	if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
		vfio_msi_disable(vdev, msix);
		return 0;
	}

	/* Some other interrupt type is currently active */
	if (!irq_is(vdev, index) && !is_irq_none(vdev))
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t *fds = data;
		int ret;

		if (vdev->irq_type != index) {
			/* First use: enable vectors, roll back on failure */
			ret = vfio_msi_enable(vdev, start + count, msix);
			if (ret)
				return ret;

			ret = vfio_msi_set_block(vdev, start, count, fds, msix);
			if (ret)
				vfio_msi_disable(vdev, msix);

			return ret;
		}

		return vfio_msi_set_block(vdev, start, count, fds, msix);
	}

	if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
		return -EINVAL;

	for (vec = start; vec < start + count; vec++) {
		struct eventfd_ctx *trigger = vdev->ctx[vec].trigger;

		if (!trigger)
			continue;

		if (flags & VFIO_IRQ_SET_DATA_NONE) {
			eventfd_signal(trigger, 1);
		} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
			uint8_t *bools = data;

			/* bools[] is indexed relative to @start */
			if (bools[vec - start])
				eventfd_signal(trigger, 1);
		}
	}

	return 0;
}
|
||||
|
||||
/*
 * Dispatch a set-IRQs request to the handler matching the IRQ @index and
 * the action encoded in @flags.
 *
 * INTx supports MASK, UNMASK and TRIGGER; MSI and MSI-X support TRIGGER
 * only.  Any other combination has no handler.
 *
 * Returns the handler's result, or -ENOTTY if no handler exists.
 */
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
			    unsigned index, unsigned start, unsigned count,
			    void *data)
{
	uint32_t action = flags & VFIO_IRQ_SET_ACTION_TYPE_MASK;
	int (*handler)(struct vfio_pci_device *vdev, unsigned index,
		       unsigned start, unsigned count, uint32_t flags,
		       void *data) = NULL;

	if (index == VFIO_PCI_INTX_IRQ_INDEX) {
		if (action == VFIO_IRQ_SET_ACTION_MASK)
			handler = vfio_pci_set_intx_mask;
		else if (action == VFIO_IRQ_SET_ACTION_UNMASK)
			handler = vfio_pci_set_intx_unmask;
		else if (action == VFIO_IRQ_SET_ACTION_TRIGGER)
			handler = vfio_pci_set_intx_trigger;
	} else if (index == VFIO_PCI_MSI_IRQ_INDEX ||
		   index == VFIO_PCI_MSIX_IRQ_INDEX) {
		/* XXX MASK/UNMASK need masking support exported */
		if (action == VFIO_IRQ_SET_ACTION_TRIGGER)
			handler = vfio_pci_set_msi_trigger;
	}

	if (!handler)
		return -ENOTTY;

	return handler(vdev, index, start, count, flags, data);
}
|
|
@ -0,0 +1,91 @@
|
|||
/*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/pci.h>
|
||||
|
||||
#ifndef VFIO_PCI_PRIVATE_H
|
||||
#define VFIO_PCI_PRIVATE_H
|
||||
|
||||
/*
 * A file offset carries the region index in the bits at and above
 * VFIO_PCI_OFFSET_SHIFT and the offset within that region in the bits below.
 */
#define VFIO_PCI_OFFSET_SHIFT   40

/* Macro arguments are parenthesized so any expression may be passed */
#define VFIO_PCI_OFFSET_TO_INDEX(off)	((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
|
||||
|
||||
/* Per-interrupt (INTx) or per-vector (MSI/MSI-X) signalling state */
struct vfio_pci_irq_ctx {
	struct eventfd_ctx	*trigger;	/* eventfd signalled on interrupt */
	struct virqfd		*unmask;	/* virqfd registered for unmask */
	struct virqfd		*mask;		/* virqfd registered for mask */
	char			*name;		/* name handed to request_irq() */
	bool			masked;		/* presumably: currently masked - TODO confirm */
};
|
||||
|
||||
struct vfio_pci_device {
|
||||
struct pci_dev *pdev;
|
||||
void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
|
||||
u8 *pci_config_map;
|
||||
u8 *vconfig;
|
||||
struct perm_bits *msi_perm;
|
||||
spinlock_t irqlock;
|
||||
struct mutex igate;
|
||||
struct msix_entry *msix;
|
||||
struct vfio_pci_irq_ctx *ctx;
|
||||
int num_ctx;
|
||||
int irq_type;
|
||||
u8 msi_qmax;
|
||||
u8 msix_bar;
|
||||
u16 msix_size;
|
||||
u32 msix_offset;
|
||||
u32 rbar[7];
|
||||
bool pci_2_3;
|
||||
bool virq_disabled;
|
||||
bool reset_works;
|
||||
bool extended_caps;
|
||||
bool bardirty;
|
||||
struct pci_saved_state *pci_saved_state;
|
||||
atomic_t refcnt;
|
||||
};
|
||||
|
||||
/*
 * Tests on the device's active interrupt type.  Arguments are parenthesized
 * so that expressions such as "&devs[i]" or "a ? b : c" expand correctly.
 */
#define is_intx(vdev)	((vdev)->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
#define is_msi(vdev)	((vdev)->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
#define is_msix(vdev)	((vdev)->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
/* True when none of INTx, MSI or MSI-X is currently selected */
#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
#define irq_is(vdev, type) ((vdev)->irq_type == (type))
|
||||
|
||||
extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
|
||||
extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);
|
||||
|
||||
extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
|
||||
uint32_t flags, unsigned index,
|
||||
unsigned start, unsigned count, void *data);
|
||||
|
||||
extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
|
||||
char __user *buf, size_t count,
|
||||
loff_t *ppos, bool iswrite);
|
||||
extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev,
|
||||
char __user *buf, size_t count,
|
||||
loff_t *ppos, bool iswrite);
|
||||
extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev,
|
||||
char __user *buf, size_t count,
|
||||
loff_t *ppos, bool iswrite);
|
||||
|
||||
extern int vfio_pci_init_perm_bits(void);
|
||||
extern void vfio_pci_uninit_perm_bits(void);
|
||||
|
||||
extern int vfio_pci_virqfd_init(void);
|
||||
extern void vfio_pci_virqfd_exit(void);
|
||||
|
||||
extern int vfio_config_init(struct vfio_pci_device *vdev);
|
||||
extern void vfio_config_free(struct vfio_pci_device *vdev);
|
||||
#endif /* VFIO_PCI_PRIVATE_H */
|
|
@ -0,0 +1,269 @@
|
|||
/*
|
||||
* VFIO PCI I/O Port & MMIO access
|
||||
*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/io.h>
|
||||
|
||||
#include "vfio_pci_private.h"
|
||||
|
||||
/* I/O Port BAR access */
|
||||
ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf,
|
||||
size_t count, loff_t *ppos, bool iswrite)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
|
||||
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
|
||||
void __iomem *io;
|
||||
size_t done = 0;
|
||||
|
||||
if (!pci_resource_start(pdev, bar))
|
||||
return -EINVAL;
|
||||
|
||||
if (pos + count > pci_resource_len(pdev, bar))
|
||||
return -EINVAL;
|
||||
|
||||
if (!vdev->barmap[bar]) {
|
||||
int ret;
|
||||
|
||||
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
|
||||
|
||||
if (!vdev->barmap[bar]) {
|
||||
pci_release_selected_regions(pdev, 1 << bar);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
io = vdev->barmap[bar];
|
||||
|
||||
while (count) {
|
||||
int filled;
|
||||
|
||||
if (count >= 3 && !(pos % 4)) {
|
||||
__le32 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 4))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite32(le32_to_cpu(val), io + pos);
|
||||
} else {
|
||||
val = cpu_to_le32(ioread32(io + pos));
|
||||
|
||||
if (copy_to_user(buf, &val, 4))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 4;
|
||||
|
||||
} else if ((pos % 2) == 0 && count >= 2) {
|
||||
__le16 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 2))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite16(le16_to_cpu(val), io + pos);
|
||||
} else {
|
||||
val = cpu_to_le16(ioread16(io + pos));
|
||||
|
||||
if (copy_to_user(buf, &val, 2))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 2;
|
||||
} else {
|
||||
u8 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 1))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite8(val, io + pos);
|
||||
} else {
|
||||
val = ioread8(io + pos);
|
||||
|
||||
if (copy_to_user(buf, &val, 1))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 1;
|
||||
}
|
||||
|
||||
count -= filled;
|
||||
done += filled;
|
||||
buf += filled;
|
||||
pos += filled;
|
||||
}
|
||||
|
||||
*ppos += done;
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
/*
|
||||
* MMIO BAR access
|
||||
* We handle two excluded ranges here as well, if the user tries to read
|
||||
* the ROM beyond what PCI tells us is available or the MSI-X table region,
|
||||
* we return 0xFF and writes are dropped.
|
||||
*/
|
||||
ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf,
|
||||
size_t count, loff_t *ppos, bool iswrite)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
|
||||
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
|
||||
void __iomem *io;
|
||||
resource_size_t end;
|
||||
size_t done = 0;
|
||||
size_t x_start = 0, x_end = 0; /* excluded range */
|
||||
|
||||
if (!pci_resource_start(pdev, bar))
|
||||
return -EINVAL;
|
||||
|
||||
end = pci_resource_len(pdev, bar);
|
||||
|
||||
if (pos > end)
|
||||
return -EINVAL;
|
||||
|
||||
if (pos == end)
|
||||
return 0;
|
||||
|
||||
if (pos + count > end)
|
||||
count = end - pos;
|
||||
|
||||
if (bar == PCI_ROM_RESOURCE) {
|
||||
io = pci_map_rom(pdev, &x_start);
|
||||
x_end = end;
|
||||
} else {
|
||||
if (!vdev->barmap[bar]) {
|
||||
int ret;
|
||||
|
||||
ret = pci_request_selected_regions(pdev, 1 << bar,
|
||||
"vfio");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
|
||||
|
||||
if (!vdev->barmap[bar]) {
|
||||
pci_release_selected_regions(pdev, 1 << bar);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
io = vdev->barmap[bar];
|
||||
|
||||
if (bar == vdev->msix_bar) {
|
||||
x_start = vdev->msix_offset;
|
||||
x_end = vdev->msix_offset + vdev->msix_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (!io)
|
||||
return -EINVAL;
|
||||
|
||||
while (count) {
|
||||
size_t fillable, filled;
|
||||
|
||||
if (pos < x_start)
|
||||
fillable = x_start - pos;
|
||||
else if (pos >= x_end)
|
||||
fillable = end - pos;
|
||||
else
|
||||
fillable = 0;
|
||||
|
||||
if (fillable >= 4 && !(pos % 4) && (count >= 4)) {
|
||||
__le32 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 4))
|
||||
goto out;
|
||||
|
||||
iowrite32(le32_to_cpu(val), io + pos);
|
||||
} else {
|
||||
val = cpu_to_le32(ioread32(io + pos));
|
||||
|
||||
if (copy_to_user(buf, &val, 4))
|
||||
goto out;
|
||||
}
|
||||
|
||||
filled = 4;
|
||||
} else if (fillable >= 2 && !(pos % 2) && (count >= 2)) {
|
||||
__le16 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 2))
|
||||
goto out;
|
||||
|
||||
iowrite16(le16_to_cpu(val), io + pos);
|
||||
} else {
|
||||
val = cpu_to_le16(ioread16(io + pos));
|
||||
|
||||
if (copy_to_user(buf, &val, 2))
|
||||
goto out;
|
||||
}
|
||||
|
||||
filled = 2;
|
||||
} else if (fillable) {
|
||||
u8 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 1))
|
||||
goto out;
|
||||
|
||||
iowrite8(val, io + pos);
|
||||
} else {
|
||||
val = ioread8(io + pos);
|
||||
|
||||
if (copy_to_user(buf, &val, 1))
|
||||
goto out;
|
||||
}
|
||||
|
||||
filled = 1;
|
||||
} else {
|
||||
/* Drop writes, fill reads with FF */
|
||||
if (!iswrite) {
|
||||
char val = 0xFF;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < x_end - pos; i++) {
|
||||
if (put_user(val, buf + i))
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
filled = x_end - pos;
|
||||
}
|
||||
|
||||
count -= filled;
|
||||
done += filled;
|
||||
buf += filled;
|
||||
pos += filled;
|
||||
}
|
||||
|
||||
*ppos += done;
|
||||
|
||||
out:
|
||||
if (bar == PCI_ROM_RESOURCE)
|
||||
pci_unmap_rom(pdev, io);
|
||||
|
||||
return count ? -EFAULT : done;
|
||||
}
|
|
@ -223,6 +223,7 @@ struct vfio_device_info {
|
|||
__u32 argsz;
|
||||
__u32 flags;
|
||||
#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */
|
||||
#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
|
||||
__u32 num_regions; /* Max region index + 1 */
|
||||
__u32 num_irqs; /* Max IRQ index + 1 */
|
||||
};
|
||||
|
@ -364,6 +365,31 @@ struct vfio_irq_set {
|
|||
*/
|
||||
#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
|
||||
|
||||
/*
|
||||
* The VFIO-PCI bus driver makes use of the following fixed region and
|
||||
* IRQ index mapping. Unimplemented regions return a size of zero.
|
||||
* Unimplemented IRQ types return a count of zero.
|
||||
*/
|
||||
|
||||
/* Fixed region indexes exposed by vfio-pci; values are spelled out */
enum {
	VFIO_PCI_BAR0_REGION_INDEX	= 0,
	VFIO_PCI_BAR1_REGION_INDEX	= 1,
	VFIO_PCI_BAR2_REGION_INDEX	= 2,
	VFIO_PCI_BAR3_REGION_INDEX	= 3,
	VFIO_PCI_BAR4_REGION_INDEX	= 4,
	VFIO_PCI_BAR5_REGION_INDEX	= 5,
	VFIO_PCI_ROM_REGION_INDEX	= 6,
	VFIO_PCI_CONFIG_REGION_INDEX	= 7,
	VFIO_PCI_NUM_REGIONS		/* number of indexes above */
};
|
||||
|
||||
/* Fixed IRQ indexes exposed by vfio-pci; values are spelled out */
enum {
	VFIO_PCI_INTX_IRQ_INDEX	= 0,
	VFIO_PCI_MSI_IRQ_INDEX	= 1,
	VFIO_PCI_MSIX_IRQ_INDEX	= 2,
	VFIO_PCI_NUM_IRQS	/* number of indexes above */
};
|
||||
|
||||
/* -------- API for Type1 VFIO IOMMU -------- */
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue