VFIO: Virtual Function I/O and Device Passthrough
IOMMU groups, DMA remapping, and direct PCI device access from userspace/VMs
What is VFIO?
VFIO (Virtual Function I/O) provides a secure mechanism to expose PCI devices directly to userspace processes or VMs, bypassing the kernel driver. A guest VM with VFIO passthrough can access the device at near-native speed:
Without passthrough:
Guest driver → virtio protocol → host vhost/QEMU → host kernel driver → hardware
Every I/O: multiple context switches, data copies
With VFIO passthrough:
Guest driver → VFIO → IOMMU DMA remapping → hardware
Direct DMA: guest physical addresses remapped by IOMMU to real hardware addresses
Requirements:
- IOMMU hardware (Intel VT-d or AMD-Vi)
- Device must be in its own IOMMU group (or group isolation satisfied)
- intel_iommu=on boot parameter on Intel (on AMD the IOMMU is enabled by default; amd_iommu=off disables it)
IOMMU groups
An IOMMU group is the smallest set of devices that must be isolated together for DMA safety. Devices in the same PCIe hierarchy that can peer-DMA to each other form a group:
# List all IOMMU groups:
find /sys/kernel/iommu_groups -type l | sort -V | head -20
# /sys/kernel/iommu_groups/0/devices/0000:00:00.0
# /sys/kernel/iommu_groups/1/devices/0000:00:01.0
# /sys/kernel/iommu_groups/2/devices/0000:00:02.0
# ...
# Check which group a device belongs to:
ls /sys/bus/pci/devices/0000:03:00.0/iommu_group
# → ../../../kernel/iommu_groups/15
# List all devices in a group:
ls /sys/kernel/iommu_groups/15/devices/
# 0000:03:00.0 0000:03:00.1 ← e.g., NIC with two functions
# For passthrough, ALL devices in the group must be passed through
# (or bound to vfio-pci / the pci-stub driver so no host driver touches them)
VFIO architecture
Userspace (QEMU/application)
↓ open/ioctl
/dev/vfio/vfio ← container (IOMMU domain)
/dev/vfio/<group_id> ← IOMMU group
↓
VFIO kernel driver
↓
IOMMU driver (Intel VT-d / AMD-Vi)
↓
PCIe device BAR access (MMIO mapped to guest/userspace)
DMA: guest GPA → IOMMU → host HPA (remapped per container)
Key objects
/* 1. Container: an IOMMU address space (shared by multiple groups) */
/* fd = open("/dev/vfio/vfio", O_RDWR) */
/* 2. Group: a set of devices that share DMA isolation requirements */
/* fd = open("/dev/vfio/15", O_RDWR) (group 15) */
/* 3. Device: one PCI function within a group */
/* fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, "0000:03:00.0") */
Setting up VFIO passthrough
# Step 1: Enable IOMMU in boot parameters (/etc/default/grub):
GRUB_CMDLINE_LINUX="intel_iommu=on iommu=pt"
# iommu=pt: passthrough mode (better performance for non-vfio devices)
update-grub && reboot
# Step 2: Load vfio-pci module:
modprobe vfio-pci
# Step 3: Unbind device from its current driver:
echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind
# Step 4: Bind to vfio-pci:
echo "10de 1234" > /sys/bus/pci/drivers/vfio-pci/new_id # vendor:device
# or:
echo 0000:03:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
# Step 5: Verify:
ls /sys/bus/pci/devices/0000:03:00.0/driver
# → /sys/bus/pci/drivers/vfio-pci
# Step 6: Check group:
ls /sys/kernel/iommu_groups/15/devices/
# All devices in group must be bound to vfio-pci (or no driver)
QEMU device passthrough
# Pass an NVIDIA GPU (03:00.0) and its audio function (03:00.1) to a VM.
# (Note: a shell comment cannot follow a line-continuation backslash,
#  so annotations go here rather than on the continued lines.)
qemu-system-x86_64 \
-enable-kvm \
-m 8G \
-cpu host \
-device vfio-pci,host=03:00.0,multifunction=on \
-device vfio-pci,host=03:00.1 \
-drive if=virtio,file=vm.img \
-net user,hostfwd=tcp::2222-:22
# Pass an NVMe SSD (02:00.0 = NVMe controller):
qemu-system-x86_64 \
-enable-kvm \
-device vfio-pci,host=02:00.0 \
...
Userspace VFIO API
Applications (not just VMs) can use VFIO directly for device access:
#include <linux/vfio.h>
/* 1. Open container */
int container = open("/dev/vfio/vfio", O_RDWR);
ioctl(container, VFIO_GET_API_VERSION); /* must be VFIO_API_VERSION */
/* 2. Open IOMMU group */
int group = open("/dev/vfio/15", O_RDWR);
struct vfio_group_status status = { .argsz = sizeof(status) }; /* argsz required */
ioctl(group, VFIO_GROUP_GET_STATUS, &status);
assert(status.flags & VFIO_GROUP_FLAGS_VIABLE);
/* 3. Add group to container */
ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
/* 4. Set IOMMU type (creates the DMA mapping domain) */
ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
/* 5. Map DMA buffers into IOMMU */
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(dma_map),
.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
.vaddr = (uintptr_t)buf, /* host virtual address */
.iova = 0x10000000, /* IOVA (device sees this address) */
.size = BUF_SIZE,
};
ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
/* Device can now DMA to/from IOVA 0x10000000 */
/* 6. Get device fd */
int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:03:00.0");
/* 7. Get device info (BARs, interrupts) */
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
/* device_info.num_regions: number of BARs/capabilities */
/* device_info.num_irqs: number of interrupt types */
/* 8. Map BAR 0 (MMIO registers) into process address space */
struct vfio_region_info reg = {
.argsz = sizeof(reg),
.index = VFIO_PCI_BAR0_REGION_INDEX,
};
ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);
void *mmio = mmap(NULL, reg.size, PROT_READ|PROT_WRITE,
MAP_SHARED, device, reg.offset);
/* Now: read/write directly to mmio like a kernel driver would */
SR-IOV: Virtual Functions
SR-IOV (Single Root I/O Virtualization) allows one physical device to appear as multiple virtual functions (VFs):
# Enable SR-IOV (create 4 VFs from one PF):
echo 4 > /sys/bus/pci/devices/0000:03:00.0/sriov_numvfs
# VFs appear as new PCI devices (exact BDFs depend on the PF's SR-IOV
# VF offset/stride — often e.g. 03:10.x; shown as 03:00.1-4 for illustration):
lspci | grep "Virtual Function"
# 03:00.1 VF 1
# 03:00.2 VF 2
# 03:00.3 VF 3
# 03:00.4 VF 4
# Bind each VF to vfio-pci and pass to different VMs:
echo 0000:03:00.1 > /sys/bus/pci/drivers/vfio-pci/bind
# Network SR-IOV: VF gets its own MAC/VLAN, dedicated queue rings
ip link set eth0 vf 0 mac 52:54:00:01:02:03 vlan 100 rate 1000
Mediated devices (mdev)
Some devices support mediated passthrough — the host driver slices the device and presents virtual sub-devices:
# Intel GVT-g: share one GPU across multiple VMs
ls /sys/bus/pci/devices/0000:00:02.0/mdev_supported_types/
# i915-GVTg_V5_4 ← a vGPU type carving out ~1/4 of the GPU's resources
#                  (so up to 4 such instances can share one GPU)
# Create a mediated device instance:
uuidgen # e.g., 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
/sys/bus/pci/devices/0000:00:02.0/mdev_supported_types/i915-GVTg_V5_4/create
# Assign to QEMU:
qemu-system-x86_64 \
-device vfio-pci,sysfsdev=/sys/bus/mdev/devices/83b8f4f2-...
VFIO kernel internals
/* drivers/vfio/vfio.c */
/* The container manages the IOMMU domain: */
struct vfio_container {
struct kref kref;
struct list_head group_list;
struct rw_semaphore group_lock;
struct vfio_iommu_driver *iommu_driver;
void *iommu_data; /* iommu_domain */
bool noiommu;
};
/* Each group connects devices to a container: */
struct vfio_group {
struct iommu_group *iommu_group;
struct vfio_container *container;
struct list_head device_list;
struct cdev cdev; /* /dev/vfio/<id> */
};
/* VFIO PCI device - wraps a PCI device for userspace access: */
struct vfio_pci_core_device {
struct vfio_device vdev;
struct pci_dev *pdev;
void __iomem *barmap[PCI_STD_NUM_BARS]; /* BAR mappings */
struct eventfd_ctx *ctx[VFIO_PCI_NUM_IRQS]; /* IRQ eventfds */
/* ... */
};
DMA mapping path
/* When userspace calls VFIO_IOMMU_MAP_DMA: */
static int vfio_iommu_type1_dma_map(...)
{
/* 1. Pin the userspace pages */
pin_user_pages_fast(vaddr, npage, FOLL_WRITE, pages);
/* 2. Create IOMMU mapping: IOVA → physical pages */
iommu_map(domain, iova, phys_addr, size,
IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE);
/* Now device DMA to IOVA goes through IOMMU → correct physical pages */
/* Without IOMMU: device could DMA anywhere → security hole */
}
Observability
# Check IOMMU is active:
dmesg | grep -i iommu | head -5
# [ 0.234] DMAR: IOMMU enabled
# Check VFIO groups:
ls /dev/vfio/
# vfio 15 23 ...
# IOMMU mappings for a container:
cat /sys/kernel/debug/iommu/intel/iommu_devices 2>/dev/null | head -20
# Performance: look for IOMMU/IOTLB PMU events (names are platform-specific):
perf list 2>/dev/null | grep -i -e iommu -e dmar
Further reading
- KVM Architecture — VM exits and VMCS
- Memory Virtualization — EPT and shadow paging
- IOMMU Architecture — IOMMU hardware and DMA API
- PCI Drivers — How PCI drivers work before VFIO
- drivers/vfio/ — VFIO framework
- drivers/vfio/pci/ — vfio-pci driver
- Documentation/driver-api/vfio.rst