userfaultfd: Userspace Page Fault Handling
Let userspace handle page faults — the mechanism behind QEMU live migration and CRIU
What problem does userfaultfd solve?
Normally, page faults are handled by the kernel: a missing page triggers an anonymous allocation, swap-in, or file read. But some applications need to intercept page faults — to provide pages from their own storage or to implement copy-on-write in userspace.
Use cases: - QEMU live migration: receive guest RAM pages on-the-fly as the guest accesses them - CRIU (Checkpoint/Restore In Userspace): restore process memory lazily - Hypervisors: implement demand-paging for guest memory from userspace - Garbage collectors: implement generational GC barriers without mprotect overhead - Sandboxes: provide deterministic memory contents for testing
API overview
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>
#include <fcntl.h>      /* O_CLOEXEC, O_NONBLOCK */
#include <poll.h>
/* 1. Create userfaultfd */
int uf_fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
/* 2. Check API version */
struct uffdio_api api = {
.api = UFFD_API,
.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
UFFD_FEATURE_MINOR_FAULTS,
};
ioctl(uf_fd, UFFDIO_API, &api);
/* api.features now has supported features */
/* api.ioctls has available ioctls */
/* 3. Register a memory range to intercept */
struct uffdio_register reg = {
.range.start = (unsigned long)addr,
.range.len = size,
.mode = UFFDIO_REGISTER_MODE_MISSING, /* intercept missing pages */
};
ioctl(uf_fd, UFFDIO_REGISTER, &reg);
/* 4. Poll for faults in a thread */
struct pollfd pfd = { .fd = uf_fd, .events = POLLIN };
while (1) {
poll(&pfd, 1, -1);
struct uffd_msg msg;
read(uf_fd, &msg, sizeof(msg));
if (msg.event == UFFD_EVENT_PAGEFAULT) {
unsigned long fault_addr = msg.arg.pagefault.address;
unsigned long flags = msg.arg.pagefault.flags;
/* UFFD_PAGEFAULT_FLAG_WRITE: write fault */
/* UFFD_PAGEFAULT_FLAG_WP: write-protect fault */
/* Resolve: provide a zero page */
struct uffdio_zeropage zp = {
.range.start = fault_addr & ~(PAGE_SIZE - 1),
.range.len = PAGE_SIZE,
};
ioctl(uf_fd, UFFDIO_ZEROPAGE, &zp);
}
}
Fault resolution methods
UFFDIO_COPY — install a page with data
static char page_data[PAGE_SIZE];
/* Fill page_data with the content you want */
struct uffdio_copy copy = {
.dst = fault_addr & ~(PAGE_SIZE - 1),
.src = (unsigned long)page_data,
.len = PAGE_SIZE,
.mode = 0, /* or UFFDIO_COPY_MODE_DONTWAKE if you'll wake later */
};
ioctl(uf_fd, UFFDIO_COPY, &copy);
/* Wakes the faulting thread */
UFFDIO_ZEROPAGE — install a zero page
struct uffdio_zeropage zp = {
.range.start = fault_addr & ~(PAGE_SIZE - 1),
.range.len = PAGE_SIZE,
.mode = 0,
};
ioctl(uf_fd, UFFDIO_ZEROPAGE, &zp);
UFFDIO_CONTINUE — resolve minor fault (for shared mappings)
/* Minor fault: page exists in page cache but not in PTE */
struct uffdio_continue cont = {
.range.start = fault_addr & ~(PAGE_SIZE - 1),
.range.len = PAGE_SIZE,
.mode = 0,
};
ioctl(uf_fd, UFFDIO_CONTINUE, &cont);
UFFDIO_WRITEPROTECT — write-protect pages to catch writes
/* Write-protect a region (triggers WP fault on write) */
struct uffdio_writeprotect wp = {
.range.start = (unsigned long)addr,
.range.len = size,
.mode = UFFDIO_WRITEPROTECT_MODE_WP, /* enable WP */
};
ioctl(uf_fd, UFFDIO_WRITEPROTECT, &wp);
/* Later, when WP fault arrives (UFFD_PAGEFAULT_FLAG_WP set): */
wp.mode = 0; /* remove WP — allow future writes */
ioctl(uf_fd, UFFDIO_WRITEPROTECT, &wp);
Registration modes
/* UFFDIO_REGISTER_MODE_MISSING: intercept missing page faults */
reg.mode = UFFDIO_REGISTER_MODE_MISSING;
/* UFFDIO_REGISTER_MODE_WP: intercept write-protect faults */
reg.mode = UFFDIO_REGISTER_MODE_WP;
/* Both (Linux 5.7+): */
reg.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP;
/* UFFDIO_REGISTER_MODE_MINOR (Linux 5.13+): for shmem/hugetlb minor faults */
reg.mode = UFFDIO_REGISTER_MODE_MINOR;
Complete example: lazy page provider
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>
#include <fcntl.h>      /* O_CLOEXEC, O_NONBLOCK */
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define SIZE (64 * 1024 * 1024) /* 64MB */
static int uf_fd;
static void *region;
/*
 * Fault-handler thread: blocks on uf_fd, reads page-fault events, and
 * resolves each one by installing a freshly generated 4KB page with
 * UFFDIO_COPY. Runs until read() fails (fd closed) or poll() errors.
 */
static void *fault_handler_thread(void *arg)
{
    struct pollfd pfd = { .fd = uf_fd, .events = POLLIN };
    static char page[4096];     /* staging buffer handed to UFFDIO_COPY */

    (void)arg;                  /* unused */
    while (1) {
        /* Wait until the kernel queues a fault message on uf_fd */
        if (poll(&pfd, 1, -1) < 0)
            break;
        struct uffd_msg msg;
        if (read(uf_fd, &msg, sizeof(msg)) != sizeof(msg))
            break;              /* fd closed or short read: stop handling */
        if (msg.event != UFFD_EVENT_PAGEFAULT)
            continue;           /* ignore non-fault events in this example */
        /* Round the faulting address down to its 4KB page boundary */
        unsigned long page_addr = msg.arg.pagefault.address & ~0xFFFUL;
        unsigned long page_index = (page_addr - (unsigned long)region) / 4096;
        /* Generate the page content on-demand */
        memset(page, 0, sizeof(page));
        snprintf(page, 64, "Page %lu: generated on demand", page_index);
        struct uffdio_copy copy = {
            .dst = page_addr,
            .src = (unsigned long)page,
            .len = sizeof(page), /* was PAGE_SIZE — undefined in userspace */
            .mode = 0,           /* wake the faulting thread immediately */
        };
        /* Installs the page into the faulting VMA and wakes the faulter */
        if (ioctl(uf_fd, UFFDIO_COPY, &copy) < 0)
            perror("UFFDIO_COPY");
    }
    return NULL;
}
/*
 * Demo driver: maps 64MB of anonymous memory, registers it with a
 * userfaultfd in MISSING mode, starts the handler thread, then touches
 * every 1024th page so each first access is resolved on-demand.
 */
int main(void)
{
    /* Create a large anonymous mapping (no physical pages yet) */
    region = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (region == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    /* Create userfaultfd (may require privilege; see
     * /proc/sys/vm/unprivileged_userfaultfd on newer kernels) */
    uf_fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uf_fd < 0) {
        perror("userfaultfd");
        return 1;
    }
    /* Handshake: agree on the API version before any other uffd ioctl */
    struct uffdio_api api = { .api = UFFD_API };
    if (ioctl(uf_fd, UFFDIO_API, &api) < 0) {
        perror("UFFDIO_API");
        return 1;
    }
    /* Register the entire region for missing-page interception
     * (original text was garbled: "®" was "&reg") */
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)region, .len = SIZE },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    if (ioctl(uf_fd, UFFDIO_REGISTER, &reg) < 0) {
        perror("UFFDIO_REGISTER");
        return 1;
    }
    /* Start fault handler thread */
    pthread_t thr;
    if (pthread_create(&thr, NULL, fault_handler_thread, NULL) != 0) {
        perror("pthread_create");
        return 1;
    }
    /* Access memory — each first touch faults and is resolved by the
     * handler thread via UFFDIO_COPY */
    for (int i = 0; i < SIZE / 4096; i += 1024) {
        char *p = (char *)region + i * 4096;
        printf("Reading page %d: '%s'\n", i, p);
    }
    return 0;
}
QEMU live migration with userfaultfd
QEMU uses userfaultfd for post-copy live migration: start the destination VM before all memory has arrived.
Source VM Destination VM
─────────────────────────────────────────────────────
QEMU registers guest RAM with uffd
Transfer CPU state + a few pages ────►
Start guest execution
↓
Guest accesses page not yet received
→ userfaultfd fault event
↓
Fault handler requests page from source
◄────── request page N
Transfer page N ──────────────────────►
uffd_copy resolves fault
Guest continues
(background: transfer remaining pages)
The key benefit: the destination VM starts running quickly, with only hot pages needing to be fetched on-demand.
Kernel implementation
Fault interception
/* mm/userfaultfd.c */
/*
* Called from handle_mm_fault() when a page is missing.
* Returns VM_FAULT_RETRY if userfaultfd will handle it.
*/
/*
 * NOTE(review): simplified sketch, not verbatim kernel code — the real
 * implementation lives in fs/userfaultfd.c, not mm/userfaultfd.c.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
/* 'mm' is unused in this sketch; the real code takes mmap locks on it */
struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct uffd_msg msg = {};
/* Per-VMA context, attached at UFFDIO_REGISTER time */
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
return 0; /* no uffd registered for this VMA */
/* Build the fault message: translate kernel fault reason/flags into
 * the uABI flag bits userspace sees in uffd_msg.arg.pagefault.flags */
msg.event = UFFD_EVENT_PAGEFAULT;
msg.arg.pagefault.flags = 0;
if (reason & VM_UFFD_WP)
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
if (vmf->flags & FAULT_FLAG_WRITE)
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
msg.arg.pagefault.address = vmf->address;
/* Enqueue the fault message — the uffd file becomes readable.
 * NOTE(review): 'ewq' (the wait-queue entry wrapping 'msg') is elided
 * from this sketch; see the real code for its setup and the sleep. */
userfaultfd_event_wait_completion(ctx, &ewq);
/*
 * The fault-handler thread has resolved the fault via UFFDIO_COPY
 * or UFFDIO_ZEROPAGE. Now retry the fault — the page is present.
 */
return VM_FAULT_RETRY;
}
struct userfaultfd_ctx
/*
 * One context per userfaultfd file descriptor; every VMA registered
 * through that fd points back at it (vma->vm_userfaultfd_ctx.ctx,
 * as read in handle_userfault above).
 */
struct userfaultfd_ctx {
wait_queue_head_t fault_pending_wqh; /* faults waiting to be read */
wait_queue_head_t fault_wqh; /* faults waiting for resolution */
wait_queue_head_t event_wqh; /* non-fault events (fork, remap) */
wait_queue_head_t fd_wqh; /* fd poll wait queue */
seqcount_spinlock_t refile_seq; /* presumably guards moving entries between the two fault queues — verify against fs/userfaultfd.c */
unsigned long features; /* UFFD_FEATURE_* bits negotiated via UFFDIO_API */
bool released; /* set when the uffd file is closed */
atomic_t mmap_changing; /* NOTE(review): nonzero while the address space is being mutated (fork/mremap) — confirm exact semantics in kernel source */
struct mm_struct *mm; /* address space this context monitors */
};
Events beyond page faults
userfaultfd can also deliver non-fault events when the monitored region changes:
/* Features to enable non-fault events: */
api.features = UFFD_FEATURE_EVENT_FORK | /* fork() duplicates VMA */
UFFD_FEATURE_EVENT_REMAP | /* mremap() */
UFFD_FEATURE_EVENT_REMOVE | /* madvise(MADV_REMOVE) */
UFFD_FEATURE_EVENT_UNMAP; /* munmap() */
/* In the handler: */
switch (msg.event) {
case UFFD_EVENT_FORK:
/* msg.arg.fork.ufd = new uffd for child process */
break;
case UFFD_EVENT_REMAP:
/* msg.arg.remap.from/to/len */
break;
case UFFD_EVENT_REMOVE:
case UFFD_EVENT_UNMAP:
/* msg.arg.remove.start/end */
break;
}
Performance considerations
userfaultfd overhead:
- Per fault: ~1-10µs (syscall + thread wakeup + copy + wakeup)
- UFFDIO_COPY for 1 page: ~5µs
- vs. normal page fault: ~1µs (anon) to 1ms+ (from disk)
Optimization techniques:
1. Batch: use UFFDIO_COPY with larger ranges when possible
2. Async wake: UFFDIO_COPY_MODE_DONTWAKE, then UFFDIO_WAKE
3. Multiple handler threads: each reads from the same uf_fd
4. Huge pages: register with huge page alignment, copy 2MB at a time
# Monitor userfaultfd activity
bpftrace -e '
kprobe:handle_userfault {
@[kstack] = count();
}'
# Check if a process uses userfaultfd
ls -la /proc/<pid>/fd/ | grep userfaultfd
# Trace uffd ioctls
bpftrace -e '
tracepoint:syscalls:sys_enter_ioctl
/args->fd > 0 && (args->cmd == 0xc018aa03 || args->cmd == 0xc028aa04)/
{ printf("UFFDIO from pid %d\n", pid); }'
Further reading
- Page Fault Handler — handle_mm_fault, the entry point for userfaultfd
- File-backed mmap — mmap fundamentals
- QEMU — live migration using userfaultfd
- CRIU documentation — restore using lazy page transfer
- mm/userfaultfd.c — kernel implementation
- tools/testing/selftests/mm/userfaultfd.c — kernel self-test and usage example