Walking Page Tables Programmatically
pagewalk API, apply_to_page_range, and ptdump — reading page tables from kernel code
Why walk page tables?
The kernel needs to inspect or modify page table entries programmatically in many contexts:
- KVM: scan guest memory to find dirty pages
- Migration: find all PTEs mapping a page before moving it
- Debugging: ptdump dumps page tables to /sys/kernel/debug/kernel_page_tables
- mprotect: change protection bits on a VA range
- userfaultfd: set up write-protection on a range
- NUMA balancing: scan PTEs to detect hot/cold pages across NUMA nodes
The pagewalk API
#include <linux/pagewalk.h>
/* Callbacks invoked as the walker traverses the page table hierarchy */
struct mm_walk_ops {
/* Called for each PGD entry (top level) */
int (*pgd_entry)(pgd_t *pgd, unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for each P4D (5-level paging), or not at all if 4-level */
int (*p4d_entry)(p4d_t *p4d, unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for each PUD entry */
int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for each PMD entry */
int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for each PTE (leaf level; the most commonly used callback) */
int (*pte_entry)(pte_t *pte, unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for holes: no VMA, or an empty entry at the given depth */
int (*pte_hole)(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk);
/* Called for each hugetlb PTE; hmask is the huge page mask */
int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk);
/* Called for each VMA before walking its range */
int (*pre_vma)(unsigned long start, unsigned long end,
struct mm_walk *walk);
/* Called for each VMA after walking its range */
void (*post_vma)(struct mm_walk *walk);
/* Decide whether to walk a VMA: return 0 to walk it, a positive
* value to skip it, or a negative errno to abort the whole walk */
int (*test_walk)(unsigned long start, unsigned long end,
struct mm_walk *walk);
};
/* mm_walk carries context through the callbacks */
/* NOTE(review): simplified — the real struct has a few more fields
 * (e.g. the per-VMA 'action'); confirm against include/linux/pagewalk.h */
struct mm_walk {
const struct mm_walk_ops *ops; /* callback table (above) */
struct mm_struct *mm; /* address space being walked */
pgd_t *pgd; /* current PGD being walked */
struct vm_area_struct *vma; /* current VMA (NULL for novma walks) */
void *private; /* caller private data */
bool no_vma; /* walk without VMA lookup (kernel mappings) */
};
Walk a user address range
/* ->pte_entry callback: bump the caller's counter for every
 * resident (present) PTE; the counter arrives via walk->private. */
static int count_present_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        long *count = walk->private;

        if (!pte_present(*pte))
                return 0;

        (*count)++;
        return 0;
}
/* Only ->pte_entry is set; intermediate levels are traversed silently. */
static const struct mm_walk_ops count_ops = {
.pte_entry = count_present_pte,
};
/*
 * Count resident (present) pages in [start, end) of @mm.
 *
 * walk_page_range() requires the mmap lock to be held; read mode is
 * sufficient since the walk only reads PTEs.
 */
long count_resident_pages(struct mm_struct *mm, unsigned long start,
                          unsigned long end)
{
        long count = 0;

        mmap_read_lock(mm);
        walk_page_range(mm, start, end, &count_ops, &count);
        mmap_read_unlock(mm);

        return count;
}
Walk the kernel address space
/* Walk kernel page tables (no mm, no VMA): */
/* Print one present kernel PTE: VA, raw entry, PFN, and W/X/D flags. */
static int dump_pte_entry(pte_t *pte, unsigned long addr,
                          unsigned long next, struct mm_walk *walk)
{
        if (!pte_present(*pte))
                return 0;

        /* NOTE(review): pte_exec() is not provided by every arch
         * (x86 has no generic pte_exec) — confirm for the target. */
        pr_info(" %016lx: PTE=%016llx pfn=%lx %s%s%s\n",
                addr, (u64)pte_val(*pte),
                pte_pfn(*pte),
                pte_write(*pte) ? "W" : "r",
                pte_exec(*pte) ? "X" : "-",
                pte_dirty(*pte) ? "D" : "-");
        return 0;
}
/* Leaf PTEs only; holes and table entries are not reported. */
static const struct mm_walk_ops kernel_walk_ops = {
.pte_entry = dump_pte_entry,
};
/* Walk the kernel vmalloc range (no VMAs, hence the _novma variant).
 * NOTE(review): walk_page_range_novma() asserts the caller holds
 * init_mm's mmap lock — confirm read vs. write lock for your kernel
 * version before copying this. */
walk_page_range_novma(&init_mm, VMALLOC_START, VMALLOC_END,
&kernel_walk_ops, NULL, NULL);
Modifying PTEs during the walk
/* Write-protect a range (used by userfaultfd, KVM dirty tracking): */
/*
 * ->pte_entry callback: write-protect one present, writable PTE.
 *
 * Locking: the pagewalk core (walk_pte_range() in mm/pagewalk.c) maps
 * the PTE table with pte_offset_map_lock() BEFORE invoking ->pte_entry,
 * so the PTE page-table lock is already held here.  Taking
 * pte_lockptr() again inside the callback would self-deadlock — do not
 * lock here.
 */
static int wp_pte(pte_t *pte, unsigned long addr,
                  unsigned long next, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        if (pte_present(*pte) && pte_write(*pte)) {
                /* start/commit pair performs the read-modify-write
                 * safely against concurrent hardware A/D-bit updates. */
                pte_t old_pte = ptep_modify_prot_start(vma, addr, pte);
                pte_t new_pte = pte_wrprotect(old_pte);

                ptep_modify_prot_commit(vma, addr, pte, old_pte, new_pte);
                /* A stale writable translation may be cached: flush it. */
                flush_tlb_page(vma, addr);
        }
        return 0;
}
apply_to_page_range: set up mappings
apply_to_page_range allocates page table pages as needed and calls a function on each PTE:
/* Signature — note that pte_fn_t (since v5.3) is:
 *     int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 * i.e. there is no "next" argument, unlike the pagewalk callbacks: */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data);
/* Example: map a range of physical pages into kernel vmalloc space */
static int set_pte_fn(pte_t *pte, unsigned long addr,
unsigned long next, void *data)
{
phys_addr_t phys = *(phys_addr_t *)data + (addr - start_addr);
pte_t new_pte = pfn_pte(phys >> PAGE_SHIFT,
PAGE_KERNEL); /* RW, NX */
set_pte_at(&init_mm, addr, pte, new_pte);
return 0;
}
apply_to_page_range(&init_mm, vaddr, size, set_pte_fn, &phys_base);
ptdump: debugging page tables
# Kernel page table dump (requires CONFIG_PTDUMP_DEBUGFS=y):
cat /sys/kernel/debug/kernel_page_tables | head -50
# 0xffff888000000000-0xffffc87fffffffff 131072G PGD RW NX SZ=1G
# 0xffffc90000000000-0xffffe8ffffffffff 32768G PGD RW NX SZ=2M
# 0xffffea0000000000-0xffffeb7fffffffff 3072G PGD RW NX SZ=2M dirty
# 0xffffffff80000000-0xffffffff9fffffff 512M PGD ro X SZ=2M (kernel text)
# 0xffffffff9fffffff-0xffffffffa0000000 0 PGD (hole)
# 0xffffffffa0000000-0xffffffffe0000000      1G PGD RW NX (modules)
# User process page table:
cat /sys/kernel/debug/page_tables/pid_<PID>   # NOTE: per-process ptdump is not a mainline interface — verify your kernel actually provides it
# Decode a page table entry manually:
python3 -c "
pte = 0x8000000004021025 # from ptdump
pfn = (pte >> 12) & ((1 << 40) - 1)  # PFN occupies bits 12-51 on x86-64 (40 bits)
present = pte & 1
rw = (pte >> 1) & 1
user = (pte >> 2) & 1
nx = pte >> 63
print(f'PFN: {pfn:#x} present={present} rw={rw} user={user} NX={nx}')
"
PTE bit manipulation functions
/* Read PTE properties (all take the pte by value): */
pte_present(pte) /* page is in RAM (P bit set) */
pte_write(pte) /* writable */
pte_exec(pte) /* executable — NOTE(review): not every arch provides pte_exec(); x86 does not */
pte_dirty(pte) /* page was written (hardware sets this) */
pte_young(pte) /* page was accessed (hardware sets this) */
pte_soft_dirty(pte) /* software dirty tracking bit */
pte_special(pte) /* special mapping (no struct page) */
pte_devmap(pte) /* device memory (ZONE_DEVICE) */
pte_pfn(pte) /* physical frame number */
/* Modify PTE properties (each returns the new pte; none writes it back): */
pte_mkwrite(pte) /* set writable — NOTE(review): takes a vma argument since v6.6 */
pte_wrprotect(pte) /* clear writable */
pte_mkdirty(pte) /* set dirty */
pte_mkclean(pte) /* clear dirty */
pte_mkyoung(pte) /* set accessed */
pte_mkold(pte) /* clear accessed */
pte_mkspecial(pte) /* set special */
/* Create a PTE from a PFN and protection flags: */
pte_t pte = pfn_pte(pfn, PAGE_KERNEL); /* kernel RW, NX */
pte_t pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); /* kernel RW + exec (not read-only) */
pte_t pte = pfn_pte(pfn, PAGE_SHARED); /* user RW */
/* Get the struct page from a PTE: */
struct page *page = pte_page(pte); /* via pfn_to_page() */
Huge page PTEs
/* PMD-level huge page (2MB): */
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_large(*pmd)) { /* NOTE(review): renamed pmd_leaf() in recent kernels */
/* This is a 2MB leaf entry, not a pointer to a PTE table */
phys_addr_t phys = (phys_addr_t)pmd_pfn(*pmd) << PAGE_SHIFT;
phys += addr & ~PMD_MASK; /* offset within the 2MB page */
}
/* PUD-level huge page (1GB): */
pud_t *pud = pud_offset(p4d, addr);
if (pud_large(*pud)) { /* NOTE(review): renamed pud_leaf() in recent kernels */
/* 1GB leaf entry */
phys_addr_t phys = (phys_addr_t)pud_pfn(*pud) << PAGE_SHIFT;
phys += addr & ~PUD_MASK; /* offset within the 1GB page */
}
NUMA balancing page table scan
The NUMA balancer uses page table walking to find pages that should be migrated:
/* kernel/sched/fair.c (simplified) */
static void task_numa_work(struct callback_head *work)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
unsigned long start = p->mm->numa_scan_offset;
/* Walk a chunk of the process address space: */
walk_page_range(mm, start, start + NUMA_SCAN_SIZE,
&numa_walk_ops, p);
/* numa_walk_ops.pte_entry does:
* - pte_mkold() to clear the Accessed bit
* - On next access, CPU sets Accessed bit → NUMA fault
* - task_numa_fault() records the NUMA node of the access
* - Scheduler uses this to prefer running the task
* near its hot pages
*/
}
Further reading
- Page Tables — page table structure and format
- Page Fault Handler — how faults are resolved
- NUMA — NUMA balancing using page walk
- KVM Memory — dirty page tracking
- userfaultfd — write-protect via page walk
- mm/pagewalk.c — pagewalk implementation
- arch/x86/mm/dump_pagetables.c — ptdump debugfs