NUMA Automatic Balancing
How the scheduler detects and fixes NUMA locality mismatches
The problem
On NUMA systems, memory access latency depends on which NUMA node holds the physical page:
    Node 0:                        Node 1:
    CPU 0, CPU 1                   CPU 2, CPU 3
    Memory 0 (fast)                Memory 1 (fast)
         │                              │
         └────────── QPI/UPI ───────────┘
              (slow: ~2× penalty)
Task running on CPU 2 reading pages on Node 0:
→ every cache miss → remote memory access → 2× latency!
Automatic NUMA balancing (CONFIG_NUMA_BALANCING) detects this situation and migrates pages to the node where they're being used. Introduced in Linux 3.13 by Mel Gorman (SUSE) et al. — 10fc05d0e551.
How it works
Phase 1: Mark pages as inaccessible (PROT_NONE)
The NUMA balancer periodically scans a process's page tables and changes the PTE of accessed pages to PROT_NONE (unmaps them without freeing):
/* kernel/sched/fair.c */
/*
 * Phase 1 worker: runs in task context (callback_head / task_work hook)
 * and advances a per-task cursor (numa_scan_offset) through the
 * process's virtual address space, marking PTEs PROT_NONE so that the
 * next access raises a NUMA hinting fault (handled by do_numa_page()).
 */
static void task_numa_work(struct callback_head *work)
{
struct task_struct *p = current;
unsigned long start = p->numa_scan_offset;
/* Walk a chunk of VA space, NUMA-fault the PTEs: */
change_prot_numa(p->mm, start, start + NUMA_SCAN_SIZE);
/* Sets PTE to PROT_NONE with _PAGE_PROTNONE (was _PAGE_NUMA pre-3.17) */
p->numa_scan_offset = start + NUMA_SCAN_SIZE;
/* Restart from the bottom once the cursor passes the last mapped VMA */
if (p->numa_scan_offset >= p->mm->highest_vm_end)
p->numa_scan_offset = 0; /* wrap around */
}
Phase 2: Catch NUMA page faults
When the task accesses the PROT_NONE page, a page fault fires:
/* mm/memory.c: do_numa_page() */
/*
 * NUMA hinting fault handler: invoked when a task touches a PTE that
 * task_numa_work() marked PROT_NONE.  Records the fault for placement
 * statistics, migrates the page toward the faulting CPU's node when it
 * is misplaced, and restores the original page protections.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
	struct page *page = vm_normal_page(vmf->vma, vmf->address, vmf->orig_pte);
	int page_nid = page_to_nid(page); /* current NUMA node of page */
	int this_nid = numa_node_id();    /* CPU's NUMA node */

	/* Record the access for statistics */
	task_numa_fault(last_cpupid, page_nid, 1, flags);

	/* Should we migrate this page? */
	if (page_nid != this_nid) {
		/* Migrate the page to this_nid */
		migrate_misplaced_page(page, vmf->vma, this_nid);
	}

	/* Restore the PTE (make it accessible again).
	 * Fix: the original snippet read vma->vm_page_prot through an
	 * undeclared local 'vma'; the VMA lives in vmf->vma. */
	pte_t pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
	set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
	return 0;
}
Phase 3: Task migration
If the task has most of its pages on Node 1 but runs on Node 0, the scheduler may migrate the task instead of (or in addition to) the pages:
/* kernel/sched/fair.c */
/*
 * Evaluate one candidate destination (env->dst_cpu / env->dst_task)
 * for a NUMA task migration or swap.  Keeps the best candidate seen
 * so far in env->best_*; returns true when this candidate improves
 * on the previous best, false otherwise.
 */
static bool task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
/*
 * Compare: task running on preferred_node vs current_node
 * taskimp: improvement score from moving task
 * groupimp: improvement from moving entire task group
 */
/* Tasks sharing memory are scored as a group when one exists */
long imp = env->p->numa_group ? groupimp : taskimp;
if (imp <= env->best_imp)
return false;
/* Better NUMA locality → update best candidate */
env->best_imp = imp;
env->best_task = env->dst_task;
env->best_cpu = env->dst_cpu;
return true;
}
NUMA statistics per task
/* task_struct NUMA fields (excerpt — only the NUMA-balancing members): */
struct task_struct {
/* ... */
int numa_preferred_nid; /* preferred NUMA node */
unsigned long numa_migrate_retry; /* presumably a retry timestamp/throttle — verify against kernel source */
u64 node_stamp; /* last numa balancing work */
u64 last_task_numa_placement; /* time of last placement decision */
u64 last_sum_exec_runtime; /* exec runtime snapshot at last placement */
struct callback_head numa_work; /* task_work hook that runs task_numa_work() */
struct numa_group __rcu *numa_group; /* task group for shared memory */
unsigned long *numa_faults; /* faults per node per access type */
/* layout: [2 * nr_nodes]: [local faults, remote faults] for each node */
/* outer [2]: cpu vs mem accesses */
};
NUMA groups: shared memory balancing
When multiple tasks share memory (e.g., a multithreaded program), they form a NUMA group:
/*
 * Shared-memory fault group: tasks that fault on the same pages are
 * clustered into one numa_group so the scheduler can place the whole
 * group (and its memory) on the same set of nodes.
 */
struct numa_group {
refcount_t refcount; /* group lifetime; freed via RCU (see rcu below) */
spinlock_t lock; /* presumably protects the fault statistics — verify */
int nr_tasks; /* number of member tasks */
pid_t gid; /* group identifier */
int active_nodes; /* nodes considered active for this group */
/* Fault statistics per node: */
unsigned long total_faults;
unsigned long max_faults_cpu;
unsigned long *faults; /* faults[node][type] */
unsigned long *faults_cpu;
struct rcu_head rcu; /* deferred free of the group */
unsigned long nodes[]; /* online nodes participating */
};
/* Tasks in a group are migrated together to preserve shared memory locality */
Balancing scan rate
The scan rate adapts based on how many NUMA faults are found:
/* How often to scan (pages per second): */
/* Controlled by: /proc/sys/kernel/numa_balancing_scan_period_min/max */
/*
 * Adapt the per-task scan period from recent hinting-fault activity:
 * many faults means placement is still wrong, so scan sooner; few
 * faults means placement has converged, so back off.
 *
 * @shared:  recent faults on pages shared with other tasks
 * @private: recent faults on task-private pages
 */
static void update_task_scan_period(struct task_struct *p,
		unsigned long shared, unsigned long private)
{
	/* Fix: the original compared an undeclared 'faults'; the total
	 * fault count is the sum of the two parameters. */
	unsigned long faults = shared + private;
	unsigned int period = p->numa_scan_period;

	/* More faults → scan more aggressively (lower period) */
	/* Fewer faults → scan less often (higher period) */
	if (faults > p->numa_faults_locality[2])
		period = max(period / 2, task_scan_min(p)); /* speed up */
	else
		period = min(period * 2, task_scan_max(p)); /* slow down */

	p->numa_scan_period = period;
}
Configuration
# Enable/disable automatic NUMA balancing:
echo 1 > /proc/sys/kernel/numa_balancing   # enable (default)
echo 0 > /proc/sys/kernel/numa_balancing   # disable

# Scan period (milliseconds) — how often to scan each task's VAS:
cat /proc/sys/kernel/numa_balancing_scan_period_min   # default 1000
cat /proc/sys/kernel/numa_balancing_scan_period_max   # default 60000

# Scan size — megabytes (not bytes!) scanned per interval; note the
# sysctl name carries an explicit _mb suffix:
cat /proc/sys/kernel/numa_balancing_scan_size_mb      # default 256 (MB)

# Tune for a database workload (aggressive):
echo 100  > /proc/sys/kernel/numa_balancing_scan_period_min
echo 5000 > /proc/sys/kernel/numa_balancing_scan_period_max
echo 1024 > /proc/sys/kernel/numa_balancing_scan_size_mb
Observing NUMA balancing
# Per-process NUMA stats:
cat /proc/<pid>/sched # numa_faults, numa_preferred_nid, ...
cat /proc/<pid>/numa_maps
# 7f1234000000 default anon=12 dirty=12 N0=8 N1=4
# ↑N0 ↑N1 = pages on each node
# System-wide NUMA balancing stats:
cat /proc/vmstat | grep numa
# numa_hint_faults 12345 ← PROT_NONE faults triggered
# numa_hint_faults_local 8000 ← faults already on preferred node
# numa_pages_migrated 4345 ← pages migrated
# numa_pte_updates 50000 ← PTEs marked PROT_NONE
# NUMA misses in perf:
perf stat -e dtlb_load_misses.miss_causes_a_walk,node_loads,node_stores \
-p <pid> sleep 5
# numastat: per-node allocation stats
numastat <pid>
numastat -m # system memory per node
# Trace NUMA page fault + migration:
# NOTE(review): on newer kernels the migration symbol was reworked
# (e.g. migrate_misplaced_folio) — check /proc/kallsyms before attaching.
bpftrace -e '
kprobe:do_numa_page {
@numa_faults[tid] = count();
}
kprobe:migrate_misplaced_page {
@migrations = count();
}'
# Check NUMA topology:
numactl --hardware
# available: 2 nodes (0-1)
# node 0 cpus: 0 1 2 3
# node 1 cpus: 4 5 6 7
# node distances:
# node 0 1
# 0: 10 21
# 1: 21 10
When to disable NUMA balancing
# Disable for workloads where it hurts:
# 1. Already NUMA-pinned workloads:
numactl --cpunodebind=0 --membind=0 ./myapp
# (explicit placement beats auto-balancing)
# 2. Latency-sensitive workloads:
# NUMA faults add ~10µs per fault — avoid for RT tasks
echo 0 > /proc/sys/kernel/numa_balancing
# 3. hugetlbfs-heavy workloads:
# hugetlbfs pages are not scanned by the NUMA balancer; handling of
# tmpfs and transparent huge pages varies by kernel version — verify
# for your kernel before assuming no benefit
# 4. Mostly sequential single-node access:
# If the app already accesses data on the local node, balancing adds
# overhead (scan + faults) with no benefit
NUMA-aware memory allocation
For applications that want explicit NUMA control:
/* Link with -lnuma */
#include <numa.h>

/* Always probe library/kernel support before calling numa_* APIs: */
if (numa_available() < 0)
	/* no NUMA support — fall back to plain malloc() */;

/* Allocate on node 0: */
void *mem_node0 = numa_alloc_onnode(size, 0);

/* Allocate on the local node (where this thread runs).
 * Fix: the original defined 'mem' twice — a redefinition error. */
void *mem_local = numa_alloc_local(size);

/* numa_alloc_* memory must be released with numa_free(ptr, size),
 * not free(). */

/* Set memory policy for subsequent allocations: */
struct bitmask *mask = numa_bitmask_alloc(numa_num_configured_nodes());
numa_bitmask_setbit(mask, 0);
numa_set_membind(mask); /* only allocate from node 0 */
numa_bitmask_free(mask); /* the policy is copied; the mask can be freed */
Further reading
- NUMA — NUMA architecture and zonelist
- NUMA Distance — ACPI SRAT and distance matrices
- NUMA Reclaim — reclaim policies per node
- CFS Load Balancing — CPU scheduler load balancing
- NUMA Zonelist — memory allocation fallback
kernel/sched/fair.c — task_numa_work · mm/memory.c — do_numa_page · mm/migrate.c — migrate_misplaced_page