x86 Exception and Interrupt Handling
IDT, exception entry, hardware exceptions, and the page fault handler path
x86 exception fundamentals
On x86, all exceptions and interrupts are delivered via the Interrupt Descriptor Table (IDT). The IDT has 256 entries:
IDT entry layout (gate descriptor, 16 bytes on x86-64):
┌──────────────────────────────────────────────────┐
│ Offset[63:32] │
├──────────────────────────────────────────────────┤
│ Reserved │ Offset[31:16] │
├───────────┬───────────────┼──────────────────────┤
│ P DPL S │ Type │ Segment Selector │
├───────────┴───────────────┴──────────────────────┤
│ Offset[15:0] │
└──────────────────────────────────────────────────┘
P = present
DPL = descriptor privilege level (0=kernel, 3=user)
Type = 0xE: interrupt gate (clears IF on entry); 0xF: trap gate (leaves IF unchanged)
Entries 0-31 are reserved for CPU exceptions. Entries 32-255 are for external interrupts and software traps (syscalls use int 0x80 historically, or SYSCALL instruction on x86-64).
CPU exception vectors
| Vector | Mnemonic | Name | Error code | Fault/Trap/Abort |
|---|---|---|---|---|
| 0 | #DE | Divide Error | No | Fault |
| 1 | #DB | Debug | No | Fault/Trap |
| 2 | NMI | Non-Maskable Interrupt | No | Interrupt |
| 3 | #BP | Breakpoint | No | Trap |
| 4 | #OF | Overflow | No | Trap |
| 5 | #BR | BOUND Range Exceeded | No | Fault |
| 6 | #UD | Invalid Opcode | No | Fault |
| 7 | #NM | Device Not Available | No | Fault |
| 8 | #DF | Double Fault | Zero | Abort |
| 10 | #TS | Invalid TSS | Yes | Fault |
| 11 | #NP | Segment Not Present | Yes | Fault |
| 12 | #SS | Stack-Segment Fault | Yes | Fault |
| 13 | #GP | General Protection | Yes | Fault |
| 14 | #PF | Page Fault | Yes | Fault |
| 16 | #MF | x87 FPU Error | No | Fault |
| 17 | #AC | Alignment Check | Zero | Fault |
| 18 | #MC | Machine Check | No | Abort |
| 19 | #XM | SIMD FP Exception | No | Fault |
| 20 | #VE | Virtualization Exception | No | Fault |
Fault: saved CS:RIP points to the faulting instruction (re-executed after handler). Trap: saved CS:RIP points to the instruction after the trap. Abort: unrecoverable; saved state may be corrupted.
IDT initialization
/* arch/x86/kernel/idt.c */
/* IDT gate descriptor in kernel code */
/* One IDT gate, in the form kernel code uses before packing it into the
 * 16-byte hardware descriptor. */
struct idt_data {
unsigned int vector; /* IDT vector number (0-255) */
unsigned int segment; /* code-segment selector placed in the gate */
struct idt_bits bits; /* gate attribute bits (type/DPL/IST/present) — see struct idt_bits */
const void *addr; /* handler entry point (asm stub address) */
};
/*
 * Default exception gates.  NOTE(review): despite the original "early"
 * comment, def_idts is the full default table installed later by
 * idt_setup_traps(); a smaller early_idts table (not shown here) is
 * what the pre-C early setup uses — confirm against arch/x86/kernel/idt.c.
 */
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, asm_exc_divide_error),
INTG(X86_TRAP_NMI, asm_exc_nmi),
SYSG(X86_TRAP_BP, asm_exc_int3), /* DPL=3: user can trigger INT3 */
SYSG(X86_TRAP_OF, asm_exc_overflow), /* DPL=3: user can trigger INTO */
INTG(X86_TRAP_UD, asm_exc_invalid_op),
INTG(X86_TRAP_NM, asm_exc_device_not_available),
INTG(X86_TRAP_GP, asm_exc_general_protection),
INTG(X86_TRAP_PF, asm_exc_page_fault),
/* ... */
};
/*
 * Install the early trap gates and point IDTR at idt_table.
 * NOTE(review): this installs early_idts — a smaller table not shown
 * above — not the def_idts table; verify against arch/x86/kernel/idt.c.
 */
void __init idt_setup_early_traps(void)
{
idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), true);
load_idt(&idt_descr); /* LIDT instruction */
}
The LIDT instruction loads the IDT register (IDTR) with the base address and limit of the IDT table.
Exception entry: hardware behavior
When the CPU delivers an exception to ring 0 (from ring 3 userspace):
Hardware actions (automatic, before any software):
1. Look up IDT[vector] → get handler address and DPL
2. Privilege check for software-invoked gates (INT n / INT3 / INTO): CPL must be ≤ gate DPL, which prevents user code from invoking kernel-only gates. Hardware-generated exceptions and external interrupts skip this gate-DPL check.
3. Switch to kernel stack (via TSS.RSP0 for ring 0 target)
4. Push on new stack:
[SS] ← old user SS
[RSP] ← old user RSP
[RFLAGS] ← old RFLAGS
[CS] ← old user CS
[RIP] ← address of faulting instruction (fault) or next (trap)
[Error Code] ← for exceptions with error codes (GP, PF, etc.)
5. Clear IF (interrupt gate): disables maskable interrupts
6. Jump to handler address from IDT entry
For ring-0 → ring-0 exceptions (kernel taking a fault): - No stack switch — execution stays on the current kernel stack (unless the gate specifies an IST entry) - On x86-64, the CPU still pushes SS/RSP/RFLAGS/CS/RIP (plus the error code, if applicable) even for same-privilege exceptions — unlike 32-bit x86, which omits SS/ESP in that case
Exception entry assembly (x86-64)
Linux uses DEFINE_IDTENTRY_* macros to generate exception entry stubs:
/* arch/x86/include/asm/idtentry.h */
/* Generates the asm entry stub + C handler declaration */
/*
 * Expands to the C handler signature for an exception that pushes an
 * error code.  "noinstr" keeps instrumentation out of the fragile entry
 * path; "__visible" keeps the symbol alive for the asm stub that calls it.
 */
#define DEFINE_IDTENTRY_ERRORCODE(func) \
__visible noinstr void func(struct pt_regs *regs, long error_code)
/* arch/x86/kernel/traps.c */
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
struct task_struct *tsk = current;
unsigned long condition = error_code;
char *desc;
cond_local_irq_enable(regs);
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
return;
}
if (v8086_mode(regs)) {
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
return;
}
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
tsk->comm, task_pid_nr(tsk),
regs->ip, regs->sp, error_code);
}
force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)0);
}
struct pt_regs
All exception handlers receive a struct pt_regs that captures the full register state:
/* arch/x86/include/asm/ptrace.h */
/* Register snapshot built on the kernel stack at exception/syscall entry. */
struct pt_regs {
/* Callee-saved registers (pushed by the entry asm) */
unsigned long r15, r14, r13, r12;
unsigned long bp;
unsigned long bx;
/*
 * Caller-saved registers (also pushed by the entry asm).
 * NOTE(review): only r8/r9 (and r10 in the syscall ABI) carry
 * arguments; r11 is scratch — SYSCALL clobbers it with RFLAGS.
 */
unsigned long r11, r10, r9, r8;
unsigned long ax, cx, dx, si, di;
/*
 * This slot holds the CPU-pushed error code for exceptions that have
 * one, or the syscall number (stored by software on syscall entry).
 */
unsigned long orig_ax; /* syscall number or error code */
/* The iret frame proper: pushed by the CPU on exception entry */
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
};
The entry assembly (arch/x86/entry/entry_64.S) saves all registers to build this structure before calling the C handler.
Task State Segment (TSS)
The TSS is critical for stack switching on exception delivery:
/* arch/x86/include/asm/processor.h */
/* Hardware TSS layout (64-bit mode) — field order/packing is fixed by the CPU. */
struct x86_hw_tss {
u32 reserved1;
u64 sp0; /* RSP0: kernel stack pointer (used for ring-0 target) */
u64 sp1; /* ring-1 target stack (no ring-1 use visible here) */
u64 sp2; /* ring-2 target stack; reused as a scratch slot for user RSP on SYSCALL entry (see entry_SYSCALL_64) */
u64 reserved2;
u64 ist[7]; /* IST1-IST7: Interrupt Stack Table entries */
u32 reserved3;
u32 reserved4;
u16 reserved5;
u16 io_bitmap_base; /* offset from TSS base to the I/O permission bitmap */
} __attribute__((packed)); /* packed: must match the hardware layout byte-for-byte */
IST (Interrupt Stack Table): Some critical exceptions use dedicated stacks from IST to handle faults even when the regular kernel stack is corrupted:
| Exception | IST | Reason |
|---|---|---|
| #DF (double fault) | IST1 | Stack pointer might be corrupted |
| NMI | IST2 | Can interrupt any code including exception handlers |
| #DB (debug) | IST3 | Can fire in weird contexts |
| #MC (machine check) | IST4 | Hardware may be completely broken |
(IST slot assignments as used by Linux — see IST_INDEX_* in arch/x86; the numbering itself is an OS choice, not fixed by hardware.)
Page fault handler (#PF)
The page fault is the most important exception in a running system:
Error code bits
#PF error code:
bit 0: P — 0=non-present, 1=protection violation
bit 1: W/R — 0=read, 1=write
bit 2: U/S — 0=supervisor (kernel), 1=user
bit 3: RSVD — reserved bit violation
bit 4: I/D — 0=data, 1=instruction fetch
bit 5: PK — protection key violation
bit 6: SS — shadow-stack access violation (CET)
bit 15: SGX — SGX-related fault (bit 15, not bit 6)
Handler entry
/* arch/x86/mm/fault.c */
/*
* CR2 contains the faulting linear address.
* Called from asm_exc_page_fault after saving pt_regs.
*/
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
unsigned long address = read_cr2(); /* faulting virtual address */
irqentry_state_t state;
prefetchw(¤t->mm->mmap_lock);
/*
* KVM: notify before running page fault handler.
* Allows hypervisor to handle EPT-related faults first.
*/
if (kvm_handle_async_pf(regs, (u32)error_code))
return;
state = irqentry_enter(regs);
instrumentation_begin();
handle_page_fault(regs, error_code, address);
instrumentation_end();
irqentry_exit(regs, state);
}
Fault classification
/*
 * Top-level #PF dispatch.  The split is on the faulting *address*, not
 * on the CPU mode: a kernel access to a user address still goes through
 * the VMA-based user path.
 */
static noinline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
/* Fault on a kernel-space address? (fixup table / kfence / vmalloc) */
if (unlikely(fault_in_kernel_space(address))) {
do_kern_addr_fault(regs, error_code, address);
return;
}
/* Fault from user mode or kernel accessing user memory */
do_user_addr_fault(regs, error_code, address);
}
User address fault path
/*
 * Resolve a fault on a user-space address: find the covering VMA, grow
 * the stack if the address sits just below a VM_GROWSDOWN mapping, then
 * hand off to the generic handle_mm_fault().
 */
static inline void
do_user_addr_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
/* No mm (e.g. kernel thread): a user-address fault cannot be resolved */
if (unlikely(!mm)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
/* Translate hardware error-code bits into generic fault flags */
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
if (error_code & X86_PF_USER)
flags |= FAULT_FLAG_USER;
/* Acquire mmap_lock (shared for reads, exclusive if needed) */
if (unlikely(!mmap_read_trylock(mm))) {
/* If in interrupt context or killed, can't wait */
if (unlikely(faulthandler_disabled() || !user_mode(regs))) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
/* Safe to sleep here: take the lock the slow way */
mmap_read_lock(mm);
}
/* Find the VMA containing the faulting address */
vma = find_vma(mm, address);
if (unlikely(!vma)) {
/* NOTE(review): this path assumes bad_area() does NOT drop
 * mmap_lock itself (the unlock happens at "done:" below);
 * upstream __bad_area() unlocks internally — confirm against
 * arch/x86/mm/fault.c before reusing this flow. */
bad_area(regs, error_code, address, NULL);
goto done;
}
/* find_vma returns the first VMA ending above address; if the VMA
 * also starts at or below it, the address is inside the mapping */
if (likely(vma->vm_start <= address))
goto good_area;
/* Address is in a hole between VMAs — check if it's a stack growth */
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address, vma);
goto done;
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address, vma);
goto done;
}
good_area:
/* Resolve the fault: allocate, COW, or swap in the page as needed */
fault = handle_mm_fault(vma, address, flags, regs);
/* ... */
done:
mmap_read_unlock(mm);
}
Kernel address fault path
/*
 * Fault on a kernel-space address.  No VMA lookup: kernel faults are
 * resolved by the exception fixup table, KFENCE, or the lazy vmalloc
 * path — anything else is an oops.
 */
static noinline void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
/* Check the fixup table: expected faults in kernel code */
if (fixup_exception(regs, X86_TRAP_PF, hw_error_code, address))
return;
/* KASAN/KFENCE shadow memory? */
if (kfence_handle_page_fault(address, hw_error_code & X86_PF_WRITE, regs))
return;
/* vmalloc fault? (lazy mapping, VMALLOC_START to VMALLOC_END)
 * Only plausible for a plain supervisor access to a present-bit miss:
 * RSVD, USER, or PK set would rule out a missing vmalloc PGD entry */
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PK))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Oops: unexpected kernel fault */
bad_area_nosemaphore(regs, hw_error_code, address);
}
Exception fixup table
The kernel's exception fixup table maps faulting instruction addresses to recovery code. Used for copy_from_user/copy_to_user:
/* arch/x86/lib/usercopy_64.S */
/* If this copy instruction faults: */
/* Label 1: the instruction that may fault on a bad user pointer */
1: rep movsb
/* ... */
/* Label 2: normal (non-faulting) return path */
2: ret
/* Fixup entry: if [1] faults, jump to [3] */
_ASM_EXTABLE_UA(1b, 3f)
/* Recovery path: after a faulting rep movsb, RCX holds the remaining count */
3: mov %ecx, %eax /* Return number of bytes not copied */
ret
/* In C: */
/*
 * Copy n bytes from user space to kernel space.  Returns the number of
 * bytes that could NOT be copied (0 on complete success).
 * NOTE(review): simplified excerpt — upstream also performs access_ok()
 * and might_fault() checks before the raw copy; confirm against
 * lib/usercopy.c before relying on this form.
 */
unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
{
/*
 * If the copy faults (bad user pointer), the CPU exception
 * handler checks the fixup table and jumps to the error path
 * instead of oopsing.
 */
return raw_copy_from_user(to, from, n);
}
NMI handling
Non-Maskable Interrupts cannot be masked with cli. They require careful handling because they can interrupt any code including interrupt handlers:
/* arch/x86/kernel/nmi.c */
/*
 * NMI entry.  NMIs cannot be masked, so a second NMI can arrive while
 * this handler runs; the per-CPU nmi_state latch below turns a nested
 * NMI into a restart of the outer invocation instead of true reentry.
 */
DEFINE_IDTENTRY_NMI(exc_nmi)
{
irqentry_state_t irq_state;
/*
 * NMI uses its own IST stack. Since NMI can nest (NMI within NMI
 * handling), the kernel uses a careful "NMI nesting" protocol:
 */
/* Check if we interrupted an NMI handler itself */
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
/* Nested NMI: record CR2 and latch — the outer handler re-runs */
this_cpu_write(nmi_cr2, read_cr2());
this_cpu_write(nmi_state, NMI_LATCHED);
return;
}
this_cpu_write(nmi_state, NMI_EXECUTING);
/* Save CR2: a #PF taken inside the NMI handler would clobber it */
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
irq_state = irqentry_nmi_enter(regs);
do_nmi(regs, 0);
irqentry_nmi_exit(regs, irq_state);
/* Check if NMI was latched while handling */
if (this_cpu_cmpxchg(nmi_state, NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)
goto nmi_restart; /* state changed to NMI_LATCHED by nested NMI */
}
NMIs are used for:
- Hardware errors (MCE, watchdog)
- Perf PMU overflow interrupts (PEBS/LBR)
- kdump NMI shootdown (crash on one CPU, NMI all others)
- Kernel watchdog (CONFIG_HARDLOCKUP_DETECTOR)
SYSCALL/SYSRET (fast system call)
Modern x86-64 uses the SYSCALL instruction instead of int 0x80:
SYSCALL (user → kernel):
1. Load RIP from MSR_LSTAR (syscall entry point)
2. Save user RIP in RCX, RFLAGS in R11
3. Load CS/SS from MSR_STAR segments
4. Clear RFLAGS bits in MSR_SFMASK
* Does NOT switch stack — kernel must do this in entry asm
SYSRET (kernel → user):
1. Restore RIP from RCX, RFLAGS from R11
2. Restore CS/SS
* Does NOT save/restore any other registers
/* arch/x86/entry/entry_64.S */
SYM_CODE_START(entry_SYSCALL_64)
/*
 * SYSCALL does not switch stacks: we arrive here still on the user
 * stack, with user RIP in RCX and user RFLAGS in R11 (saved by the
 * SYSCALL instruction itself).
 */
/* Switch from user stack to kernel stack: */
swapgs /* swap GS with MSR_KERNEL_GS_BASE */
/* Stash user RSP in the TSS sp2 scratch slot — no free register yet */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Build an iret-style frame by hand (SYSCALL pushed nothing): */
/* Save user state: */
pushq $__USER_DS /* ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* rsp */
pushq %r11 /* rflags */
pushq $__USER_CS /* cs */
pushq %rcx /* rip (saved by SYSCALL) */
/* ... save remaining regs, call do_syscall_64() ... */
Observability
# Count exceptions per type (perf)
perf stat -e exceptions:page_fault_user,exceptions:page_fault_kernel \
-p <pid> sleep 5
# Watch page fault rate (vmstat)
vmstat 1
# pgfault: minor faults (page in from cache or anon)
# pgmajfault: major faults (disk read)
# Trace all page faults (heavy!)
bpftrace -e '
tracepoint:exceptions:page_fault_user {
@[ustack] = count();
}'
# Decode IDT
crash> idt
# Show exception handler address for vector 14 (#PF)
python3 -c "
import struct, sys
# /sys/kernel/debug/x86/idt_table is a binary dump of the IDT
with open('/sys/kernel/debug/x86/idt_table','rb') as f:
data = f.read()
v = 14 # page fault
entry = data[v*16:(v+1)*16]
lo16, seg, bits16, hi16, hi32, _ = struct.unpack('<HHHHII', entry)
addr = lo16 | (hi16 << 16) | (hi32 << 32)
print(f'#PF handler: {addr:#x}')
"
# /proc/interrupts shows interrupt counts per CPU
cat /proc/interrupts | head -5
Further reading
- ARM64 Exception Model — equivalent AArch64 mechanisms
- Page Fault Handler — handle_mm_fault and VMA resolution
- IRQ Handling — external interrupt delivery
- Syscall Entry — SYSCALL/SYSRET path in detail
- Kernel Hardening — SMEP/SMAP, KPTI
arch/x86/kernel/traps.c— exception handler implementationsarch/x86/mm/fault.c— page fault handlerarch/x86/entry/entry_64.S— exception/syscall entry assembly- Intel SDM Vol 3A, Chapter 6 — Interrupt and Exception Handling