perf Events
Hardware performance counters, sampling, and profiling
What perf events are
The perf events subsystem was created by Ingo Molnár and Peter Zijlstra and introduced in Linux 2.6.31 (LWN). perf_event_open is the Linux syscall for accessing performance monitoring hardware:
- Hardware counters (PMU): CPU cycles, instructions, cache misses, branch mispredictions
- Software counters: page faults, context switches, CPU migrations
- Tracepoints: static kernel events
- Dynamic events: kprobes, uprobes, BPF
- Sampling: record PC/stack every N events for statistical profiling
perf_event_open syscall
#include <linux/perf_event.h>
struct perf_event_attr attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES, /* count CPU cycles */
.size = sizeof(attr),
.disabled = 1, /* start disabled */
.exclude_kernel = 0, /* count kernel events too */
.exclude_hv = 1, /* exclude hypervisor */
};
/* Monitor the current process on any CPU */
int fd = syscall(SYS_perf_event_open, &attr,
0, /* pid: 0=current process */
-1, /* cpu: -1=any */
-1, /* group_fd: -1=no group */
0); /* flags */
/* Start counting */
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
/* ... code to measure ... */
/* Read counter */
long long count;
read(fd, &count, sizeof(count));
printf("CPU cycles: %lld\n", count);
/* Stop */
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
close(fd);
Event types
/* Hardware events (CPU PMU) */
.type = PERF_TYPE_HARDWARE
.config:
PERF_COUNT_HW_CPU_CYCLES /* CPU cycles */
PERF_COUNT_HW_INSTRUCTIONS /* retired instructions */
PERF_COUNT_HW_CACHE_REFERENCES /* cache accesses */
PERF_COUNT_HW_CACHE_MISSES /* cache misses */
PERF_COUNT_HW_BRANCH_INSTRUCTIONS /* branch instructions */
PERF_COUNT_HW_BRANCH_MISSES /* branch mispredictions */
PERF_COUNT_HW_BUS_CYCLES
PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
PERF_COUNT_HW_STALLED_CYCLES_BACKEND
/* Software events (kernel counters) */
.type = PERF_TYPE_SOFTWARE
.config:
PERF_COUNT_SW_CPU_CLOCK /* CPU clock (ns) */
PERF_COUNT_SW_TASK_CLOCK /* task-local clock */
PERF_COUNT_SW_PAGE_FAULTS /* page faults */
PERF_COUNT_SW_CONTEXT_SWITCHES /* voluntary + involuntary */
PERF_COUNT_SW_CPU_MIGRATIONS /* task moved between CPUs */
PERF_COUNT_SW_PAGE_FAULTS_MIN /* minor faults (no disk I/O) */
PERF_COUNT_SW_PAGE_FAULTS_MAJ /* major faults (disk I/O) */
PERF_COUNT_SW_ALIGNMENT_FAULTS
PERF_COUNT_SW_EMULATION_FAULTS
PERF_COUNT_SW_DUMMY
/* Hardware cache events */
.type = PERF_TYPE_HW_CACHE
/* .config encodes: cache_id | (cache_op_id << 8) | (result_id << 16) */
/* e.g., L1D read misses: */
.config = PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
/* Tracepoint events */
.type = PERF_TYPE_TRACEPOINT
.config = <tracepoint_id> /* from /sys/kernel/tracing/events/.../id */
/* Raw CPU PMU events */
.type = PERF_TYPE_RAW
.config = <cpu-specific PMU event code> /* e.g., Intel perfmon event */
Sampling mode
Sampling records the instruction pointer (and optionally call stack) every N events:
/* Sample on CPU cycles; record IP, thread ID, timestamp, and call chain. */
struct perf_event_attr attr = {
    .type = PERF_TYPE_HARDWARE,
    .config = PERF_COUNT_HW_CPU_CYCLES,
    .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                   PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN,
    /* sample_period and sample_freq share a union -- set exactly ONE of them.
     * Initializing both would silently discard the first value. */
    /* .sample_period = 1000000, */ /* period mode: one sample every 1M cycles */
    .sample_freq = 99,              /* freq mode: ~99 Hz (kernel auto-adjusts the period) */
    .freq = 1,                      /* selects freq mode, i.e. sample_freq is valid */
    .wakeup_events = 1,             /* wake userspace (poll) every N samples */
    /* Callchain filtering -- these drop kernel/user frames entirely;
     * they do not limit callchain depth: */
    .exclude_callchain_kernel = 0,  /* keep kernel frames */
    .exclude_callchain_user = 0,    /* keep user frames */
};
Samples are written to an mmap'd ring buffer (the perf mmap buffer):
/* Map the ring buffer */
void *mmap_buf = mmap(NULL, (1 + pages) * PAGE_SIZE,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
struct perf_event_mmap_page *header = mmap_buf;
/* header->data_head: write position (updated by kernel) */
/* header->data_tail: read position (updated by userspace) */
void *data = mmap_buf + PAGE_SIZE; /* ring starts after metadata page */
/* Read loop: */
while (header->data_tail != header->data_head) {
struct perf_event_header *event = data + (header->data_tail % data_size);
/* process event */
header->data_tail += event->size;
}
perf tool: the userspace frontend
Counting
# Count events for a command
perf stat ls
# Performance counter stats for 'ls':
# 0.98 msec task-clock # 0.783 CPUs utilized
# 1 context-switches # 1.022 K/sec
# 0 cpu-migrations # 0.000 /sec
# 163 page-faults # 166.389 K/sec
# 2,345,678 cycles # 2.394 GHz
# 1,234,567 instructions # 0.53 insn per cycle
# 456,789 branches # 466.520 M/sec
# 12,345 branch-misses # 2.70% of all branches
# Specific events
perf stat -e cycles,instructions,cache-misses,cache-references -- ./myapp
# System-wide
perf stat -a sleep 5
# CPI (cycles per instruction) analysis
perf stat -e cycles,instructions -r 3 ./myapp # repeat 3 times
CPU profiling with perf record
# Record with call graphs (frame pointer)
perf record -g -F 99 -- ./myapp
# Or: sample all CPUs
perf record -g -F 99 -a sleep 30
# Report
perf report
# Overhead Command Symbol
# 25.43% myapp [k] __schedule
# 18.21% myapp [.] malloc
# 12.33% myapp [.] my_function
# Show call graph
perf report --call-graph=graph
# Annotate with source
perf report --stdio
Flame graphs
# Record
perf record -g -F 99 -p <pid> sleep 30
# Convert to flame graph (via Brendan Gregg's scripts)
perf script | stackcollapse-perf.pl | flamegraph.pl > flamegraph.svg
# Or use hotspot (GUI)
hotspot perf.data
Kernel internals: struct perf_event
/* include/linux/perf_event.h */
/* Kernel-side state for one perf event (abridged excerpt -- many fields elided). */
struct perf_event {
struct list_head event_entry; /* list linkage -- presumably into the context's event list; verify against kernel source */
struct list_head sibling_list; /* group siblings */
struct list_head active_list;
struct perf_event_attr attr; /* user-space config */
struct hw_perf_event hw; /* PMU hardware state */
struct perf_event_context *ctx; /* context (task or CPU) */
atomic_long_t refcount; /* lifetime reference count */
/* Callback when an overflow/sample occurs: */
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context; /* opaque cookie passed to the handler */
/* Ring buffer for samples: */
struct perf_buffer __rcu *rb;
/* Tracepoint: */
struct trace_event_call *tp_event;
/* BPF: */
struct bpf_prog *prog; /* program attached to this event, if any */
u64 id; /* unique event ID */
/* ... */
};
/* Per-event PMU hardware state (abridged excerpt -- many fields elided). */
struct hw_perf_event {
/* PMU hardware registers: */
union {
struct { /* hardware */
u64 config; /* programmed into PMU */
u64 last_tag;
unsigned long config_base; /* base MSR/register for the event selector */
unsigned long event_base; /* base MSR/register for the counter value */
int event_base_rdpmc;
int idx; /* counter index */
int last_cpu; /* CPU this event last ran on */
int flags;
};
/* ... */
};
u64 prev_count; /* last read value */
u64 sample_period; /* current sampling period (events between samples) */
u64 last_period;
local64_t period_left; /* NOTE(review): appears to count down to the next overflow -- confirm against kernel source */
/* ... */
};
PMU: Hardware Performance Monitoring Unit
Each CPU architecture has a PMU with a limited number of programmable counters. On Intel:
Intel PMU (Skylake):
- 4 general-purpose counters (PERFCTR0..3) ← programmable with any event
- 3 fixed-function counters:
FIXED_CTR0 = instructions retired
FIXED_CTR1 = CPU clock cycles
FIXED_CTR2 = reference cycles
- Each counter generates an NMI when it overflows
Relevant MSRs:
IA32_PERFEVTSELx: event selector (event, umask, usr, os, edge, etc.)
IA32_PMCx: counter value
IA32_FIXED_CTR_CTRL: enable fixed counters
IA32_PERF_GLOBAL_CTRL: enable/disable all counters
When more hardware events are requested than the PMU has counters, the perf subsystem multiplexes them: events are rotated on and off the hardware counters over time (at context switches and on a timer tick), and reported counts are scaled by the ratio of time enabled to time actually running. Software events are counted entirely in the kernel and never occupy PMU counters.
Observing perf internals
# List available events
perf list
# Check if PMU is working
perf stat -e cycles echo hello
# Hardware event codes for raw events
# Intel: https://perfmon-events.intel.com/
perf stat -e r4c14 # raw Intel event code (hex)
# Check NMI handler
cat /proc/interrupts | grep NMI
# perf_event_paranoid: controls access
cat /proc/sys/kernel/perf_event_paranoid
# What unprivileged users may do at each level: -1: everything; 0: CPU events but no raw tracepoints; 1: kernel+user measurement (pre-4.6 default); 2: user-space measurement only (default since Linux 4.6)
echo 0 > /proc/sys/kernel/perf_event_paranoid
Further reading
- ftrace — Low-level function tracing
- Kprobes and Tracepoints — Dynamic and static events used with perf
- BPF: Architecture — BPF_PROG_TYPE_PERF_EVENT programs
- tools/perf/ in the kernel tree — perf tool source
- Documentation/admin-guide/perf-security.rst — perf security and access control