Lock Contention Debugging
Finding and fixing lock hotspots in the kernel
Identifying the problem
Lock contention manifests as CPUs burning time waiting rather than doing useful work. Symptoms:
- High CPU utilization but low throughput
- perf top shows _raw_spin_lock or mutex_lock near the top
- sar -u shows high %sys with low actual work rate
- perf stat shows many lock:contention_begin events
/proc/lock_stat
Requires CONFIG_LOCK_STAT=y. Provides per-lock-class contention statistics:
# Enable lock stats collection
echo 1 > /proc/sys/kernel/lock_stat
# View stats (reading does not reset the counters)
cat /proc/lock_stat
# Reset counters
echo 0 > /proc/lock_stat
Output format:
class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
&rq->lock:
234567 234567 0.20 4567.89 12345678.90 345678 23456789 0.10 234.56 23456789.12
Key columns:
- contentions: number of times a caller had to wait because the lock was already held
- waittime-max: worst-case wait time in microseconds (your tail latency)
- holdtime-total: total time the lock was held; high values point to long critical sections
- con-bounces: contentions where the previous holder ran on a different CPU (cross-CPU cache-line bouncing)
# Find the most contended locks
sort -k3 -rn /proc/lock_stat | head -20   # contentions is field 3
# Focus on high wait times
awk '$5 > 1000' /proc/lock_stat # waittime-max > 1ms
perf lock
perf lock traces lock events with timestamps:
# Record lock events (requires perf with lock support)
perf lock record -a sleep 10
# Analyze: show most contended locks
perf lock report
# Sort by number of acquisitions instead of contention
perf lock report --key acquired
Output:
Name acquired contended avg wait (ns) total wait (ns)
&rq->__lock: 1234567 45678 12345 12345678901
&mm->mmap_lock(W): 23456 1234 234567 1234567890
perf top for lock hotspots
# Find which functions spend most time waiting on locks
# (the lock:contention_* tracepoints appeared in kernel 5.19)
perf top -e lock:contention_begin,lock:contention_end
# Profile lock acquisition overhead
perf record -e lock:contention_begin -ag sleep 10
perf report --stdio
bpftrace for lock analysis
# Track lock wait time per lock and call site
bpftrace -e '
kprobe:mutex_lock {
@ts[tid] = nsecs;
@caller[tid] = kstack;
}
kretprobe:mutex_lock /@ts[tid]/ {
$wait = nsecs - @ts[tid];
if ($wait > 1000000) { /* > 1ms */
printf("Long mutex wait: %lu ns\n caller:\n%s\n",
$wait, @caller[tid]);
}
delete(@ts[tid]);
delete(@caller[tid]);
}'
# Histogram of spinlock acquire-to-release times
# (measured from lock entry, so any spin-wait time is included)
bpftrace -e '
kprobe:_raw_spin_lock {
@start[tid] = nsecs;
}
kprobe:_raw_spin_unlock /@start[tid]/ {
@hold_ns = hist(nsecs - @start[tid]);
delete(@start[tid]);
}'
# Find which call sites enter the spinlock slowpath (i.e. actually spin)
# (the symbol is queued_spin_lock_slowpath on kernels without paravirt)
bpftrace -e '
kprobe:__pv_queued_spin_lock_slowpath {
@[kstack] = count();
}
interval:s:5 {
print(@); clear(@);
}'
Lockdep contention tracking
# With CONFIG_LOCK_STAT and CONFIG_LOCKDEP, check /proc/lockdep_stats
cat /proc/lockdep_stats
# lock-classes: 2341
# direct dependencies: 8234
# chain cache misses: 123
# hardirq-safe locks: 456
# softirq-safe locks: 789
ftrace: tracing specific lock points
# Trace mutex_lock calls and their durations with the function graph tracer
# (tracefs may be mounted at /sys/kernel/tracing on newer systems)
echo mutex_lock > /sys/kernel/debug/tracing/set_ftrace_filter
echo function_graph > /sys/kernel/debug/tracing/current_tracer
echo 1 > /sys/kernel/debug/tracing/tracing_on
# ... reproduce the issue ...
echo 0 > /sys/kernel/debug/tracing/tracing_on
cat /sys/kernel/debug/tracing/trace | head -100
Common lock contention patterns and fixes
1. Fine-grained locking
Replace one coarse lock with many per-bucket or per-object locks:
/* Before: one global lock */
static DEFINE_SPINLOCK(hash_lock);
static struct hlist_head hash_table[HASH_SIZE];
/* After: per-bucket locks */
struct hash_bucket {
spinlock_t lock;
struct hlist_head head;
} ____cacheline_aligned;
static struct hash_bucket hash_table[HASH_SIZE];
The ____cacheline_aligned attribute prevents false sharing between adjacent buckets.
2. Read-write splitting
If most access is read-only, use rwsem or RCU:
/* Before: spinlock serializes readers and writers */
spin_lock(&config_lock);
val = config->value;
spin_unlock(&config_lock);
/* After: RCU allows concurrent reads */
rcu_read_lock();
cfg = rcu_dereference(config);
val = cfg->value;
rcu_read_unlock();
3. Per-CPU sharding
For counters and stats, use per-CPU variables and sum across CPUs only on the (rare) read path:
/* Before: atomic increment on shared counter */
atomic_inc(&event_count);
/* After: per-CPU counter, no synchronization needed */
this_cpu_inc(event_count);
4. Lock elision / trylock patterns
In some cases, falling back to a slower path avoids contention:
/* Try to acquire, fall back to queuing work if busy */
if (!spin_trylock(&cache_lock)) {
schedule_work(&cache_work); /* retry later */
return;
}
/* fast path */
spin_unlock(&cache_lock);
5. Batching
Amortize lock overhead by batching multiple updates into one critical section:
/* Before: acquire and release once per item */
for (i = 0; i < n; i++) {
        spin_lock(&list_lock);
        list_add(&items[i].node, &list);
        spin_unlock(&list_lock);
}
/* After: one critical section for the whole batch */
spin_lock(&list_lock);
for (i = 0; i < n; i++)
        list_add(&items[i].node, &list);
spin_unlock(&list_lock);
Further reading
- Lockdep — Detecting circular dependencies
- Spinlock — When spinlocks make sense
- RCU — For read-heavy workloads
- Per-CPU variables — Eliminating shared counters