Workqueue Internals
cmwq architecture, worker pools, work item lifecycle, and NUMA affinity
Why workqueues exist
Interrupt handlers (hardirq and softirq) run atomically — they cannot sleep or block. Work that needs to sleep, acquire mutexes, or do I/O must be deferred. Workqueues provide a kernel thread pool that can run this deferred work in process context.
Interrupt fires:
hardirq handler → schedule_work(&my_work) ← non-sleeping, fast
│
▼
[kworker thread] ← sleeps OK
my_work_func()
→ can mutex_lock(), kmalloc(GFP_KERNEL), etc.
Concurrency-Managed Workqueues (cmwq)
Since Linux 2.6.36, workqueues use cmwq — introduced by Tejun Heo (LWN) — a unified worker pool that automatically manages concurrency:
Before cmwq (per-CPU dedicated threads):
cpu0: kblockd/0, kworker/0:0, kworker/0:1, ...
cpu1: kblockd/1, kworker/1:0, kworker/1:1, ...
Problem: each workqueue had its own per-CPU thread → thread explosion
After cmwq (shared worker pool):
cpu0: [shared pool] kworker/0:0, kworker/0:1, kworker/0:2H (high-prio)
cpu1: [shared pool] kworker/1:0, kworker/1:1, kworker/1:2H
All workqueues share the pool → far fewer threads
Worker pool types
/* kernel/workqueue.c */
/* Per-CPU worker pools (2 per CPU: normal and highpri): */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
cpu_worker_pools);
/*
* cpu_worker_pools[0]: normal priority (SCHED_NORMAL, nice=0)
* cpu_worker_pools[1]: high priority (SCHED_NORMAL, nice=-20)
*/
/* Unbound worker pools: not tied to any CPU, for WQ_UNBOUND workqueues */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
Creating workqueues
#include <linux/workqueue.h>
/* System workqueues (prefer these): */
system_wq /* general-purpose, can sleep */
system_highpri_wq /* high-priority, time-critical work */
system_long_wq /* for work that may run for a long time */
system_unbound_wq /* not bound to a specific CPU */
system_freezable_wq /* suspended during system freeze */
system_power_efficient_wq /* prefers low-power CPUs */
/* Use system_wq for most cases: */
schedule_work(&my_work); /* submits to system_wq */
/* Custom workqueue: */
struct workqueue_struct *wq;
/* Bound to a CPU (work runs on the CPU it was submitted from): */
wq = alloc_workqueue("my-wq", WQ_MEM_RECLAIM, 1);
/* Unbound (can run on any CPU, better for NUMA): */
wq = alloc_workqueue("my-unbound-wq", WQ_UNBOUND, 0);
/* High priority + unbound: */
wq = alloc_workqueue("my-hipri-wq", WQ_UNBOUND | WQ_HIGHPRI, 0);
/* Flags: */
WQ_UNBOUND /* not bound to a specific CPU */
WQ_FREEZABLE /* freeze during system suspend */
WQ_MEM_RECLAIM /* may be needed during memory reclaim (reserves a rescue thread) */
WQ_HIGHPRI /* high-priority worker pool */
WQ_CPU_INTENSIVE /* CPU-intensive work: excluded from pool concurrency count,
allowing other pending work to start alongside it */
WQ_SYSFS /* expose workqueue attributes in /sys/bus/workqueue */
/* Last arg: max_active = max concurrent work items per CPU (0 selects the default, 256) */
Work items
/* Declare and initialize work: */
DECLARE_WORK(my_work, my_work_func);
/* or: */
struct work_struct my_work;
INIT_WORK(&my_work, my_work_func);
/* Work function: */
/*
 * Work callback: executed by a kworker thread in process context, so it
 * may sleep — mutexes, GFP_KERNEL allocations, and I/O are all allowed.
 */
static void my_work_func(struct work_struct *work)
{
	struct my_device *mydev = container_of(work, struct my_device, work);

	mutex_lock(&mydev->lock);
	process_data(mydev);
	mutex_unlock(&mydev->lock);
}
/* Submit work: */
schedule_work(&my_work); /* to system_wq, current CPU */
queue_work(wq, &my_work); /* to custom wq */
queue_work_on(cpu, wq, &my_work); /* to specific CPU's pool */
/* Wait for work to complete: */
flush_work(&my_work); /* blocks until this work item finishes */
flush_workqueue(wq); /* blocks until all items in wq finish */
Delayed work
struct delayed_work my_dwork;
INIT_DELAYED_WORK(&my_dwork, my_dwork_func);
/* Schedule to run after 100ms: */
schedule_delayed_work(&my_dwork, msecs_to_jiffies(100));
queue_delayed_work(wq, &my_dwork, msecs_to_jiffies(100));
/* Cancel if still pending; returns without waiting for a callback that is already running: */
cancel_delayed_work(&my_dwork);
/* Cancel and wait for completion: */
cancel_delayed_work_sync(&my_dwork);
struct work_struct internals
/*
 * A deferred-work item.  Typically embedded in a driver's own struct and
 * recovered in the work function with container_of().
 */
struct work_struct {
atomic_long_t data; /* encodes: pool pointer + flags */
struct list_head entry; /* in pool->worklist */
work_func_t func; /* callback invoked by a kworker in process context */
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map; /* lets lockdep catch flush-vs-lock deadlocks */
#endif
};
/* data field encodes: */
/* bit [0]: WORK_STRUCT_PENDING — work is queued */
/* bit [1]: WORK_STRUCT_INACTIVE — work is on an inactive (delayed) list */
/* bit [2]: WORK_STRUCT_PWQ — when set, the pointer bits hold a pool_workqueue */
/* bits [WORK_STRUCT_FLAG_BITS..]: pool_workqueue pointer, or the ID of the last pool this work ran in */
Worker pool and concurrency management
/*
 * One pool of kworker threads.  Per-CPU pools come in normal/highpri pairs;
 * unbound pools are shared via unbound_pool_hash (see above).
 */
struct worker_pool {
spinlock_t lock; /* protects the pool's lists and counters */
int cpu; /* CPU or -1 for unbound */
int node; /* NUMA node */
int id; /* pool ID (appears in unbound kworker thread names) */
unsigned int flags;
unsigned long watchdog_ts; /* last forward progress, for the WQ watchdog */
bool cpu_stall;
struct list_head worklist; /* pending work items */
int nr_workers; /* total workers in the pool */
int nr_idle; /* how many of them are idle */
struct list_head idle_list; /* idle workers */
struct timer_list idle_timer; /* reaps workers idle for too long */
struct work_struct idle_cull_work;
struct timer_list mayday_timer; /* safety valve */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* busy workers keyed by work item */
struct worker *manager; /* thread managing the pool */
struct list_head workers; /* all workers in pool */
struct completion *detach_completion;
struct ida worker_ida; /* allocates per-pool worker IDs */
struct workqueue_attrs *attrs; /* nice level, cpumask, NUMA behavior */
struct hlist_node hash_node; /* entry in unbound_pool_hash */
int refcnt;
struct rcu_head rcu; /* pools are freed after an RCU grace period */
};
Concurrency management algorithm
cmwq maintains exactly enough workers to keep the CPU busy:
Rule: pool tries to keep at least one runnable worker
(one that is executing work AND not sleeping)
When a worker goes to sleep (blocks on mutex, I/O, etc.):
→ pool checks: is there another runnable worker?
→ if not: wake an idle worker (or create a new one)
→ prevents CPU stall while work is blocked
When a worker wakes up:
→ pool checks: too many runnable workers?
→ if yes: go back to sleep (idle)
Result: CPU always busy, minimal thread creation
/* kernel/workqueue.c */
/* Mark a worker idle and park it on its pool's idle list. */
static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
/* ... */
pool->nr_idle++;
worker->last_active = jiffies; /* timestamp used by the idle-reaper timer */
worker->flags |= WORKER_IDLE;
list_add(&worker->entry, &pool->idle_list);
/* Arm the reaper only when the pool has surplus workers; excess idle
 * workers are destroyed after IDLE_WORKER_TIMEOUT (5 minutes) of idleness */
if (too_many_workers(pool))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
}
/*
 * Should the current worker keep processing work items?  True while work
 * is pending and at most one worker (ourselves) is runnable — this is the
 * heart of cmwq's "keep exactly one runnable worker per pool" rule.
 */
static bool keep_working(struct worker_pool *pool)
{
return !list_empty(&pool->worklist) &&
atomic_read(&pool->nr_running) <= 1;
}
Workqueue attributes and NUMA affinity
/* Get/set workqueue attributes (WQ_UNBOUND only): */
struct workqueue_attrs *attrs = alloc_workqueue_attrs();
attrs->nice = -5; /* worker thread niceness */
attrs->cpumask = cpu_mask; /* which CPUs workers can run on */
attrs->no_numa = false; /* respect NUMA locality */
apply_workqueue_attrs(wq, attrs);
free_workqueue_attrs(attrs);
/* NUMA-aware unbound pools:
By default, unbound workqueues have per-NUMA-node pools.
Work submitted from node 0 runs on node 0 workers.
This ensures cache-hot data stays on the same node. */
Workqueue debugging
# Show all workqueues and their stats:
cat /sys/kernel/debug/workqueue/stats
# Show worker threads:
ps aux | grep kworker
# kworker/0:1 — bound to CPU 0, worker ID 1, normal-priority pool
# kworker/0:1H — bound to CPU 0, worker ID 1, high-priority pool (H suffix)
# kworker/u8:2 — unbound pool with pool ID 8, worker ID 2
# Show per-workqueue stats (if WQ_SYSFS):
ls /sys/bus/workqueue/devices/
cat /sys/bus/workqueue/devices/system_wq/per_cpu
# Dump all task stacks (including kworker threads) via SysRq:
echo t > /proc/sysrq-trigger
# Shows all threads in dmesg; use /sys/kernel/debug/workqueue for
# workqueue-specific state (pending counts, queue depths)
# Trace work submission and execution:
bpftrace -e '
kprobe:queue_work_on {
/* queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) */
printf("queue_work: cpu=%d func=%s\n", (int)arg0,
ksym(((struct work_struct *)arg2)->func));
}
kprobe:process_one_work {
printf("exec_work: cpu=%d pid=%d func=%s\n",
cpu, pid, ksym(((struct work_struct *)arg1)->func));
}'
# Watchdog: detect hung workers (CONFIG_WQ_WATCHDOG=y)
# Kernel will warn if a worker pool stalls for > 30 seconds:
# "BUG: workqueue lockup"
# Tune:
echo 60 > /sys/module/workqueue/parameters/watchdog_thresh
WQ_MEM_RECLAIM and rescue workers
/*
* Work used during memory reclaim (e.g., writeback) MUST use WQ_MEM_RECLAIM.
* This ensures a dedicated rescue worker exists that won't be blocked
* waiting for memory (avoiding deadlock):
*
* Memory reclaim path:
* 1. kswapd needs to write dirty pages → queues work
* 2. Normal workers might be sleeping waiting for memory
* 3. Rescue worker picks up the work — always has one thread reserved
*/
wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
Further reading
- Workqueues — API reference and basic usage
- Softirq and Tasklets — lighter-weight deferral (no sleep)
- Threaded IRQs — IRQ handlers in thread context
- RCU — lock-free deferred work
kernel/workqueue.c — complete implementation (~6000 lines)
Documentation/core-api/workqueue.rst — kernel documentation