Skip to content

Workqueue Internals

cmwq architecture, worker pools, work item lifecycle, and NUMA affinity

Why workqueues exist

Interrupt handlers (hardirq and softirq) run atomically — they cannot sleep or block. Work that needs to sleep, acquire mutexes, or do I/O must be deferred. Workqueues provide a kernel thread pool that can run this deferred work in process context.

Interrupt fires:
  hardirq handler → schedule_work(&my_work)   ← non-sleeping, fast
                    [kworker thread]            ← sleeps OK
                    my_work_func()
                    → can mutex_lock(), kmalloc(GFP_KERNEL), etc.

Concurrency-Managed Workqueues (cmwq)

Since Linux 2.6.36, workqueues use cmwq — introduced by Tejun Heo (LWN) — a unified worker pool that automatically manages concurrency:

Before cmwq (per-CPU dedicated threads):
  cpu0: kblockd/0, kworker/0:0, kworker/0:1, ...
  cpu1: kblockd/1, kworker/1:0, kworker/1:1, ...
  Problem: each workqueue had its own per-CPU thread → thread explosion

After cmwq (shared worker pool):
  cpu0: [shared pool] kworker/0:0, kworker/0:1, kworker/0:2H (high-prio)
  cpu1: [shared pool] kworker/1:0, kworker/1:1, kworker/1:2H
  All workqueues share the pool → far fewer threads

Worker pool types

/* kernel/workqueue.c */

/* Per-CPU worker pools (2 per CPU: normal and highpri): */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     cpu_worker_pools);
/*
 * cpu_worker_pools[0]: normal priority (SCHED_NORMAL, nice=0)
 * cpu_worker_pools[1]: high priority  (SCHED_NORMAL, nice=-20)
 */

/* Unbound worker pools: not tied to any CPU, for WQ_UNBOUND workqueues */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

Creating workqueues

#include <linux/workqueue.h>

/* System workqueues (prefer these): */
system_wq             /* general-purpose, can sleep */
system_highpri_wq     /* high-priority, time-critical work */
system_long_wq        /* for work that may run for a long time */
system_unbound_wq     /* not bound to a specific CPU */
system_freezable_wq   /* suspended during system freeze */
system_power_efficient_wq  /* prefers low-power CPUs */

/* Use system_wq for most cases: */
schedule_work(&my_work);  /* submits to system_wq */

/* Custom workqueue: */
struct workqueue_struct *wq;

/* Bound to a CPU (work runs on the CPU it was submitted from): */
wq = alloc_workqueue("my-wq", WQ_MEM_RECLAIM, 1);

/* Unbound (can run on any CPU, better for NUMA): */
wq = alloc_workqueue("my-unbound-wq", WQ_UNBOUND, 0);

/* High priority + unbound: */
wq = alloc_workqueue("my-hipri-wq", WQ_UNBOUND | WQ_HIGHPRI, 0);

/* Flags: */
WQ_UNBOUND          /* not bound to a specific CPU */
WQ_FREEZABLE        /* freeze during system suspend */
WQ_MEM_RECLAIM      /* may be needed during memory reclaim (reserves a rescue thread) */
WQ_HIGHPRI          /* high-priority worker pool */
WQ_CPU_INTENSIVE    /* CPU-intensive work: excluded from pool concurrency count,
                       allowing other pending work to start alongside it */
WQ_SYSFS            /* expose workqueue attributes in /sys/bus/workqueue */

/* Last arg: max_active = max concurrent work items per CPU (0 selects the default, 256) */

Work items

/* Declare and initialize work: */
DECLARE_WORK(my_work, my_work_func);
/* or: */
struct work_struct my_work;
INIT_WORK(&my_work, my_work_func);

/* Work function: */
static void my_work_func(struct work_struct *work)
{
    struct my_device *dev = container_of(work, struct my_device, work);
    /* Process dev->data — can sleep here */
    mutex_lock(&dev->lock);
    process_data(dev);
    mutex_unlock(&dev->lock);
}

/* Submit work: */
schedule_work(&my_work);          /* to system_wq, current CPU */
queue_work(wq, &my_work);         /* to custom wq */
queue_work_on(cpu, wq, &my_work); /* to specific CPU's pool */

/* Wait for work to complete: */
flush_work(&my_work);             /* blocks until this work item finishes */
flush_workqueue(wq);              /* blocks until all items in wq finish */

Delayed work

struct delayed_work my_dwork;
INIT_DELAYED_WORK(&my_dwork, my_dwork_func);

/* Schedule to run after 100ms: */
schedule_delayed_work(&my_dwork, msecs_to_jiffies(100));
queue_delayed_work(wq, &my_dwork, msecs_to_jiffies(100));

/* Cancel if still pending; returns false and does NOT wait if the
   callback has already started running: */
cancel_delayed_work(&my_dwork);

/* Cancel and wait for completion: */
cancel_delayed_work_sync(&my_dwork);

struct work_struct internals

struct work_struct {
    atomic_long_t   data;       /* encodes: flags + pwq/last-pool pointer */
    struct list_head entry;     /* in pool->worklist */
    work_func_t     func;
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};

/* data field encodes: */
/* bits [0]: WORK_STRUCT_PENDING — work is queued */
/* bits [1]: WORK_STRUCT_INACTIVE — work is in an inactive list */
/* bits [2]: WORK_STRUCT_PWQ — lower bits point to pool_workqueue */
/* bits [WORK_STRUCT_FLAG_BITS..]: pointer to pool_workqueue or last pool */

Worker pool and concurrency management

struct worker_pool {
    spinlock_t          lock;
    int                 cpu;        /* CPU or -1 for unbound */
    int                 node;       /* NUMA node */
    int                 id;
    unsigned int        flags;

    unsigned long       watchdog_ts;
    bool                cpu_stall;

    struct list_head    worklist;   /* pending work items */
    int                 nr_workers;
    int                 nr_idle;

    struct list_head    idle_list;  /* idle workers */
    struct timer_list   idle_timer;
    struct work_struct  idle_cull_work;

    struct timer_list   mayday_timer; /* safety valve */
    DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);

    struct worker       *manager;   /* thread managing the pool */
    struct list_head    workers;    /* all workers in pool */
    struct completion   *detach_completion;

    struct ida          worker_ida;
    struct workqueue_attrs *attrs;
    struct hlist_node   hash_node;
    int                 refcnt;
    struct rcu_head     rcu;
};

Concurrency management algorithm

cmwq maintains exactly enough workers to keep the CPU busy:

Rule: pool tries to keep at least one runnable worker
  (one that is executing work AND not sleeping)

When a worker goes to sleep (blocks on mutex, I/O, etc.):
  → pool checks: is there another runnable worker?
  → if not: wake an idle worker (or create a new one)
  → prevents CPU stall while work is blocked

When a worker wakes up:
  → pool checks: too many runnable workers?
  → if yes: go back to sleep (idle)

Result: CPU always busy, minimal thread creation
/* kernel/workqueue.c */
static void worker_enter_idle(struct worker *worker)
{
    struct worker_pool *pool = worker->pool;

    /* ... */
    pool->nr_idle++;
    worker->last_active = jiffies;
    worker->flags |= WORKER_IDLE;
    list_add(&worker->entry, &pool->idle_list);

    /* Destroy idle workers after 5 minutes of idleness */
    if (too_many_workers(pool))
        mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
}

static bool keep_working(struct worker_pool *pool)
{
    return !list_empty(&pool->worklist) &&
           atomic_read(&pool->nr_running) <= 1;
}

Workqueue attributes and NUMA affinity

/* Get/set workqueue attributes (WQ_UNBOUND only): */
struct workqueue_attrs *attrs = alloc_workqueue_attrs();
attrs->nice = -5;           /* worker thread niceness */
attrs->cpumask = cpu_mask;  /* which CPUs workers can run on */
attrs->no_numa = false;     /* respect NUMA locality */

apply_workqueue_attrs(wq, attrs);
free_workqueue_attrs(attrs);

/* NUMA-aware unbound pools:
   By default, unbound workqueues have per-NUMA-node pools.
   Work submitted from node 0 runs on node 0 workers.
   This ensures cache-hot data stays on the same node. */

Workqueue debugging

# Show all workqueues and their stats:
cat /sys/kernel/debug/workqueue/stats

# Show worker threads:
ps aux | grep kworker
# kworker/0:1   — bound to CPU 0, worker ID 1, normal-priority pool
# kworker/0:1H  — bound to CPU 0, worker ID 1, high-priority pool (H suffix)
# kworker/u8:2  — unbound pool, pool ID=8, worker ID=2

# Show per-workqueue stats (if WQ_SYSFS):
ls /sys/bus/workqueue/devices/
cat /sys/bus/workqueue/devices/system_wq/per_cpu

# Dump all task stacks (including kworker threads) via SysRq:
echo t > /proc/sysrq-trigger
# Shows all threads in dmesg; use /sys/kernel/debug/workqueue for
# workqueue-specific state (pending counts, queue depths)

# Trace work submission and execution:
bpftrace -e '
kprobe:queue_work_on {
    /* queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) */
    printf("queue_work: cpu=%d func=%s\n", (int)arg0,
           ksym(((struct work_struct *)arg2)->func));
}
kprobe:process_one_work {
    printf("exec_work: cpu=%d pid=%d func=%s\n",
           cpu, pid, ksym(((struct work_struct *)arg1)->func));
}'

# Watchdog: detect hung workers (CONFIG_WQ_WATCHDOG=y)
# Kernel will warn if a worker pool stalls for > 30 seconds:
# "BUG: workqueue lockup"
# Tune:
echo 60 > /sys/module/workqueue/parameters/watchdog_thresh

WQ_MEM_RECLAIM and rescue workers

/*
 * Work used during memory reclaim (e.g., writeback) MUST use WQ_MEM_RECLAIM.
 * This ensures a dedicated rescue worker exists that won't be blocked
 * waiting for memory (avoiding deadlock):
 *
 * Memory reclaim path:
 *   1. kswapd needs to write dirty pages → queues work
 *   2. Normal workers might be sleeping waiting for memory
 *   3. Rescue worker picks up the work — always has one thread reserved
 */
wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);

Further reading

  • Workqueues — API reference and basic usage
  • Softirq and Tasklets — lighter-weight deferral (no sleep)
  • Threaded IRQs — IRQ handlers in thread context
  • RCU — lock-free deferred work
  • kernel/workqueue.c — complete implementation (~6000 lines)
  • Documentation/core-api/workqueue.rst — kernel documentation