Completions and Wait Queues
Blocking synchronization: sleeping until an event occurs
struct completion
completion is a simple one-shot synchronization primitive: one thread waits, another signals when done. It's simpler and safer than using wait_queue_head_t directly for this common pattern.
#include <linux/completion.h>
/* Declare and initialize */
DECLARE_COMPLETION(my_completion);
/* Or dynamically: */
struct completion c;
init_completion(&c);
/* Waiter thread: block until signaled */
wait_for_completion(&c);
/* Execution continues here after signal */
/* Signaler thread: wake up the waiter */
complete(&c);
/* After complete(), one waiter is woken */
Completion variants
/* Timed wait: returns 0 on timeout, otherwise the number of jiffies remaining (>= 1) */
unsigned long remaining = wait_for_completion_timeout(&c, HZ);
if (!remaining)
pr_err("Timed out waiting!\n");
/* Interruptible wait: return -ERESTARTSYS if signal arrives */
int ret = wait_for_completion_interruptible(&c);
if (ret)
return ret; /* interrupted */
/* Interruptible with timeout */
long ret = wait_for_completion_interruptible_timeout(&c, timeout);
/* ret > 0: completed */
/* ret == 0: timed out */
/* ret < 0: interrupted */
/* Wake all waiters (not just one) */
complete_all(&c);
/* Reinitialize after all waiters have woken */
reinit_completion(&c);
/* Check without blocking */
if (completion_done(&c)) {
/* already signaled */
}
Completion with try_wait
/* Non-blocking check: acquire if already signaled */
if (try_wait_for_completion(&c)) {
/* Was already complete — no blocking */
} else {
/* Not yet complete — would have blocked */
}
wait_queue_head_t (typedef for struct wait_queue_head)
wait_queue_head_t is the lower-level primitive that completion builds on. It supports:
- Multiple waiters on the same event
- Custom wake conditions
- Exclusive vs non-exclusive wakeup
- poll()/select()/epoll integration
#include <linux/wait.h>
/* Declare */
DECLARE_WAIT_QUEUE_HEAD(my_wq);
/* Or: */
wait_queue_head_t wq;
init_waitqueue_head(&wq);
wait_event: the standard pattern
wait_event combines the wait with a condition check:
/* Waiter: sleep until condition is true */
wait_event(wq, condition);
/* condition is re-evaluated after every wakeup */
/* Interruptible version */
int ret = wait_event_interruptible(wq, condition);
if (ret)
return ret; /* -ERESTARTSYS: signal received */
/* With timeout */
long ret = wait_event_timeout(wq, condition, timeout_jiffies);
/* ret > 0: condition met */
/* ret == 0: timed out */
/* With interruptible + timeout */
long ret = wait_event_interruptible_timeout(wq, condition, timeout);
The wake_up / wait_event contract:
/* Signaler: modify the condition variable, then wake */
WRITE_ONCE(data_ready, true);
wake_up(&wq); /* wake all non-exclusive waiters + at most one exclusive */
wake_up_all(&wq); /* wake every waiter, exclusive included */
wake_up_interruptible(&wq); /* wake only TASK_INTERRUPTIBLE sleepers */
wait_event uses a loop:
/* wait_event expansion (simplified): */
for (;;) {
prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
if (condition)
break;
schedule(); /* sleep */
}
finish_wait(&wq, &__wait);
This correctly handles races: if the condition becomes true between the check and schedule(), the wake_up will have set the task state back to TASK_RUNNING.
Custom wait: prepare_to_wait / finish_wait
For more complex patterns:
DEFINE_WAIT(wait);
prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
while (!condition) {
if (signal_pending(current)) {
/* Signal received */
break;
}
/* Drop any locks here before sleeping */
spin_unlock_irq(&lock);
schedule();
spin_lock_irq(&lock);
/* Recheck condition here */
}
finish_wait(&wq, &wait);
Exclusive waiters
Exclusive waiters are woken one at a time (thundering herd prevention):
/* Add as exclusive waiter: wake_up() wakes at most one of these; wake_up_all() still wakes everyone */
prepare_to_wait_exclusive(&wq, &wait, TASK_INTERRUPTIBLE);
/* wake_up: wakes exactly one exclusive waiter + all non-exclusive */
wake_up(&wq);
Used for accept() on TCP listen sockets: only one of many waiting accept() calls needs to be woken when a connection arrives.
poll/select/epoll integration
Drivers implement the ->poll method of struct file_operations using wait queues:
/* The .poll handler in file_operations: */
/* ->poll handler: report which I/O operations would not block right now. */
static __poll_t mydev_poll(struct file *file, poll_table *wait)
{
	struct mydev *mdev = file->private_data;
	__poll_t revents = 0;

	/* Hook both wait queues into the poll table so select/epoll is
	 * notified when either direction makes progress. */
	poll_wait(file, &mdev->read_wq, wait);
	poll_wait(file, &mdev->write_wq, wait);

	/* Snapshot the current device state. */
	if (!kfifo_is_empty(&mdev->recv_fifo))
		revents |= EPOLLIN | EPOLLRDNORM;	/* readable now */
	if (!kfifo_is_full(&mdev->send_fifo))
		revents |= EPOLLOUT | EPOLLWRNORM;	/* writable now */

	return revents;
}
/* When data arrives: wake the wait queue */
void mydev_data_arrived(struct mydev *dev)
{
kfifo_put(&dev->recv_fifo, data);
wake_up_interruptible(&dev->read_wq);
}
poll_wait adds the wait queue to the poll table so that epoll monitors it. When wake_up fires, epoll wakes the calling process.
wait_queue_entry: custom callbacks
Advanced use: add a custom function to call on wakeup:
/* Custom wake callback: runs in place of default_wake_function() when
 * wake_up() walks the queue and reaches this entry. */
static int my_wakeup_func(struct wait_queue_entry *curr,
unsigned mode, int wake_flags, void *key)
{
/* Recover our context: only valid if this entry is the wait_entry
 * member embedded inside a struct mydata. */
struct mydata *data = container_of(curr, struct mydata, wait_entry);
/* Called in the waker's context (IRQ or process) -- must not sleep */
/* Return semantics: a negative return aborts the wake-up walk; a
 * nonzero return counts toward the exclusive-wakeup budget for
 * WQ_FLAG_EXCLUSIVE entries (it does not simply "stop") */
/* Defer the real processing to process context via the workqueue */
schedule_work(&data->work);
return 1; /* the task is not woken directly; the work item runs instead */
}
struct wait_queue_entry wqe = {
.private = &mydata,
.func = my_wakeup_func,
.flags = 0, /* or WQ_FLAG_EXCLUSIVE for exclusive */
};
add_wait_queue(&wq, &wqe);
/* ... */
remove_wait_queue(&wq, &wqe);
Common patterns
Producer/consumer with wait queue
/* Bounded byte ring shared by sleeping producers and consumers.
 * head/tail/data are protected by @lock. */
struct ring_buffer {
u8 data[1024]; /* storage; indices wrap modulo sizeof(data) */
unsigned int head, tail; /* presumably head = next write slot, tail = next read slot -- confirm against rb_is_empty()/rb_is_full() */
wait_queue_head_t not_empty; /* consumers wait here */
wait_queue_head_t not_full; /* producers wait here */
spinlock_t lock; /* serializes all index/data updates */
};
/* Consumer */
/* Consumer: block until data is available, then copy out at most @len bytes.
 *
 * Returns the number of bytes copied, or -ERESTARTSYS if interrupted by
 * a signal while waiting.
 *
 * Fixes vs. the previous version:
 *  - wait_event_interruptible_lock_irq() drops/retakes the lock with
 *    spin_unlock_irq()/spin_lock_irq(), so the caller must acquire it
 *    with spin_lock_irq() too; the old plain spin_lock() left the IRQ
 *    enable state unbalanced on return.
 *  - @len is clamped to the bytes actually buffered, and the copy is
 *    split at the wrap point, so memcpy() can no longer run past the
 *    end of data[] or hand back unwritten slots.
 */
int rb_read(struct ring_buffer *rb, u8 *buf, size_t len)
{
	size_t avail, chunk, copied = 0;
	int ret;

	spin_lock_irq(&rb->lock);
	ret = wait_event_interruptible_lock_irq(rb->not_empty,
						!rb_is_empty(rb),
						rb->lock);
	if (ret) {
		spin_unlock_irq(&rb->lock);
		return ret;	/* -ERESTARTSYS: signal received */
	}

	/* Bytes currently buffered; assumes head==tail means empty --
	 * confirm this matches rb_is_empty()'s definition. */
	avail = (rb->head + sizeof(rb->data) - rb->tail) % sizeof(rb->data);
	if (len > avail)
		len = avail;

	while (copied < len) {
		chunk = len - copied;
		if (chunk > sizeof(rb->data) - rb->tail)
			chunk = sizeof(rb->data) - rb->tail;	/* stop at wrap point */
		memcpy(buf + copied, &rb->data[rb->tail], chunk);
		rb->tail = (rb->tail + chunk) % sizeof(rb->data);
		copied += chunk;
	}
	spin_unlock_irq(&rb->lock);

	wake_up(&rb->not_full);		/* space freed: let producers run */
	return (int)copied;
}
/* Producer */
/* Producer: block until there is room, then copy in at most @len bytes.
 *
 * Returns the number of bytes written, or -ERESTARTSYS if interrupted
 * by a signal while waiting.
 *
 * Fixes vs. the previous version:
 *  - the return value of wait_event_interruptible_lock_irq() was
 *    ignored, so a signal-interrupted producer wrote anyway;
 *  - the lock must be taken with spin_lock_irq() to match the macro's
 *    internal spin_unlock_irq()/spin_lock_irq() pair;
 *  - @len is clamped to the free space and the copy is split at the
 *    wrap point so it cannot overwrite unread data or overrun data[].
 */
int rb_write(struct ring_buffer *rb, const u8 *buf, size_t len)
{
	size_t space, chunk, copied = 0;
	int ret;

	spin_lock_irq(&rb->lock);
	ret = wait_event_interruptible_lock_irq(rb->not_full,
						!rb_is_full(rb), rb->lock);
	if (ret) {
		spin_unlock_irq(&rb->lock);
		return ret;	/* -ERESTARTSYS: signal received */
	}

	/* Free bytes, keeping one slot unused so full and empty are
	 * distinguishable; assumed to match rb_is_full() -- confirm. */
	space = (rb->tail + sizeof(rb->data) - rb->head - 1) % sizeof(rb->data);
	if (len > space)
		len = space;

	while (copied < len) {
		chunk = len - copied;
		if (chunk > sizeof(rb->data) - rb->head)
			chunk = sizeof(rb->data) - rb->head;	/* stop at wrap point */
		memcpy(&rb->data[rb->head], buf + copied, chunk);
		rb->head = (rb->head + chunk) % sizeof(rb->data);
		copied += chunk;
	}
	spin_unlock_irq(&rb->lock);

	wake_up(&rb->not_empty);	/* data available: let consumers run */
	return (int)copied;
}
Further reading
- SRCU — sleepable RCU with read-side blocking
- Mutex — sleeping mutual exclusion
- IPC: Signals — signal delivery to waiting tasks
- io_uring — io_uring avoids wait queues entirely
include/linux/wait.h — full wait queue API