Clocksource and Clockevent Drivers

Hardware timers in the kernel: TSC, HPET, clockevents, and tick_device

Two clock abstractions

The kernel separates time into two concepts:

clocksource:   reads current time (monotonic counter)
               "What time is it right now?"
               Example: TSC counter, HPET counter

clockevent:    programs a future interrupt (one-shot or periodic)
               "Wake me up in N nanoseconds"
               Example: Local APIC timer, HPET, ARM generic timer

Together they implement the kernel's timekeeping:

clocksource  →  timekeeping (clock_gettime, ktime_get)
clockevent   →  tick device  →  scheduler tick, hrtimers, timers

struct clocksource

/* include/linux/clocksource.h */
struct clocksource {
    /*
     * read() returns a 64-bit cycle counter.
     * The counter may wrap around — timekeeping handles this.
     */
    u64 (*read)(struct clocksource *cs);

    u64             mask;       /* bitmask to handle counter wrap */
    u32             mult;       /* cycles to ns: ns = cycles * mult >> shift */
    u32             shift;
    u64             max_idle_ns; /* max time between reads before losing accuracy */
    u32             maxadj;
    u64             max_cycles; /* max cycles before ns calculation overflows */
    const char      *name;
    struct list_head list;
    int             rating;     /* quality: 400-499=perfect, 300-399=desired, 200-299=good, 100-199=base */
    enum clocksource_ids  id;
    unsigned long   flags;
    /* CLOCK_SOURCE_IS_CONTINUOUS: counter never jumps */
    /* CLOCK_SOURCE_VALID_FOR_HRES: can be used for high-res timers */

    /* Which VDSO clock mode to use for fast userspace reads: */
    enum vdso_clock_mode vdso_clock_mode;
    void (*suspend)(struct clocksource *cs);
    void (*resume)(struct clocksource *cs);
};

TSC (Time Stamp Counter)

The TSC is the primary clocksource on modern x86:

/* arch/x86/kernel/tsc.c */
static struct clocksource clocksource_tsc = {
    .name                   = "tsc",
    .rating                 = 300,
    .read                   = read_tsc,
    .mask                   = CLOCKSOURCE_MASK(64),
    .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                              CLOCK_SOURCE_VALID_FOR_HRES,
};

static u64 read_tsc(struct clocksource *cs)
{
    return (u64)rdtsc_ordered();  /* RDTSC instruction */
}

/* Convert TSC cycles to nanoseconds: */
/* ns = (cycles * mult) >> shift */
/* mult and shift are calibrated at boot against HPET or PIT */
# Check the current clocksource:
cat /sys/devices/system/clocksource/clocksource0/current_clocksource
# tsc

# Available clocksources (sorted by preference):
cat /sys/devices/system/clocksource/clocksource0/available_clocksource
# tsc hpet acpi_pm

# TSC frequency:
dmesg | grep "tsc: Detected"
# tsc: Detected 3600.000 MHz processor

# TSC stability:
dmesg | grep tsc | grep -i "unstable\|skew\|reliable"

TSC reliability issues

# TSC may be unreliable:
# 1. Old multi-socket systems: TSCs not synchronized between sockets
# 2. CPU frequency scaling: TSC rate changes with frequency (older CPUs)
# 3. Virtualization: hypervisor may not provide stable TSC

# Modern CPUs have invariant TSC (constant rate regardless of P-state):
grep "constant_tsc\|nonstop_tsc" /proc/cpuinfo | head -2
# flags: ... constant_tsc nonstop_tsc ...

# Force a different clocksource:
echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource

Registering a clocksource

/* For a custom hardware counter (e.g., in a SoC driver): */
#include <linux/clocksource.h>

static u64 my_timer_read(struct clocksource *cs)
{
    return readl(my_timer_base + TIMER_COUNT_REG);
}

static struct clocksource my_clocksource = {
    .name    = "my-hw-timer",
    .rating  = 200,
    .read    = my_timer_read,
    .mask    = CLOCKSOURCE_MASK(32),
    .flags   = CLOCK_SOURCE_IS_CONTINUOUS,
};

/* In probe(): */
clocksource_register_hz(&my_clocksource, 24000000); /* 24 MHz */
/* Computes mult/shift from frequency */

struct clock_event_device

/* include/linux/clockchips.h */
struct clock_event_device {
    void (*event_handler)(struct clock_event_device *);
    /* Called on each timer interrupt (set by tick layer) */

    int  (*set_next_event)(unsigned long evt,
                           struct clock_event_device *);
    /* Program the hardware to fire in `evt` cycles */

    int  (*set_next_ktime)(ktime_t expires,
                           struct clock_event_device *);

    ktime_t             next_event;
    u64                 max_delta_ns;
    u64                 min_delta_ns;
    u32                 mult;
    u32                 shift;

    enum clock_event_state  state_use_accessors;
    unsigned int        features;
    /* CLOCK_EVT_FEAT_PERIODIC: supports periodic mode */
    /* CLOCK_EVT_FEAT_ONESHOT:  supports one-shot mode */
    /* CLOCK_EVT_FEAT_C3STOP:   stops in deep idle */

    const char          *name;
    int                 rating;
    int                 irq;
    int                 bound_on;
    const struct cpumask *cpumask;
    struct list_head    list;
};

Local APIC timer (per-CPU clockevent)

/* arch/x86/kernel/apic/apic.c */
static struct clock_event_device lapic_clockevent = {
    .name                   = "lapic",
    .features               = CLOCK_EVT_FEAT_PERIODIC |
                              CLOCK_EVT_FEAT_ONESHOT  |
                              CLOCK_EVT_FEAT_C3STOP   |
                              CLOCK_EVT_FEAT_DUMMY,
    .shift                  = 32,
    .set_state_shutdown     = lapic_timer_shutdown,
    .set_state_periodic     = lapic_timer_set_periodic,
    .set_state_oneshot      = lapic_timer_set_oneshot,
    .set_next_event         = lapic_next_event,
    .broadcast              = lapic_timer_broadcast,
    .rating                 = 100,
    .irq                    = -1,
};

static int lapic_next_event(unsigned long delta,
                             struct clock_event_device *evt)
{
    /* Program the APIC timer to fire in `delta` cycles */
    apic_write(APIC_TMICT, delta);
    return 0;
}

tick device and NOHZ

Each CPU has a tick device — the clockevent used for the scheduling tick:

/* kernel/time/tick-common.c */
struct tick_device {
    struct clock_event_device *evtdev;
    enum tick_device_mode      mode;  /* TICKDEV_MODE_PERIODIC or ONESHOT */
};

DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
# In periodic mode: tick fires every 1/HZ seconds (HZ=250 → 4ms)
grep "^CONFIG_HZ=" /boot/config-$(uname -r)
# CONFIG_HZ=250

# In NOHZ (tickless) mode: tick is suppressed when CPU is idle
# Reduces power consumption and improves real-time latency

# Check NOHZ configuration and mode:
grep "^CONFIG_NO_HZ" /boot/config-$(uname -r)
# or inspect per-CPU tick devices (event_handler shows tick_nohz_handler):
cat /proc/timer_list | grep -A5 "Tick Device" | head -20

# NOHZ_FULL (adaptive tick): suppress tick even when a task is running
# Requires the nohz_full= boot param (often combined with isolcpus= for RT/HPC)

Timekeeping: clocksource → wall clock

/* kernel/time/timekeeping.c */
struct timekeeper {
    struct tk_read_base     tkr_mono;   /* monotonic clock */
    struct tk_read_base     tkr_raw;    /* raw hardware clock */

    u64                     xtime_sec;  /* real wall-clock seconds */
    unsigned long           ktime_sec;  /* monotonic seconds */
    struct timespec64       wall_to_monotonic; /* offset */
    ktime_t                 offs_real;  /* monotonic → realtime offset */
    ktime_t                 offs_boot;  /* boot time offset */
    ktime_t                 offs_tai;   /* TAI offset */

    u32                     tai_offset; /* TAI - UTC in seconds */
    u32                     clock_was_set_seq;
    u8                      cs_was_changed_seq;
    ktime_t                 next_leap_ktime;
};

/* Reading current time (seqcount protects against concurrent updates): */
ktime_t ktime_get(void)
{
    struct timekeeper *tk = &tk_core.timekeeper;
    unsigned int seq;
    ktime_t base;
    u64 nsecs;

    do {
        seq = read_seqcount_begin(&tk_core.seq);
        base  = tk->tkr_mono.base;
        nsecs = timekeeping_get_ns(&tk->tkr_mono);
    } while (read_seqcount_retry(&tk_core.seq, seq));

    return ktime_add_ns(base, nsecs);
}

Observing timekeeping

# Current time sources and offsets:
cat /proc/timer_list

# NTP synchronization:
timedatectl status
# System clock synchronized: yes
# NTP service: active
# RTC in local TZ: no

# Clock error estimation (NTP):
adjtimex -p | grep -E "offset|freq|error"

# High-resolution timer benchmark:
cyclictest -n -m -p99 -i200 -l1000000
# Tests: time from sleep request to actual wakeup (uses hrtimer)

# Measure per-second timer interval drift with bpftrace:
bpftrace -e '
BEGIN { @start = nsecs; }
interval:s:1 {
    printf("elapsed: %lld ns\n", nsecs - @start);
    @start = nsecs;
}'

Further reading

  • hrtimers — high-resolution timers using clockevent
  • POSIX Timers — user-facing timer API
  • cpuidle and C-states — CLOCK_EVT_FEAT_C3STOP
  • vDSO — fast clock_gettime using clocksource
  • Real-Time Tuning — tick suppression for RT
  • kernel/time/timekeeping.c — timekeeping core
  • arch/x86/kernel/tsc.c — TSC clocksource
  • arch/x86/kernel/apic/apic.c — APIC clockevent