XDP: eXpress Data Path
Kernel bypass at the driver level: BPF programs on the receive fast path
What is XDP?
XDP (eXpress Data Path) runs a BPF program at the earliest possible point in the kernel network stack — inside the NIC driver, before skb allocation. This achieves near-DPDK speeds while staying in the kernel:
Packet arrival:
Without XDP:
NIC → DMA → driver alloc skb → kernel stack → socket → userspace
Latency: ~1-5µs, throughput: limited by skb overhead
With XDP:
NIC → DMA → XDP BPF program → (pass/drop/tx/redirect)
Latency: ~100-200ns, throughput: 10-50+ Mpps (million packets/sec)
DPDK (comparison):
NIC → userspace poll → DPDK app
Latency: ~50-100ns, throughput: 60+ Mpps
But: requires dedicated CPUs, no kernel network stack
XDP hook points
/* Three attachment modes: */
/* 1. Native XDP (fastest): runs in driver's RX path */
/* Supported by: mlx5, i40e, ixgbe, virtio_net, veth, etc. */
ip link set eth0 xdp obj xdp_prog.o sec xdp
/* 2. Generic XDP (skb-based): works on any driver, slower */
/* Runs after skb allocation, in netif_receive_skb() */
ip link set eth0 xdpgeneric obj xdp_prog.o sec xdp
/* 3. HW offload: runs on the NIC itself (very few NICs) */
ip link set eth0 xdpoffload obj xdp_prog.o sec xdp
XDP verdicts
/* BPF program returns one of these: */
XDP_DROP /* Drop the packet immediately (no skb alloc, no notification) */
XDP_PASS /* Continue normal kernel processing (alloc skb, pass up stack) */
XDP_TX /* Retransmit on the same interface (e.g., for reflection) */
XDP_REDIRECT /* Forward to another interface, CPU, or AF_XDP socket */
XDP_ABORTED /* Drop + trace point for debugging (treated like DROP) */
XDP program structure
/* xdp_prog.c */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
/* BPF map: track per-source-IP packet counts */
struct {
__uint(type, BPF_MAP_TYPE_HASH); /* hash map: O(1) lookup per packet */
__type(key, __u32); /* source IP (network byte order, as read from iphdr) */
__type(value, __u64); /* packet count */
__uint(max_entries, 65536); /* at most 64Ki distinct source IPs tracked */
} ip_counter SEC(".maps");
/* Blocklist map */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, __u32); /* blocked IP (network byte order) */
__type(value, __u8); /* presence-only entry: value is unused, lookup != NULL means blocked */
__uint(max_entries, 1024);
} blocklist SEC(".maps"); /* populated from userspace (e.g. via bpftool map update) */
SEC("xdp")
/*
 * xdp_firewall - drop packets from blocklisted IPv4 sources, count the rest.
 *
 * Verdicts: XDP_DROP for truncated or blocklisted packets, XDP_PASS
 * otherwise (non-IPv4 traffic always passes).  Every packet access is
 * bounds-checked against data_end, or the BPF verifier rejects the program.
 */
int xdp_firewall(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* Parse Ethernet header */
	struct ethhdr *eth = data;
	if ((void *)(eth + 1) > data_end)
		return XDP_DROP; /* truncated packet */
	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS; /* not IPv4, pass to stack */

	/* Parse IP header */
	struct iphdr *ip = (void *)(eth + 1);
	if ((void *)(ip + 1) > data_end)
		return XDP_DROP;

	__u32 src_ip = ip->saddr; /* network byte order */

	/* Check blocklist */
	if (bpf_map_lookup_elem(&blocklist, &src_ip))
		return XDP_DROP; /* blocked: drop silently */

	/* Count packets per source IP.  On first sight, insert a zero with
	 * BPF_NOEXIST so a counter concurrently created on another CPU is
	 * never overwritten (BPF_ANY here would lose its increments), then
	 * re-look-up and do the atomic increment on whichever entry won. */
	__u64 *count = bpf_map_lookup_elem(&ip_counter, &src_ip);
	if (!count) {
		__u64 zero = 0;
		bpf_map_update_elem(&ip_counter, &src_ip, &zero, BPF_NOEXIST);
		count = bpf_map_lookup_elem(&ip_counter, &src_ip);
	}
	if (count)
		__sync_fetch_and_add(count, 1);
	return XDP_PASS;
}
/* Must be GPL-compatible: several BPF helpers are gpl_only and the
 * kernel refuses to load programs using them under other licenses. */
char _license[] SEC("license") = "GPL";
# Compile:
clang -O2 -target bpf -c xdp_prog.c -o xdp_prog.o
# Attach:
ip link set eth0 xdp obj xdp_prog.o sec xdp
# Check:
ip link show eth0
# 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp ...
# xdp id 42 ← program loaded
# Detach:
ip link set eth0 xdp off
Packet modification (NAT)
SEC("xdp")
/*
 * xdp_dnat - rewrite the IPv4 destination address and patch the header
 * checksum incrementally (RFC 1624: HC' = ~(~HC + ~m + m')).
 *
 * bpf_l3_csum_replace()/bpf_l4_csum_replace() are TC-only helpers, so the
 * checksum is updated by hand.  The one's-complement sum is byte-order
 * independent as long as every 16-bit word is used unswapped, so no
 * ntohs/htons is needed (and those libc names don't exist in BPF anyway).
 */
int xdp_dnat(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	struct ethhdr *eth = data;
	if ((void *)(eth + 1) > data_end)
		return XDP_PASS;
	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS;

	struct iphdr *ip = (void *)(eth + 1);
	if ((void *)(ip + 1) > data_end)
		return XDP_PASS;

	/* Rewrite destination IP (save old value first for checksum diff) */
	__u32 old_dst = ip->daddr;
	__u32 new_dst = bpf_htonl(0xc0a80101); /* 192.168.1.1 */
	ip->daddr = new_dst;

	/* Incremental update: add ~old and new as 16-bit words into a 32-bit
	 * accumulator, then fold the carries twice.  (The original version
	 * accumulated in a __u16, so `csum >> 16` was always 0 and carries
	 * were silently dropped, corrupting the checksum.) */
	__u32 csum = (~ip->check) & 0xffff;
	csum += (~old_dst & 0xffff) + ((~old_dst >> 16) & 0xffff);
	csum += (new_dst & 0xffff) + (new_dst >> 16);
	csum = (csum & 0xffff) + (csum >> 16); /* fold carry */
	csum = (csum & 0xffff) + (csum >> 16); /* fold again (carry of the fold) */
	ip->check = ~csum & 0xffff;

	return XDP_PASS; /* or XDP_TX to send back */
}
XDP redirect
Between interfaces
/* Redirect packet to eth1: */
SEC("xdp")
/* Unconditionally forward every frame out another interface.
 * bpf_redirect() returns XDP_REDIRECT on success; the actual
 * transmission happens after the program returns. */
int xdp_redirect_iface(struct xdp_md *ctx)
{
	const int target_ifindex = 3; /* eth1's ifindex on this host */

	return bpf_redirect(target_ifindex, 0 /* flags: must be 0 */);
}
DEVMAP: batch redirect
/* High-performance redirect via devmap */
struct {
__uint(type, BPF_MAP_TYPE_DEVMAP); /* redirect targets; enables TX batching/flushing */
__uint(key_size, sizeof(int)); /* logical port number chosen by the program */
__uint(value_size, sizeof(int)); /* target net_device ifindex */
__uint(max_entries, 256);
} tx_ports SEC(".maps");
SEC("xdp")
int xdp_devmap_redirect(struct xdp_md *ctx)
{
int port = 0; /* devmap key */
/* Third argument: since kernel 5.6 the low bits of `flags` select the
 * fallback verdict when the map lookup fails — here, drop the packet
 * instead of aborting if slot 0 is unpopulated. */
return bpf_redirect_map(&tx_ports, port, XDP_DROP);
}
# Populate devmap (userspace):
/* Fetch the pinned devmap and point slot 0 at eth1. */
int map_fd = bpf_obj_get("/sys/fs/bpf/tx_ports");
int key = 0; /* devmap slot used by the XDP program (was undeclared) */
int ifindex = if_nametoindex("eth1");
bpf_map_update_elem(map_fd, &key, &ifindex, BPF_ANY);
CPUMAP: steer to specific CPU
/* Redirect to CPU for further processing (after XDP) */
/* Redirect to CPU for further processing (after XDP) */
struct {
__uint(type, BPF_MAP_TYPE_CPUMAP);
__uint(key_size, sizeof(__u32)); /* target CPU number */
__uint(value_size, sizeof(struct bpf_cpumap_val)); /* per-CPU queue size (+ optional chained prog) */
__uint(max_entries, 12); /* one slot per steerable CPU */
} cpumap SEC(".maps");
SEC("xdp")
int xdp_cpumap_redirect(struct xdp_md *ctx)
{
/* NOTE(review): despite the original "RSS-based" comment, this steers by
 * the *current* CPU id modulo 4, not by a packet hash — real RSS-style
 * steering would hash the flow 5-tuple to pick the CPU. */
__u32 cpu = bpf_get_smp_processor_id() % 4;
return bpf_redirect_map(&cpumap, cpu, 0);
}
AF_XDP: zero-copy to userspace
AF_XDP (eXpress Data Path socket) moves packets directly to userspace memory without any kernel copying:
NIC DMA → UMEM (userspace memory) → XSK socket → userspace app
↑ zero copy! kernel never touches packet data
Setup
/* Userspace: create AF_XDP socket */
#include <linux/if_xdp.h>
#include <sys/socket.h>

/* 0. Create the XSK socket — every call below operates on this fd
 *    (the original snippet used xsk_fd without ever creating it): */
int xsk_fd = socket(AF_XDP, SOCK_RAW, 0);

/* 1. Allocate UMEM (packet buffer pool): */
void *umem_area = mmap(NULL, UMEM_SIZE, PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0);
struct xdp_umem_reg umem_reg = {
    .addr = (uint64_t)umem_area,
    .len = UMEM_SIZE,
    .chunk_size = FRAME_SIZE, /* 4096 bytes per frame */
    .headroom = 0,
};
setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &umem_reg, sizeof(umem_reg));

/* 2. Create rings (FILL/COMPLETION/RX/TX): */
int ring_size = 2048;
setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(ring_size));
setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(ring_size));

/* 3. mmap the rings: */
struct xdp_mmap_offsets off;
socklen_t optlen = sizeof(off); /* was undeclared in the original */
getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
void *fill_ring = mmap(NULL, off.fr.desc + ring_size * sizeof(uint64_t),
                       PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
                       xsk_fd, XDP_UMEM_PGOFF_FILL_RING);

/* 4. Bind to interface queue: */
struct sockaddr_xdp sxdp = {
    .sxdp_family = AF_XDP,
    .sxdp_ifindex = if_nametoindex("eth0"),
    .sxdp_queue_id = 0, /* NIC queue 0 */
    .sxdp_flags = XDP_ZEROCOPY, /* requires native XDP driver support */
};
bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
XDP program for AF_XDP redirect
/* BPF map of AF_XDP sockets */
/* BPF map of AF_XDP sockets */
struct {
__uint(type, BPF_MAP_TYPE_XSKMAP); /* values are XSK socket fds, set from userspace */
__uint(key_size, sizeof(int)); /* keyed by RX queue index */
__uint(value_size, sizeof(int));
__uint(max_entries, 64); /* up to 64 NIC queues */
} xsks_map SEC(".maps");
SEC("xdp")
/* One AF_XDP socket per RX queue: use the frame's queue index as the
 * XSKMAP key and hand the frame to the bound socket when one exists. */
int xdp_sock_prog(struct xdp_md *ctx)
{
	int qid = ctx->rx_queue_index;

	/* No socket bound to this queue — let the kernel stack have it. */
	if (!bpf_map_lookup_elem(&xsks_map, &qid))
		return XDP_PASS;

	return bpf_redirect_map(&xsks_map, qid, 0);
}
XDP vs alternatives
| | XDP | DPDK | kernel TC | iptables |
|---|---|---|---|---|
| Throughput | 50+ Mpps | 60+ Mpps | 10+ Mpps | 1-5 Mpps |
| Latency | ~200ns | ~100ns | ~1µs | ~10µs |
| Kernel stack access | Yes (XDP_PASS) | No | Yes | Yes |
| CPU dedication | No | Yes | No | No |
| Programmability | BPF | C | BPF/C | limited |
| Use case | DDoS mitigation, LB, firewall | High-freq trading, NFV | QoS, TC filter | Generic firewall |
Observability
# XDP program info:
bpftool net show dev eth0
# xdp:
# id 42 name xdp_firewall flags 0x0
# XDP stats (via driver):
ethtool -S eth0 | grep xdp
# rx_xdp_drop: 1234567
# rx_xdp_pass: 9876543
# rx_xdp_tx: 0
# rx_xdp_redirect: 0
# BPF map inspection:
bpftool map dump id 5 # dump blocklist map
# XDP program tracing:
bpftrace -e '
tracepoint:xdp:xdp_exception
{ printf("XDP exception: ifindex=%d, prog_id=%d, act=%d\n",
args->ifindex, args->prog_id, args->act); }'
# Perf event for XDP drop:
perf record -e xdp:xdp_exception -ag -- sleep 10
perf script
Further reading
- BPF Networking — TC BPF, sockmap
- sk_buff: The Network Buffer — XDP_PASS creates sk_buff
- NVMe Driver — similar DMA/ring buffer pattern
- IRQ Affinity — NIC queue/CPU affinity for XDP
- net/core/filter.c — BPF network filter infrastructure
- drivers/net/ethernet/intel/i40e/i40e_xsk.c — XDP in i40e driver
- samples/bpf/xdpsock_user.c — AF_XDP userspace example
- tools/lib/bpf/xsk.c — libbpf AF_XDP helper