AF_PACKET: Raw Socket Packet Capture
How tcpdump, Wireshark, and BPF filtering work at the packet capture layer
What is AF_PACKET?
AF_PACKET sockets give userspace direct access to the network device's frame layer (Layer 2). Applications receive raw Ethernet frames (including headers) before the kernel's network stack processes them.
Used by:
- tcpdump and Wireshark — packet capture
- arping, arpwatch — ARP utilities
- Network boot agents (DHCP, PXE)
- Custom protocol implementations
- Network intrusion detection systems
Creating an AF_PACKET socket
#include <stdint.h>           /* uint8_t */
#include <stdio.h>            /* printf(), perror() */
#include <stdlib.h>           /* exit() */
#include <string.h>           /* memcpy() */
#include <poll.h>             /* poll() */
#include <sys/mman.h>         /* mmap() */
#include <sys/socket.h>
#include <arpa/inet.h>        /* htons(), ntohs() */
#include <net/if.h>           /* if_nametoindex() */
#include <net/ethernet.h>
#include <linux/if_packet.h>
/* SOCK_RAW: receive raw Ethernet frames with headers */
/* ETH_P_ALL: capture all protocols */
int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
/* Bind to a specific interface: */
struct sockaddr_ll sll = {
.sll_family = AF_PACKET,
.sll_protocol = htons(ETH_P_ALL),
.sll_ifindex = if_nametoindex("eth0"),
};
bind(sock, (struct sockaddr *)&sll, sizeof(sll));
/* Set promiscuous mode: receive all frames, even non-unicast: */
struct packet_mreq mreq = {
.mr_ifindex = if_nametoindex("eth0"),
.mr_type = PACKET_MR_PROMISC,
};
setsockopt(sock, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
Reading packets
uint8_t buf[65536];
struct sockaddr_ll src_addr;
socklen_t addr_len = sizeof(src_addr);
ssize_t n = recvfrom(sock, buf, sizeof(buf), 0,
(struct sockaddr *)&src_addr, &addr_len);
/* buf[0..13]: Ethernet header
buf[0..5]: destination MAC
buf[6..11]: source MAC
buf[12..13]: EtherType (0x0800=IPv4, 0x0806=ARP, 0x86DD=IPv6)
buf[14..n-1]: payload */
struct ethhdr *eth = (struct ethhdr *)buf;
printf("src: %02x:%02x:%02x:%02x:%02x:%02x → "
"dst: %02x:%02x:%02x:%02x:%02x:%02x "
"proto: 0x%04x\n",
eth->h_source[0], eth->h_source[1], eth->h_source[2],
eth->h_source[3], eth->h_source[4], eth->h_source[5],
eth->h_dest[0], eth->h_dest[1], eth->h_dest[2],
eth->h_dest[3], eth->h_dest[4], eth->h_dest[5],
ntohs(eth->h_proto));
Classic BPF socket filters
tcpdump compiles its filters to BPF bytecode and attaches them to the socket, so the kernel only copies matching packets to userspace:
#include <linux/filter.h>
/*
 * Hand-written classic BPF program approximating "tcp port 80".
 * It is a simplification of what `tcpdump -d tcp port 80` prints: it
 * assumes an IPv4 frame with a 20-byte IP header (no options) and does
 * not check the EtherType or IP fragmentation.
 *
 * Each entry is { code, jt, jf, k }. Jump offsets are relative to the
 * NEXT instruction, and every target must lie inside the program;
 * otherwise the kernel rejects the filter when it is attached.
 */
struct sock_filter filter[] = {
    /* 0: load IP protocol byte (Ethernet 14 + IP offset 9 = 23) */
    { BPF_LD | BPF_B | BPF_ABS, 0, 0, 23 },
    /* 1: is it TCP (6)?  yes -> 2, no -> 7 (drop) */
    { BPF_JMP | BPF_JEQ | BPF_K, 0, 5, IPPROTO_TCP },
    /* 2: load source port (14 + 20 + 0 = 34) */
    { BPF_LD | BPF_H | BPF_ABS, 0, 0, 34 },
    /* 3: is it port 80?  yes -> 6 (match), no -> 4 */
    { BPF_JMP | BPF_JEQ | BPF_K, 2, 0, 80 },
    /* 4: load destination port (14 + 20 + 2 = 36) */
    { BPF_LD | BPF_H | BPF_ABS, 0, 0, 36 },
    /* 5: is it port 80?  yes -> 6 (match), no -> 7 (drop) */
    { BPF_JMP | BPF_JEQ | BPF_K, 0, 1, 80 },
    /* 6: match: return full packet */
    { BPF_RET | BPF_K, 0, 0, 0xffffffff },
    /* 7: no match: drop */
    { BPF_RET | BPF_K, 0, 0, 0 },
};
struct sock_fprog fprog = {
.len = sizeof(filter) / sizeof(filter[0]),
.filter = filter,
};
setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
/* Now only TCP port 80 frames reach recvfrom() */
PACKET_MMAP: zero-copy ring buffer
For high-performance capture, PACKET_MMAP maps a ring buffer shared between kernel and userspace:
/* Set up TX/RX ring: */
struct tpacket_req3 tp = {
.tp_block_size = 1 << 22, /* 4MB per block */
.tp_block_nr = 64, /* 64 blocks = 256MB total */
.tp_frame_size = 2048, /* max frame size */
.tp_frame_nr = (1<<22) / 2048 * 64,
/* One frame per packet */
/* V3 features: */
.tp_retire_blk_tov = 60, /* flush block after 60ms */
.tp_sizeof_priv = 0,
.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH,
};
setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &tp, sizeof(tp));
/* mmap the ring buffer: */
size_t ring_size = tp.tp_block_size * tp.tp_block_nr;
void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_LOCKED, sock, 0);
/* Process blocks: */
uint8_t *block = ring; /* ring[0] is block 0 */
struct tpacket_block_desc *bh;
while (1) {
bh = (struct tpacket_block_desc *)block;
/* Wait for block to become ready: */
while (!(bh->hdr.bh1.block_status & TP_STATUS_USER))
poll(&pfd, 1, -1);
/* Walk frames in this block: */
struct tpacket3_hdr *ppd = (void *)block + bh->hdr.bh1.offset_to_first_pkt;
for (int i = 0; i < bh->hdr.bh1.num_pkts; i++) {
uint8_t *pkt = (uint8_t *)ppd + ppd->tp_mac;
size_t len = ppd->tp_snaplen;
/* Process packet at pkt[0..len-1] */
ppd = (void *)ppd + ppd->tp_next_offset;
}
/* Return block to kernel: */
bh->hdr.bh1.block_status = TP_STATUS_KERNEL;
block += tp.tp_block_size;
if (block >= (uint8_t *)ring + ring_size)
block = ring;
}
Sending raw packets
/* SOCK_RAW: must provide Ethernet header manually */
uint8_t frame[1500];
struct ethhdr *eth = (struct ethhdr *)frame;
memcpy(eth->h_dest, dest_mac, ETH_ALEN);
memcpy(eth->h_source, src_mac, ETH_ALEN);
eth->h_proto = htons(ETH_P_IP);
/* Fill in IP + transport headers at frame[14..] */
/* ... */
struct sockaddr_ll dst = {
.sll_family = AF_PACKET,
.sll_ifindex = if_nametoindex("eth0"),
.sll_halen = ETH_ALEN,
};
memcpy(dst.sll_addr, dest_mac, ETH_ALEN);
sendto(sock, frame, frame_len, 0,
(struct sockaddr *)&dst, sizeof(dst));
/* SOCK_DGRAM: kernel fills Ethernet header automatically */
int dgram_sock = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
/* sendto() with sll_addr set to target MAC */
Kernel implementation
Packet socket registration
/* net/packet/af_packet.c */
/* Operation table for PF_PACKET sockets: each member names the handler
   that implements the corresponding socket operation. */
static const struct proto_ops packet_ops = {
    .family  = PF_PACKET,

    /* data path */
    .sendmsg = packet_sendmsg,
    .recvmsg = packet_recvmsg,

    /* readiness notification and ring-buffer mapping */
    .poll    = packet_poll,
    .mmap    = packet_mmap,

    /* addressing */
    .bind    = packet_bind,
    /* ... */
};
/* When a packet arrives (from netif_receive_skb): */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk = pt->af_packet_priv;
struct packet_sock *po = pkt_sk(sk);
/* Run the BPF filter: */
if (po->has_vnet_hdr || skb_copy_datagram_from_iov(...)) {
if (run_filter(skb, sk, snaplen) == 0)
goto drop;
}
/* Copy the sk_buff to the socket receive queue */
if (po->has_rx_ring) {
/* PACKET_MMAP path: write directly to ring buffer */
packet_rcv_ring(skb, sk, snaplen);
} else {
/* Normal path: enqueue to sk->sk_receive_queue */
__skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
}
return NET_RX_SUCCESS;
}
tcpdump under the hood
# Trace the system calls tcpdump makes while capturing on eth0
# (stderr merged into stdout, first 30 lines only):
strace tcpdump -i eth0 'tcp port 80' 2>&1 | head -n 30
# socket(AF_PACKET, SOCK_RAW|SOCK_CLOEXEC, 768) = 3 (ETH_P_ALL=0x0003; 768=htons(ETH_P_ALL) in little-endian)
# setsockopt(3, SOL_SOCKET, SO_ATTACH_FILTER, ...) = 0
# bind(3, {sa_family=AF_PACKET, sll_protocol=...}) = 0
# setsockopt(3, SOL_PACKET, PACKET_VERSION, ...) = 0
# setsockopt(3, SOL_PACKET, PACKET_RX_RING, ...) = 0
# mmap(NULL, ..., PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = 0x7f...
# poll([{fd=3, events=POLLIN|POLLERR}], 1, -1) = 1

# Print the BPF bytecode for a filter expression instead of capturing:
tcpdump -d 'tcp port 80'
# (000) ldh [12]
# (001) jeq #0x86dd jt 2 jf 8
# ...

# How the libpcap API maps onto the calls above:
# pcap_open_live() → socket(AF_PACKET, SOCK_RAW, ...)
# pcap_setfilter() → setsockopt(SO_ATTACH_FILTER)
# pcap_loop() → poll() + mmap ring
Performance
# List open packet sockets, one per line. There is no drop column here;
# Rmem is the memory currently queued on the socket's receive buffer.
cat /proc/net/packet
# sk RefCnt Type Proto Iface R Rmem User Inode
# ...
# Per-socket memory usage and drop counter for packet sockets:
# the trailing d<N> inside skmem:(...) counts packets dropped by the socket.
ss --packet --processes --memory
# Use PACKET_STATISTICS:
struct tpacket_stats stats;
socklen_t len = sizeof(stats);
getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
printf("received: %u, dropped: %u\n", stats.tp_packets, stats.tp_drops);
# High-performance: use PACKET_MMAP v3 ring (avoids per-packet syscalls)
# Even higher: use AF_XDP for kernel bypass
Further reading
- XDP — high-performance in-kernel packet processing in the driver receive path
- AF_XDP — zero-copy userspace packet processing
- Netfilter — packet filtering at a higher layer
- sk_buff — packet data structure
- BPF Networking — tc/cgroup BPF
- net/packet/af_packet.c — AF_PACKET implementation
- man 7 packet — detailed man page