Skip to content

Linux Kernel Internals

AF_PACKET Raw Sockets

AF_PACKET: Raw Socket Packet Capture

How tcpdump, Wireshark, and BPF filtering work at the packet capture layer

What is AF_PACKET?

AF_PACKET sockets give userspace direct access to the network device's frame layer (Layer 2). Applications receive a copy of each raw Ethernet frame (including headers) before the kernel's IP and transport layers process it; the normal network stack still handles the original frame.

Used by:

- tcpdump and Wireshark — packet capture
- arping, arpwatch — ARP utilities
- Network boot agents (DHCP, PXE)
- Custom protocol implementations
- Network intrusion detection systems

Creating an AF_PACKET socket

#include <arpa/inet.h>        /* htons(), ntohs() */
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <net/if.h>           /* if_nametoindex() */
#include <poll.h>             /* poll(), struct pollfd */
#include <sys/mman.h>         /* mmap() */
#include <sys/socket.h>

/* SOCK_RAW: receive raw Ethernet frames with headers */
/* ETH_P_ALL: capture all protocols */
/* Opening a packet socket requires CAP_NET_RAW; socket() returns -1 otherwise. */
int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

/* Resolve the interface index once and reuse it below.
   if_nametoindex() returns 0 when the interface does not exist, and an
   sll_ifindex of 0 means "any interface" to bind(), so a failed lookup
   would silently capture on every interface; check for 0 in real code. */
int ifindex = (int)if_nametoindex("eth0");

/* Bind to a specific interface: */
struct sockaddr_ll sll = {
    .sll_family   = AF_PACKET,
    .sll_protocol = htons(ETH_P_ALL),
    .sll_ifindex  = ifindex,
};
bind(sock, (struct sockaddr *)&sll, sizeof(sll));

/* Set promiscuous mode: the NIC stops filtering on destination MAC, so
   frames addressed to other hosts are delivered as well. The membership
   is dropped automatically when the socket is closed. */
struct packet_mreq mreq = {
    .mr_ifindex = ifindex,
    .mr_type    = PACKET_MR_PROMISC,
};
setsockopt(sock, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

Reading packets

uint8_t buf[65536];
struct sockaddr_ll src_addr;
socklen_t addr_len = sizeof(src_addr);

ssize_t n = recvfrom(sock, buf, sizeof(buf), 0,
                     (struct sockaddr *)&src_addr, &addr_len);

/* buf[0..13]: Ethernet header
   buf[0..5]:  destination MAC
   buf[6..11]: source MAC
   buf[12..13]: EtherType (0x0800=IPv4, 0x0806=ARP, 0x86DD=IPv6)
   buf[14..n-1]: payload */

struct ethhdr *eth = (struct ethhdr *)buf;
printf("src: %02x:%02x:%02x:%02x:%02x:%02x → "
       "dst: %02x:%02x:%02x:%02x:%02x:%02x "
       "proto: 0x%04x\n",
       eth->h_source[0], eth->h_source[1], eth->h_source[2],
       eth->h_source[3], eth->h_source[4], eth->h_source[5],
       eth->h_dest[0], eth->h_dest[1], eth->h_dest[2],
       eth->h_dest[3], eth->h_dest[4], eth->h_dest[5],
       ntohs(eth->h_proto));

Classic BPF socket filters

tcpdump compiles its filters to BPF bytecode and attaches them to the socket, so the kernel only copies matching packets to userspace:

#include <linux/filter.h>

/* Hand-written, simplified classic-BPF program for "tcp port 80".
   It assumes IPv4 with a 20-byte IP header (no options) and does not skip
   fragments; run `tcpdump -d tcp port 80` to see the complete program that
   libpcap generates.

   Each instruction is { code, jt, jf, k }. jt/jf are jump distances counted
   from the NEXT instruction, and every target must lie inside the program
   or the kernel rejects the filter with EINVAL. */
struct sock_filter filter[] = {
    /* 0: Load EtherType (2 bytes at offset 12) */
    { BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12 },
    /* 1: Is it IPv4 (0x0800)?  no -> 9 (drop) */
    { BPF_JMP | BPF_JEQ | BPF_K, 0, 7, 0x0800 },
    /* 2: Load IP protocol (offset 23 = 14-byte Ethernet header + 9) */
    { BPF_LD  | BPF_B   | BPF_ABS, 0, 0, 23 },
    /* 3: Is it TCP (6)?  no -> 9 (drop) */
    { BPF_JMP | BPF_JEQ | BPF_K, 0, 5, IPPROTO_TCP },
    /* 4: Load source port (offset 34 = 14 + 20) */
    { BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 34 },
    /* 5: Is it port 80?  yes -> 8 (accept) */
    { BPF_JMP | BPF_JEQ | BPF_K, 2, 0, 80 },
    /* 6: Load destination port (offset 36) */
    { BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 36 },
    /* 7: Is it port 80?  yes -> 8 (accept), no -> 9 (drop) */
    { BPF_JMP | BPF_JEQ | BPF_K, 0, 1, 80 },
    /* 8: Match: return full packet */
    { BPF_RET | BPF_K, 0, 0, 0xffffffff },
    /* 9: No match: drop */
    { BPF_RET | BPF_K, 0, 0, 0 },
};

struct sock_fprog fprog = {
    .len    = sizeof(filter) / sizeof(filter[0]),
    .filter = filter,
};
setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
/* Now only IPv4 TCP port 80 frames reach recvfrom() */

PACKET_MMAP: zero-copy ring buffer

For high-performance capture, PACKET_MMAP maps a ring buffer shared between kernel and userspace:

/* Select TPACKET_V3 BEFORE creating the ring. The socket defaults to
   TPACKET_V1, in which case the kernel would build a V1 frame ring and the
   block-based walk below would misinterpret it. */
int version = TPACKET_V3;
setsockopt(sock, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

/* Set up the RX ring: */
struct tpacket_req3 tp = {
    .tp_block_size = 1 << 22,   /* 4 MiB per block */
    .tp_block_nr   = 64,         /* 64 blocks = 256 MiB total */
    .tp_frame_size = 2048,       /* still validated by the kernel; V3 packs
                                    variable-length packets into each block */
    .tp_frame_nr   = (1 << 22) / 2048 * 64,
    /* must equal (block_size / frame_size) * block_nr */

    /* V3 features: */
    .tp_retire_blk_tov = 60,     /* hand a partly filled block to userspace
                                    after 60ms */
    .tp_sizeof_priv    = 0,
    .tp_feature_req_word = TP_FT_REQ_FILL_RXHASH,
};
setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &tp, sizeof(tp));

/* mmap the ring buffer (returns MAP_FAILED on error): */
size_t ring_size = (size_t)tp.tp_block_size * tp.tp_block_nr;
void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_LOCKED, sock, 0);

/* poll() on the socket wakes up when the kernel retires a block: */
struct pollfd pfd = {
    .fd     = sock,
    .events = POLLIN | POLLERR,
};

/* Process blocks in ring order: */
uint8_t *block = ring;  /* the ring starts with block 0 */

while (1) {
    struct tpacket_block_desc *bh = (struct tpacket_block_desc *)block;

    /* Wait until the kernel hands this block to userspace: */
    while (!(bh->hdr.bh1.block_status & TP_STATUS_USER))
        poll(&pfd, 1, -1);

    /* Walk the packets in this block. Byte offsets are applied through
       uint8_t * because arithmetic on void * is a GNU extension. */
    struct tpacket3_hdr *ppd =
        (struct tpacket3_hdr *)(block + bh->hdr.bh1.offset_to_first_pkt);
    for (uint32_t i = 0; i < bh->hdr.bh1.num_pkts; i++) {
        uint8_t *pkt = (uint8_t *)ppd + ppd->tp_mac;
        size_t   len = ppd->tp_snaplen;
        /* Process packet at pkt[0..len-1] */

        ppd = (struct tpacket3_hdr *)((uint8_t *)ppd + ppd->tp_next_offset);
    }

    /* Return block to kernel: */
    bh->hdr.bh1.block_status = TP_STATUS_KERNEL;

    /* Advance to the next block, wrapping at the end of the ring: */
    block += tp.tp_block_size;
    if (block >= (uint8_t *)ring + ring_size)
        block = ring;
}

Sending raw packets

/* SOCK_RAW: must provide Ethernet header manually */
/* ETH_FRAME_LEN (1514) = 14-byte header + 1500-byte maximum payload; a
   1500-byte buffer would be too small for a full-size frame. Zero-filled so
   that no stale stack bytes are transmitted. */
uint8_t frame[ETH_FRAME_LEN] = {0};
struct ethhdr *eth = (struct ethhdr *)frame;

memcpy(eth->h_dest,   dest_mac, ETH_ALEN);
memcpy(eth->h_source, src_mac,  ETH_ALEN);
eth->h_proto = htons(ETH_P_IP);

/* Fill in IP + transport headers at frame[14..] */
/* ... */

/* For sending, only family, interface index and destination address are
   needed in sockaddr_ll: */
struct sockaddr_ll dst = {
    .sll_family  = AF_PACKET,
    .sll_ifindex = if_nametoindex("eth0"),
    .sll_halen   = ETH_ALEN,
};
memcpy(dst.sll_addr, dest_mac, ETH_ALEN);

/* frame_len = header + payload actually filled in; must not exceed
   sizeof(frame). */
sendto(sock, frame, frame_len, 0,
       (struct sockaddr *)&dst, sizeof(dst));

/* SOCK_DGRAM: kernel fills Ethernet header automatically */
int dgram_sock = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
/* sendto() with sll_addr set to target MAC */

Kernel implementation

Packet socket registration

/* net/packet/af_packet.c */

/* Operations table for PF_PACKET sockets: each member names the packet_*
   function installed for that socket operation. This is an excerpt; the
   remaining members are elided below. */
static const struct proto_ops packet_ops = {
    .family   = PF_PACKET,
    .bind     = packet_bind,      /* bind operation */
    .recvmsg  = packet_recvmsg,   /* receive operation */
    .sendmsg  = packet_sendmsg,   /* send operation */
    .poll     = packet_poll,      /* readiness polling */
    .mmap     = packet_mmap,      /* mapping operation (see PACKET_MMAP) */
    /* ... */
};

/* When a packet arrives (from netif_receive_skb): */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
    struct sock *sk = pt->af_packet_priv;
    struct packet_sock *po = pkt_sk(sk);

    /* Run the BPF filter: */
    if (po->has_vnet_hdr || skb_copy_datagram_from_iov(...)) {
        if (run_filter(skb, sk, snaplen) == 0)
            goto drop;
    }

    /* Copy the sk_buff to the socket receive queue */
    if (po->has_rx_ring) {
        /* PACKET_MMAP path: write directly to ring buffer */
        packet_rcv_ring(skb, sk, snaplen);
    } else {
        /* Normal path: enqueue to sk->sk_receive_queue */
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);
    }
    return NET_RX_SUCCESS;
}

tcpdump under the hood

# See what tcpdump does (first 30 lines of its system-call trace):
strace tcpdump -i eth0 tcp port 80 2>&1 | head -30
# Abridged sample output:
# socket(AF_PACKET, SOCK_RAW|SOCK_CLOEXEC, 768) = 3  (ETH_P_ALL=0x0003; 768=htons(ETH_P_ALL) in little-endian)
# setsockopt(3, SOL_SOCKET, SO_ATTACH_FILTER, ...) = 0
# bind(3, {sa_family=AF_PACKET, sll_protocol=...}) = 0
# setsockopt(3, SOL_PACKET, PACKET_VERSION, ...) = 0
# setsockopt(3, SOL_PACKET, PACKET_RX_RING, ...) = 0
# mmap(NULL, ..., PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = 0x7f...
# poll([{fd=3, events=POLLIN|POLLERR}], 1, -1) = 1

# Compile a BPF filter and show bytecode (does not capture anything):
tcpdump -d tcp port 80
# (000) ldh      [12]
# (001) jeq      #0x86dd  jt 2  jf 8
# ...

# How the libpcap API maps onto the calls above:
# pcap_open_live() → socket(AF_PACKET, SOCK_RAW, ...)
# pcap_setfilter()  → setsockopt(SO_ATTACH_FILTER)
# pcap_loop()       → poll() + mmap ring

Performance

# List open packet sockets (Rmem = bytes currently queued for receive;
# drop counters are not shown here, use PACKET_STATISTICS for those):
cat /proc/net/packet
# sk       RefCnt Type Proto  Iface   R Rmem   User   Inode
# ...

# Per-socket view with the owning process; a Recv-Q that keeps growing
# means the reader is falling behind and drops will follow.
# (--packet is required: plain `ss -p` does not list packet sockets.)
ss --packet --processes

# Use PACKET_STATISTICS:
struct tpacket_stats stats;
socklen_t len = sizeof(stats);
getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
printf("received: %u, dropped: %u\n", stats.tp_packets, stats.tp_drops);

# High-performance: use PACKET_MMAP v3 ring (avoids per-packet syscalls)
# Even higher: use AF_XDP for kernel bypass

Further reading

XDP — higher-performance in-kernel packet processing at the driver level, before the network stack
AF_XDP — zero-copy userspace packet processing
Netfilter — packet filtering at a higher layer
sk_buff — packet data structure
BPF Networking — tc/cgroup BPF
net/packet/af_packet.c — AF_PACKET implementation
man 7 packet — detailed man page