Skip to content

UDP Socket Internals

struct udp_sock, sendmsg/recvmsg paths, UDP segmentation offload, and multicast

UDP in the kernel

UDP is connectionless: there is no handshake, no retransmission, no flow control. The kernel's job is minimal — add a UDP header and hand the packet to IP.

Application:
    sendto(fd, buf, len, 0, &dest, sizeof(dest))
    ┌────▼────────────────────────────────────────────────────┐
    │ sock_sendmsg → udp_sendmsg                              │
    │   1. Route lookup (ip_route_output_flow)                │
    │   2. ip_make_skb → build sk_buff                        │
    │   3. Set UDP header (src port, dst port, len, checksum) │
    │   4. udp_send_skb → ip_send_skb                         │
    └─────────────────────────────────────────────────────────┘
    IP layer → routing → Ethernet → wire

struct udp_sock

/* include/linux/udp.h */
/*
 * Per-socket UDP state. The struct embeds struct inet_sock (which in turn
 * embeds struct sock) as its first member, so the same object can be viewed
 * as a sock, an inet_sock or a udp_sock.
 */
struct udp_sock {
    /* inet_sock must be first: */
    struct inet_sock inet;   /* contains: struct sock sk as first member */

    int       pending;       /* Any pending frames? */
    unsigned int corkflag;   /* Cork by setting UDP_CORK socket option */
    __u8      encap_type;    /* Is this a UDP encap socket? */
    /*
     * One-bit flags. Meanings follow the upstream header comments;
     * NOTE(review): confirm against the kernel version being documented.
     *   no_check6_tx / no_check6_rx - send / accept zero UDP6 checksums
     *   encap_enabled               - encapsulation processing enabled
     *   gro_enabled                 - GRO aggregation requested (UDP_GRO)
     *   accept_udp_l4               - accept UDP L4 GSO packets
     *   accept_udp_fraglist         - accept fraglist GSO packets
     */
    unsigned char no_check6_tx:1, no_check6_rx:1;
    unsigned char encap_enabled:1;
    unsigned char gro_enabled:1;
    unsigned char accept_udp_l4:1;
    unsigned char accept_udp_fraglist:1;

    __u16     len;           /* total length of pending frames */
    __u16     gso_size;      /* UDP GSO fragment size */

    /* Receive path: */
    /*
     * Optional callbacks; presumably installed by users of UDP
     * encapsulation (tunnels) and by GRO-aware sockets - verify callers.
     */
    int       (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
    int       (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err,
                               __be16 port, u32 info, u8 *payload);
    void      (*encap_destroy)(struct sock *sk);
    struct sk_buff *(*gro_receive)(struct sock *sk, struct list_head *head,
                                   struct sk_buff *skb);
    int       (*gro_complete)(struct sock *sk, struct sk_buff *skb, int nhoff);
};

UDP send path

udp_sendmsg

/* net/ipv4/udp.c */
/*
 * Condensed view of the IPv4 UDP send path (uncorked, single datagram).
 *
 * Returns the number of payload bytes accepted (len) on success or a
 * negative errno on failure.
 *
 * Excerpt note: net, saddr, sport, tos, ulen, ipc, cork and getfrag are set
 * up by code omitted here.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
    struct inet_sock *inet = inet_sk(sk);
    struct flowi4     fl4;
    struct rtable    *rt = NULL;
    struct sk_buff   *skb;
    struct udphdr    *uh;
    __be32            daddr;
    __be16            dport;
    int               err;

    /* 1. Determine destination address and port */
    if (msg->msg_name) {
        struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;

        /* Never read past the address buffer the caller supplied. */
        if (msg->msg_namelen < sizeof(*usin))
            return -EINVAL;

        daddr = usin->sin_addr.s_addr;
        dport = usin->sin_port;
    } else {
        /* No address given: only valid on a connect()ed socket. */
        if (sk->sk_state != TCP_ESTABLISHED)
            return -EDESTADDRREQ;

        daddr = inet->inet_daddr;
        dport = inet->inet_dport;
    }

    /* 2. Route lookup */
    flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
                       tos, RT_SCOPE_UNIVERSE, IPPROTO_UDP,
                       sk->sk_flags, daddr, saddr, dport, sport, ...);
    rt = ip_route_output_flow(net, &fl4, sk);
    if (IS_ERR(rt))
        return PTR_ERR(rt);      /* e.g. -ENETUNREACH; nothing to release */

    /* 3. Build the sk_buff with room for the UDP header + data */
    skb = ip_make_skb(sk, &fl4, getfrag, msg, ulen,
                      sizeof(struct udphdr), &ipc, &rt,
                      msg->msg_flags);

    /* ip_make_skb() returns an ERR_PTR on failure (or NULL: nothing to send) */
    err = PTR_ERR(skb);
    if (!IS_ERR_OR_NULL(skb)) {
        /* 4. Add UDP header */
        uh = udp_hdr(skb);
        uh->source = inet->inet_sport;
        uh->dest   = dport;
        uh->len    = htons(ulen);
        uh->check  = 0;  /* filled in by the checksum step below */

        /* 5. Compute checksum (or offload to hardware) */
        udp4_hwcsum(skb, fl4.saddr, fl4.daddr);

        /* 6. Hand to IP */
        err = udp_send_skb(skb, &fl4, &cork.base);
    }

    /* Drop our route reference on every exit path (NULL-safe). */
    ip_rt_put(rt);

    return err ? err : len;
}

UDP corking (MSG_MORE)

/*
 * Corking: accumulate several writes into ONE UDP datagram.
 * There are two independent ways to do it - use one or the other.
 */

/* (a) UDP_CORK socket option (like TCP_CORK): data is held while corked */
int one = 1, zero = 0;
setsockopt(fd, IPPROTO_UDP, UDP_CORK, &one, sizeof(one));
sendmsg(fd, &msg1, 0);         /* buffered */
sendmsg(fd, &msg2, 0);         /* buffered */
sendmsg(fd, &msg3, 0);         /* buffered - a plain send does NOT flush */
setsockopt(fd, IPPROTO_UDP, UDP_CORK, &zero, sizeof(zero));
                               /* uncork: all three sent in one UDP packet */

/* (b) MSG_MORE flag per call, no socket option needed: */
sendmsg(fd, &msg1, MSG_MORE);  /* buffered */
sendmsg(fd, &msg2, MSG_MORE);  /* buffered */
sendmsg(fd, &msg3, 0);         /* flush: all three sent in one UDP packet */

UDP receive path

Incoming packet routing

/* net/ipv4/udp.c */
/*
 * Receive entry point for UDP over IPv4: forwards the skb to the shared
 * implementation __udp4_lib_rcv() with the udp_table socket hash and
 * IPPROTO_UDP as the protocol, and returns its result unchanged.
 */
int udp_rcv(struct sk_buff *skb)
{
    return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}

static int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                            int proto)
{
    struct udphdr *uh = udp_hdr(skb);
    __be32 saddr = ip_hdr(skb)->saddr;
    __be32 daddr = ip_hdr(skb)->daddr;
    __u16  ulen  = ntohs(uh->len);

    /* 1. Validate UDP header and checksum */
    if (udp4_csum_init(skb, uh, proto))
        goto csum_error;

    /* 2. Lookup socket by dst port/addr (hash table) */
    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
    if (sk)
        return udp_unicast_rcv_skb(sk, skb, uh);

    /* 3. No socket found — send ICMP port unreachable */
    icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
    return 0;
}

Delivering to socket receive queue

static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
    struct udp_sock *up = udp_sk(sk);

    /* Run BPF socket filter if installed */
    if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
        goto drop;

    /* Enqueue to sk->sk_receive_queue */
    if (likely(sk_receive_skb(sk, skb, 1) == NET_RX_SUCCESS))
        return 0;

    return -1;
}

/*
 * recvmsg: userspace reads one datagram from the receive queue.
 *
 * Returns the number of bytes copied to the caller (at most len) or a
 * negative errno. A datagram larger than the supplied buffer is truncated
 * and MSG_TRUNC is reported in msg->msg_flags.
 *
 * Excerpt note: `flags` comes from the parameters elided as "..." above.
 */
int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...)
{
    struct sk_buff *skb;
    unsigned int ulen, copied;
    int off, err;

    /* Wait for and dequeue a packet */
    off = sk_peek_offset(sk, flags);
    skb = __skb_recv_udp(sk, flags, &off, &err);
    if (!skb)
        return err;

    /* Never copy more than the caller's buffer can hold */
    ulen   = skb->len - sizeof(struct udphdr);
    copied = len;
    if (copied > ulen - off)
        copied = ulen - off;
    else if (copied < ulen)
        msg->msg_flags |= MSG_TRUNC;

    /* Copy to userspace */
    err = skb_copy_datagram_msg(skb, sizeof(struct udphdr) + off, msg, copied);
    if (err) {
        kfree_skb(skb);
        return err;
    }

    /* Return source address if requested (msg_name set by the caller) */
    if (msg->msg_name) {
        struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;

        sin->sin_family      = AF_INET;
        sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
        sin->sin_port        = udp_hdr(skb)->source;
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
    }

    /* Done with the packet: drop our reference */
    consume_skb(skb);
    return copied;
}

UDP socket hash table

/*
 * Sockets are stored in hash tables keyed by local port. Two tables index
 * the same sockets so a lookup can pick the shorter chain.
 */
struct udp_table {
    struct udp_hslot    *hash;      /* hashed by local port */
    struct udp_hslot    *hash2;     /* hashed by local port + local address */
    unsigned int         mask;      /* presumably slot count - 1 (index mask) - verify */
    unsigned int         log;       /* presumably log2(slot count) - verify */
};

/* Lookup: */
/* 1. Hash by {dest_port} → candidates list */
/* 2. Check addr match for each candidate */
/* 3. If multiple match, prefer connected sockets (4-tuple match) */

UDP GSO (Generic Segmentation Offload)

For high-throughput UDP (e.g., QUIC, game servers), UDP GSO allows sending multiple UDP payloads in one syscall:

/* Send 10 x 1400-byte UDP datagrams in one syscall: */
char buf[10 * 1400];
fill_data(buf, sizeof(buf));

struct msghdr msg = {};
struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
msg.msg_iov    = &iov;
msg.msg_iovlen = 1;

/* Destination: */
struct sockaddr_in dest = { .sin_family = AF_INET, .sin_port = htons(9000) };
inet_pton(AF_INET, "192.168.1.1", &dest.sin_addr);
msg.msg_name = &dest;
msg.msg_namelen = sizeof(dest);

/* GSO control message: segment size */
char cmsg_buf[CMSG_SPACE(sizeof(uint16_t))];
struct cmsghdr *cm = (struct cmsghdr *)cmsg_buf;
cm->cmsg_level = SOL_UDP;
cm->cmsg_type  = UDP_SEGMENT;
cm->cmsg_len   = CMSG_LEN(sizeof(uint16_t));
*(uint16_t *)CMSG_DATA(cm) = 1400;  /* segment size */

msg.msg_control    = cmsg_buf;
msg.msg_controllen = sizeof(cmsg_buf);

sendmsg(fd, &msg, 0);
/* Kernel sends 10 separate UDP packets, one syscall! */

UDP GRO (Generic Receive Offload)

/* Enable UDP GRO on receive: */
int one = 1;
setsockopt(fd, SOL_UDP, UDP_GRO, &one, sizeof(one));

/* Now recvmsg with MSG_TRUNC to get the full coalesced buffer */
/* Check cmsg for UDP_GRO segment size */
char cmsg_buf[256];
msg.msg_control    = cmsg_buf;
msg.msg_controllen = sizeof(cmsg_buf);

ssize_t n = recvmsg(fd, &msg, 0);
for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm;
     cm = CMSG_NXTHDR(&msg, cm)) {
    if (cm->cmsg_level == SOL_UDP && cm->cmsg_type == UDP_GRO) {
        uint16_t seg_size = *(uint16_t *)CMSG_DATA(cm);
        /* n bytes received, each segment is seg_size bytes */
    }
}

Multicast

/* Join a multicast group: */
struct ip_mreq mreq = {
    .imr_multiaddr.s_addr = inet_addr("239.0.0.1"),
    .imr_interface.s_addr = INADDR_ANY,
};
setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

/* Bind to multicast port: */
struct sockaddr_in addr = {
    .sin_family = AF_INET,
    .sin_port   = htons(5000),
    .sin_addr.s_addr = inet_addr("239.0.0.1"),  /* or INADDR_ANY */
};
bind(fd, (struct sockaddr *)&addr, sizeof(addr));

/* Send to multicast group: */
struct sockaddr_in dest = {
    .sin_family = AF_INET,
    .sin_port   = htons(5000),
    .sin_addr.s_addr = inet_addr("239.0.0.1"),
};
sendto(fd, data, len, 0, (struct sockaddr *)&dest, sizeof(dest));

Kernel multicast delivery

/*
 * net/ipv4/udp.c (condensed): deliver one multicast datagram to every
 * socket that joined the group.
 *
 * Each receiver gets its own clone; the original skb is released once the
 * walk is finished. Always returns 0 (the skb is consumed either way).
 *
 * Excerpt note: udptable and hash come from the parameters elided as "..."
 * and from code omitted here; inet_mc_hash_match() stands in for the
 * in-tree membership check.
 */
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
                                     struct udphdr *uh, ...)
{
    /* Find all sockets subscribed to this group/port */
    struct hlist_head *head = &udptable->hash[hash].head;
    struct sock *sk;

    sk_for_each(sk, head) {
        struct sk_buff *skb2;

        /* Check: port matches, multicast group joined, interface matches */
        if (!inet_mc_hash_match(sk, skb))
            continue;

        /* Clone skb for each receiving socket */
        skb2 = skb_clone(skb, GFP_ATOMIC);
        if (!skb2) {
            /* Out of memory: this receiver misses the datagram */
            atomic_inc(&sk->sk_drops);
            continue;
        }

        /* The callee owns skb2 from here on, success or failure */
        udp_queue_rcv_one_skb(sk, skb2);
    }

    /* Receivers only ever got clones; release the original */
    consume_skb(skb);
    return 0;
}

UDP performance tuning

# Increase receive buffer (reduces drops under load):
sysctl -w net.core.rmem_max=26214400        # 25 MB max
sysctl -w net.core.rmem_default=26214400
# udp_mem is counted in pages, not bytes: "min pressure max"
sysctl -w net.ipv4.udp_mem="102400 873800 16777216"

# Increase send buffer:
sysctl -w net.core.wmem_max=26214400

# Check UDP drop stats:
grep Udp /proc/net/snmp
# UdpInDatagrams, UdpNoPorts, UdpInErrors, UdpOutDatagrams
# UdpRcvbufErrors: dropped because socket receive buffer full

netstat -su | grep -i error
# -a includes unconnected sockets; -m adds skmem:(...,d<N>) where d = drops
ss -u -a -p -e -m

# Per-socket stats:
ss -u -a -n -e -m | grep -A1 ":9000"

# Trace UDP drops:
bpftrace -e '
tracepoint:udp:udp_fail_queue_rcv_skb {
    @drops = count();
}'

Further reading

  • TCP — connection-oriented transport
  • Socket Layer — socket() create, bind, sendmsg
  • sk_buff — packet data structure
  • NAPI — receive scaling
  • XDP — bypass kernel for extreme UDP throughput
  • net/ipv4/udp.c — UDP implementation