Container Isolation Internals
How Docker, Kubernetes, and container runtimes use namespaces and cgroups
What a container is, kernel-side
A container is a process (or process tree) that:
- Lives in a set of namespaces (isolated view of system resources)
- Is governed by a cgroup (resource limits)
- Has a filesystem root (via mount namespace + chroot/pivot_root)
- Is optionally restricted by seccomp (syscall filtering) and LSM (AppArmor/SELinux)
There is no "container" kernel object — it's a combination of existing primitives.
Container
┌─────────────────────────────────────────┐
│ PID namespace: PIDs 1,2,3,... │
│ Mount namespace: / → container rootfs │
│ Net namespace: eth0=veth pair │
│ User namespace: uid 0=container root │
│ UTS namespace: hostname=container-1 │
│ IPC namespace: isolated SysV/POSIX │
│ Cgroup namespace: /sys/fs/cgroup view │
├─────────────────────────────────────────┤
│ Cgroup: /sys/fs/cgroup/containers/c1/ │
│ cpu.max = 500000/1000000 (50%) │
│ memory.max = 512M │
│ pids.max = 100 │
├─────────────────────────────────────────┤
│ seccomp: syscall whitelist/blacklist │
│ AppArmor/SELinux: MAC policy │
└─────────────────────────────────────────┘
Container creation sequence
This is what docker run or runc does at the kernel level:
/* Simplified runc container creation */

/* 1. Create new namespaces via clone().
 * (Rootless containers would also pass CLONE_NEWUSER here.) */
pid_t child = clone(container_init,
                    stack + STACK_SIZE,
                    CLONE_NEWPID      /* new PID namespace */
                    | CLONE_NEWNS     /* new mount namespace */
                    | CLONE_NEWNET    /* new network namespace */
                    | CLONE_NEWIPC    /* new IPC namespace */
                    | CLONE_NEWUTS    /* new UTS namespace */
                    | CLONE_NEWCGROUP /* new cgroup namespace */
                    | SIGCHLD,        /* signal delivered to parent on exit */
                    &config);

/* 2. Set up cgroup (from parent namespace):
 * writing the child's PID to cgroup.procs moves it into the cgroup. */
write_to_file("/sys/fs/cgroup/containers/c1/cgroup.procs", pid_str);

/* 3. Set up network (veth pair, from parent namespace) */
/* runc calls netlink to create veth0/veth1, move veth1 to child netns */

/* 4. In child: pivot_root to container filesystem */
/* mounts container rootfs, calls pivot_root or chroot */

/* 5. In child: setuid/setgid, drop capabilities */

/* 6. In child: install seccomp filter.
 * PR_SET_NO_NEW_PRIVS must be set before SECCOMP_MODE_FILTER unless
 * the caller has CAP_SYS_ADMIN — see seccomp(2). */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);

/* 7. exec the container entrypoint */
execve("/entrypoint", argv, envp);
pivot_root: switching filesystem root
Containers use pivot_root (not just chroot) to fully isolate the filesystem:
/* Container init: */

/* 1. Make the mount namespace private FIRST, so nothing mounted or
 * unmounted below propagates back to the host (mount(2) propagation). */
mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);

/* 2. Bind-mount the container rootfs: pivot_root(2) requires
 * new_root to be a mount point, and a bind mount satisfies that. */
mount(rootfs_path, "/new_root", "none", MS_BIND, NULL);

/* 3. Put old root somewhere under new root */
mkdir("/new_root/.old_root", 0755);

/* 4. pivot_root: new_root becomes /, old root goes to /.old_root */
syscall(SYS_pivot_root, "/new_root", "/new_root/.old_root");

/* pivot_root may leave the cwd on the old root — reset it (pivot_root(2)) */
chdir("/");

/* 5. Unmount the old root to complete isolation */
umount2("/.old_root", MNT_DETACH);
rmdir("/.old_root");

/* Now /proc, /sys, /dev need to be mounted fresh */
mount("proc", "/proc", "proc", 0, NULL);
mount("sysfs", "/sys", "sysfs", 0, NULL);
mount("tmpfs", "/dev", "tmpfs", 0, NULL);
Network setup: veth pairs
Container networking uses virtual ethernet pairs:
Host namespace Container namespace
───────────────── ──────────────────
veth0 (10.0.0.1) ←──────────→ eth0 (172.17.0.2)
│ kernel
│ bridge
docker0 bridge
(172.17.0.1)
│
└── routes, NAT (iptables MASQUERADE)
# What runc does for container networking (simplified):
ip link add veth0 type veth peer name eth0
ip link set eth0 netns <container_pid>
# In host namespace:
ip link set veth0 master docker0
ip link set veth0 up
# In container namespace (ip netns exec takes a *named* netns from
# /run/netns, not a raw PID — use nsenter to enter by PID):
nsenter --target <container_pid> --net ip addr add 172.17.0.2/16 dev eth0
nsenter --target <container_pid> --net ip route add default via 172.17.0.1
Cgroup namespace: container's view of cgroups
Without a cgroup namespace, a container would see the full host cgroup hierarchy. With CLONE_NEWCGROUP:
# Host sees:
cat /proc/1/cgroup
# 0::/
# Container (in its cgroup namespace) sees:
cat /proc/1/cgroup # from inside container
# 0::/ ← the container's own cgroup looks like root
# /sys/fs/cgroup/ inside the container shows only its subtree
The cgroup namespace makes the container's cgroup appear as root, so systemd in a container doesn't see the host hierarchy.
User namespace: rootless containers
Rootless containers (Docker rootless, Podman) run entirely as unprivileged users:
Host: uid=1000 (user)
uid=100000-165535 (subuid range)
Container: uid=0 (root inside)
= uid=100000 outside (via uid_map: 0 → 100000, range 65536)
# /etc/subuid maps users to subordinate UID ranges
cat /etc/subuid
# user:100000:65536
# Start rootless container
podman run --rm -it ubuntu bash
# Inside: id → uid=0(root)
# Outside: ps shows process running as uid=100000
# The kernel enforces: this "root" has no host privilege
# It cannot access /etc/shadow, load modules, etc.
The user namespace also enables unprivileged namespace creation for all other types. User namespaces were made fully functional for unprivileged users in Linux 3.9 (LWN).
seccomp: syscall filtering
Seccomp restricts which syscalls a container can make:
/* Install a seccomp filter (classic-BPF program over struct seccomp_data) */
struct sock_filter filter[] = {
    /* Load the syscall number */
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
             offsetof(struct seccomp_data, nr)),
    /* Allow read, write, exit/exit_group, rt_sigreturn.
     * Each JEQ falls through to the ALLOW below it on a match,
     * and jumps over it (jf=1) otherwise. */
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    /* Default: kill the thread */
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
};
struct sock_fprog prog = {
    .len = (unsigned short)(sizeof filter / sizeof filter[0]),
    .filter = filter,
};
/* Required before SECCOMP_MODE_FILTER for unprivileged callers (seccomp(2)) */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
Docker's default seccomp profile is an allowlist of roughly 350 syscalls; it blocks around 44 (including keyctl, kexec_load, perf_event_open, ptrace, etc.).
OCI runtime spec
The Open Container Initiative (OCI) runtime spec defines what a container runtime (runc, crun, kata) must do. The spec is a JSON file (config.json):
{
"ociVersion": "1.0.0",
"process": {
"user": {"uid": 0, "gid": 0},
"args": ["/bin/sh"],
"capabilities": {
"bounding": ["CAP_NET_BIND_SERVICE", "CAP_KILL", ...]
},
"rlimits": [{"type": "RLIMIT_NOFILE", "hard": 1024, "soft": 1024}],
"seccompProfile": "default.json"
},
"root": {"path": "rootfs", "readonly": true},
"mounts": [
{"destination": "/proc", "type": "proc", "source": "proc"},
{"destination": "/dev", "type": "tmpfs", "source": "tmpfs"}
],
"linux": {
"namespaces": [
{"type": "pid"}, {"type": "network"}, {"type": "ipc"},
{"type": "uts"}, {"type": "mount"}, {"type": "cgroup"}
],
"cgroupsPath": "/containers/mycontainer",
"resources": {
"memory": {"limit": 536870912},
"cpu": {"quota": 200000, "period": 1000000}
}
}
}
Debugging container isolation
# Check what namespaces a container process is in
PID=$(docker inspect --format '{{.State.Pid}}' mycontainer)
ls -la /proc/$PID/ns/
# Enter a container's namespace
nsenter --target $PID --mount --pid --net -- bash
# Check cgroup of a container process
cat /proc/$PID/cgroup
# See container's cgroup limits
cat /sys/fs/cgroup/$(cat /proc/$PID/cgroup | cut -d: -f3)/memory.max
# strace a container process from the host
strace -p $PID
# Check seccomp filter installed
cat /proc/$PID/status | grep Seccomp
# Seccomp: 2 ← 0=none, 1=strict, 2=filter
# Check capabilities
cat /proc/$PID/status | grep Cap
# CapPrm: 00000000a80425fb
capsh --decode=00000000a80425fb
Further reading
- Cgroup v2 Architecture — Resource control internals
- Resource Controllers — cpu, memory, io limits
- Namespaces — All 8 namespace types in detail
- BPF/eBPF — seccomp-bpf and cgroup BPF programs
- runc source code — reference OCI container runtime