ext4

The default Linux filesystem: journaled, extent-based, battle-tested. Developed by Theodore Ts'o, Andreas Dilger, Alex Tomas, and others; merged as stable in Linux 2.6.28.

On-disk layout

An ext4 filesystem is divided into block groups. Each block group contains:

Block Group 0:
┌─────────────────────────────────────────────────────────────────┐
│ Super-   │ Group     │ Block    │ Inode    │ Inode │ Data       │
│ block    │ Descriptor│ Bitmap   │ Bitmap   │ Table │ Blocks     │
│ (1 block)│ Table     │ (1 block)│ (1 block)│       │            │
└─────────────────────────────────────────────────────────────────┘

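Superblock: the primary copy lives in group 0; with the sparse_super feature, backup copies are kept in group 1 and in groups whose number is a power of 3, 5, or 7.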

# Inspect filesystem layout
dumpe2fs /dev/sda1 | head -80
# Inode count:              2621440
# Block count:              10485760
# Block size:               4096
# Blocks per group:         32768
# Inodes per group:         8192
# Inode size:               256
# Journal inode:            8

The ext4 superblock

/* fs/ext4/ext4.h */
/*
 * On-disk ext4 superblock (abridged excerpt).
 *
 * All multi-byte fields are stored little-endian (__le16/__le32).
 * Kernel fields that are left out of this listing are marked with "..."
 * comments, so byte offsets cannot be derived from this excerpt alone.
 */
struct ext4_super_block {
    __le32  s_inodes_count;         /* total inodes */
    __le32  s_blocks_count_lo;      /* total blocks (low 32 bits) */
    __le32  s_r_blocks_count_lo;    /* reserved blocks (low 32 bits) */
    __le32  s_free_blocks_count_lo; /* free blocks (low 32 bits) */
    __le32  s_free_inodes_count;    /* free inodes */
    __le32  s_first_data_block;     /* first data block: 1 for 1K blocks, 0 for larger block sizes (e.g. 4K) */
    __le32  s_log_block_size;       /* block size = 1024 << s_log_block_size */
    /* ... s_log_cluster_size ... */
    __le32  s_blocks_per_group;     /* blocks in each block group */
    /* ... s_clusters_per_group ... */
    __le32  s_inodes_per_group;     /* inodes in each block group */
    __le32  s_mtime;                /* last mount time */
    __le32  s_wtime;                /* last write time */
    /* ... s_mnt_count, s_max_mnt_count ... */
    __le16  s_magic;                /* 0xEF53 */
    __le16  s_state;                /* filesystem state: clean/errors */
    __le16  s_errors;               /* error behavior: continue/remount-ro/panic */
    /* ... */
    __le32  s_feature_compat;       /* compatible feature flags: unknown bits do not prevent mounting */
    __le32  s_feature_incompat;     /* incompatible feature flags: unknown bits forbid mounting */
    __le32  s_feature_ro_compat;    /* read-only compatible features: unknown bits allow read-only mounts only */
    __u8    s_uuid[16];             /* filesystem UUID */
    char    s_volume_name[16];      /* volume label; not necessarily NUL-terminated */
    /* ... journal info, encryption, etc. */
};

The ext4 inode

/*
 * On-disk ext4 inode (abridged excerpt).
 *
 * The first 128 bytes are the classic ext2/ext3 inode; everything from
 * i_extra_isize onward exists only when the filesystem's inode size is
 * larger than 128 bytes. Kernel fields left out of this listing are marked
 * with "..." comments.
 */
struct ext4_inode {
    __le16  i_mode;             /* file mode (S_IFREG, S_IFDIR, etc.) */
    __le16  i_uid;              /* lower 16 bits of UID */
    __le32  i_size_lo;          /* file size (lower 32 bits) */
    __le32  i_atime;            /* access time */
    __le32  i_ctime;            /* inode change time */
    __le32  i_mtime;            /* modification time */
    __le32  i_dtime;            /* deletion time */
    __le16  i_gid;              /* lower 16 bits of GID */
    __le16  i_links_count;      /* hard link count */
    __le32  i_blocks_lo;        /* 512-byte blocks used */
    __le32  i_flags;            /* EXT4_EXTENTS_FL, EXT4_INLINE_DATA_FL, ... */
    /* ... osd1 (OS-dependent word) ... */
    __le32  i_block[EXT4_N_BLOCKS]; /* 15 words: extent tree root OR block map */
    __le32  i_generation;       /* file version (used by NFS) */
    __le32  i_file_acl_lo;      /* extended attribute block */
    __le32  i_size_high;        /* upper 32 bits of file size */
    /* ... extra fields at offset 128 for inodes > 128 bytes ... */
    __le16  i_extra_isize;      /* extra inode size (allows new fields) */
    /* ... i_checksum_hi ... */
    /* extra timestamp bits: nanoseconds in the upper 30 bits, epoch extension in the lower 2 */
    __le32  i_ctime_extra;
    __le32  i_mtime_extra;
    __le32  i_atime_extra;
    __le32  i_crtime;           /* creation time */
    __le32  i_crtime_extra;     /* creation time extra bits (same encoding as above) */
    /* ... version, checksum ... */
};

Extent tree: efficient large file representation

ext4 uses extents (contiguous block runs) instead of the old indirect block maps. The extent implementation was originally written by Alex Tomas.

i_block[0..14] (60 bytes) holds the extent tree root: a 12-byte header followed by up to four 12-byte entries:
┌──────────────────────────────────────────────────────┐
│ ext4_extent_header                                    │
│   eh_magic=0xF30A, eh_entries=N, eh_depth=0 (leaf)   │
├──────────────────────────────────────────────────────┤
│ ext4_extent[0]: ee_block=0,  ee_len=128, ee_start=1024│
│ ext4_extent[1]: ee_block=128,ee_len=64,  ee_start=2048│
└──────────────────────────────────────────────────────┘

For deep files, eh_depth>0: i_block contains internal nodes
with ext4_extent_idx entries pointing to child blocks

/*
 * Leaf entry of the extent tree (12 bytes on disk, little-endian).
 * Maps a run of consecutive logical file blocks to a run of consecutive
 * physical blocks; the 48-bit physical start is split across
 * ee_start_hi and ee_start_lo.
 */
struct ext4_extent {
    __le32  ee_block;       /* first logical block covered */
    __le16  ee_len;         /* number of blocks (or 0x8000 | len for unwritten); an initialized extent covers at most 32768 blocks */
    __le16  ee_start_hi;    /* physical block (high 16 bits) */
    __le32  ee_start_lo;    /* physical block (low 32 bits) */
};

/*
 * Header found at the start of every extent tree node (12 bytes on disk,
 * little-endian), whether the node lives in the inode's i_block or in a
 * separate tree block. The entries described by eh_entries/eh_max follow
 * the header directly.
 */
struct ext4_extent_header {
    __le16  eh_magic;       /* 0xF30A */
    __le16  eh_entries;     /* number of valid entries following the header */
    __le16  eh_max;         /* capacity of this node, in entries */
    __le16  eh_depth;       /* depth (0=leaf holding ext4_extent, >0=internal holding ext4_extent_idx) */
    __le32  eh_generation;  /* generation of the tree */
};

Benefits over old indirect maps:
- A single extent covers up to 32768 contiguous blocks (128 MiB with 4K blocks) in one entry
- Sequential reads are contiguous on disk → fast
- Reduces the depth of the block-mapping tree for large files

Journaling with jbd2

ext4 uses the Journaling Block Device layer, version 2 (jbd2), for crash consistency:

Write modes

Mode	What is journaled	Data safety	Performance
`data=writeback`	metadata only	Files may contain stale or garbage data after a crash	Fastest
`data=ordered` (default)	metadata only, but file data is written to disk before its metadata is committed	Committed metadata never points to blocks that were not yet written	Good balance
`data=journal`	both metadata and data	Strongest	Slowest

# Show the default mount options stored in the superblock.
# Note: this is not the data= mode of a mounted filesystem; for that, read
# /proc/mounts or /proc/fs/ext4/sda1/options.
tune2fs -l /dev/sda1 | grep "Default mount options"
# Default mount options: user_xattr acl

# Mount with specific journal mode
mount -o data=journal /dev/sda1 /mnt

Journal structure (jbd2)

/*
 * include/linux/jbd2.h (abridged excerpt of the in-memory journal control
 * structure). Fields are grouped for readability here; the kernel
 * definition contains many more members, each documented with the lock
 * that protects it.
 */
struct journal_s {              /* typedef'd as journal_t */
    unsigned long   j_flags;   /* JBD2_UNMOUNT, JBD2_ABORT, ... */
    int             j_errno;   /* outstanding error recorded by a journal abort */

    struct buffer_head *j_sb_buffer; /* buffer holding the journal superblock */
    journal_superblock_t *j_superblock; /* the journal superblock itself */

    /*
     * Positions inside the on-disk log, counted in journal blocks.
     * These are block numbers, not transaction IDs; the transaction
     * sequence numbers (tid_t) live in separate fields such as
     * j_tail_sequence and j_transaction_sequence, not shown here.
     */
    unsigned long   j_head;     /* first unused block in the journal */
    unsigned long   j_tail;     /* oldest still-used block in the journal */
    unsigned long   j_free;     /* number of free blocks in the journal */

    struct transaction_s *j_running_transaction;  /* current transaction, still accepting handles */
    struct transaction_s *j_committing_transaction; /* transaction being written to the log */
    struct list_head j_checkpoint_transactions; /* committed, waiting to be checkpointed */

    wait_queue_head_t j_wait_transaction_locked; /* waiters for a locked transaction to start committing */
    wait_queue_head_t j_wait_done_commit;        /* waiters for a commit to complete */
    wait_queue_head_t j_wait_commit;             /* the commit thread sleeps here until a commit is requested */
    wait_queue_head_t j_wait_updates;            /* waiters for outstanding updates to finish */

    struct task_struct *j_task;  /* commit thread (kjournald2) */
    int j_max_transaction_buffers; /* max buffers allowed in one compound transaction */
    unsigned long j_commit_interval;  /* max time between commits, in jiffies (default 5 seconds) */
};

Transaction flow

1. jbd2_journal_start(): begin transaction
      → get handle_t for this transaction
      → reserve journal space (journal credits)

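2. jbd2_journal_get_write_access(bh): declare the intent to modify a metadata buffer
      → if the buffer still belongs to the committing transaction, jbd2 keeps a frozen copy of its old contents so that the commit writes the right version (the journal is replayed forward; it is not an undo log)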

3. Modify the buffer (update data)

4. jbd2_journal_dirty_metadata(bh): mark buffer dirty in journal

5. jbd2_journal_stop(): end transaction
      → if journal full or commit_interval elapsed:
          kjournald2 commits:
            → write descriptor + data blocks to journal
            → write commit record to journal (seals the transaction)
            → checkpoint: write data blocks to final location, mark journal free

6. On crash: journal replay restores consistency

Delayed allocation (delalloc)

ext4 delays block allocation until actual writeback:

/* When a page is written: */
/* 1. Mark page dirty, NO block allocated yet */
/* 2. On writeback (dirty expire or sync): */
ext4_writepages()
  → ext4_da_convert_inline_data()
  → mpage_map_and_submit_extent()
      → ext4_map_blocks(CREATE)  ← allocate blocks NOW
          → ext4_ext_map_blocks() → find/create extent
          → jbd2_journal_get_write_access()

Benefits:
- Small writes that are quickly deleted never need block allocation
- Large sequential writes get contiguous blocks (better for extent efficiency)
- Reduces journal pressure (allocations batched)

Online features

# Online resize (grow filesystem)
resize2fs /dev/sda1  # grow to fill partition

# Online defrag (for fragmented files)
e4defrag /data/myfile
e4defrag /data/  # defrag directory

# Check filesystem health
e2fsck -n /dev/sda1   # dry run, don't fix

# Store data=ordered as the default data journaling mode in the superblock
tune2fs -o journal_data_ordered /dev/sda1
# Tune journaling commit interval (default 5s)
mount -o commit=60 /dev/sda1 /mnt  # commit every 60s (less safe, faster)

# Enable inline data (small files stored in inode directly)
tune2fs -O inline_data /dev/sda1

Key ext4 statistics

# ext4 stats via debugfs
debugfs /dev/sda1
> stats
> icheck <block_number>  # find inode owning block
> ncheck <inode_number>  # find name of inode
> blocks <inode_number>  # show blocks used by inode

# ext4 tracepoints
ls /sys/kernel/tracing/events/ext4/
echo 1 > /sys/kernel/tracing/events/ext4/ext4_da_write_begin/enable
cat /sys/kernel/tracing/trace_pipe

# Block allocation stats
cat /proc/fs/ext4/sda1/mb_groups  # buddy allocator per-group state