diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 13:41:34 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 13:41:34 -0800 |
| commit | c84bb79f70c634a95929f21c14340ab2078d7977 (patch) | |
| tree | b92de73eb48564d92253284bbefccd83e5e80717 /fs | |
| parent | 7e01a69f5c4f2a6af2d4cd1cc46d48efdeb98230 (diff) | |
| parent | 313c47f4fe4d07eb2969f429a66ad331fe2b3b6f (diff) | |
| download | linux-c84bb79f70c634a95929f21c14340ab2078d7977.tar.gz linux-c84bb79f70c634a95929f21c14340ab2078d7977.zip | |
Merge tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs nullfs update from Christian Brauner:
"Add a completely catatonic minimal pseudo filesystem called "nullfs"
and make pivot_root() work in the initramfs.
Currently pivot_root() does not work on the real rootfs because it
cannot be unmounted. Userspace has to recursively delete initramfs
contents manually before continuing boot, using the fragile
switch_root sequence (overmount + chroot).
Add nullfs, a minimal immutable filesystem that serves as the true
root of the mount hierarchy. The mutable rootfs (tmpfs/ramfs) is
mounted on top of it. This allows userspace to simply:
chdir(new_root);
pivot_root(".", ".");
umount2(".", MNT_DETACH);
without the traditional switch_root workarounds. systemd already
handles this correctly. It tries pivot_root() first and falls back
to MS_MOVE only when that fails.
This also means rootfs mounts in unprivileged namespaces no longer
need MNT_LOCKED, since the immutable nullfs guarantees nothing can be
revealed by unmounting the covering mount.
nullfs is a single-instance filesystem (get_tree_single()) marked
SB_NOUSER | SB_I_NOEXEC | SB_I_NODEV with an immutable empty root
directory. This means sooner or later it can be used to overmount
other directories to hide their contents without any additional
protection needed.
We enable it unconditionally. If we see any real regression we'll
hide it behind a boot option.
nullfs will gain extensions beyond this in the future. It will serve as a
concept to support the creation of completely empty mount namespaces -
which is work coming up in the next cycle"
* tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: use nullfs unconditionally as the real rootfs
docs: mention nullfs
fs: add immutable rootfs
fs: add init_pivot_root()
fs: ensure that internal tmpfs mount gets mount id zero
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/Makefile | 2 | ||||
| -rw-r--r-- | fs/init.c | 17 | ||||
| -rw-r--r-- | fs/internal.h | 1 | ||||
| -rw-r--r-- | fs/mount.h | 1 | ||||
| -rw-r--r-- | fs/namespace.c | 159 | ||||
| -rw-r--r-- | fs/nullfs.c | 70 |
6 files changed, 192 insertions, 58 deletions
diff --git a/fs/Makefile b/fs/Makefile index f238cc5ea2e9..cf4a745e9679 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o fserror.o + file_attr.o fserror.o nullfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/init.c b/fs/init.c index e0f5429c0a49..e33b2690d851 100644 --- a/fs/init.c +++ b/fs/init.c @@ -13,6 +13,23 @@ #include <linux/security.h> #include "internal.h" +int __init init_pivot_root(const char *new_root, const char *put_old) +{ + struct path new_path __free(path_put) = {}; + struct path old_path __free(path_put) = {}; + int ret; + + ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path); + if (ret) + return ret; + + ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path); + if (ret) + return ret; + + return path_pivot_root(&new_path, &old_path); +} + int __init init_mount(const char *dev_name, const char *dir_name, const char *type_page, unsigned long flags, void *data_page) { diff --git a/fs/internal.h b/fs/internal.h index 18a062c1b5b0..9514d80ef5c4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -90,6 +90,7 @@ extern bool may_mount(void); int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(const struct path *path, int flags); +int path_pivot_root(struct path *new, struct path *old); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/mount.h b/fs/mount.h index 2d28ef2a3aed..e0816c11a198 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,7 @@ #include <linux/ns_common.h> #include <linux/fs_pin.h> +extern struct file_system_type nullfs_fs_type; extern struct list_head notify_list; struct mnt_namespace { diff --git 
a/fs/namespace.c b/fs/namespace.c index c58674a20cad..53d1055c1825 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -221,7 +221,7 @@ static int mnt_alloc_id(struct mount *mnt) int res; xa_lock(&mnt_id_xa); - res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); @@ -4498,36 +4498,8 @@ bool path_is_under(const struct path *path1, const struct path *path2) } EXPORT_SYMBOL(path_is_under); -/* - * pivot_root Semantics: - * Moves the root file system of the current process to the directory put_old, - * makes new_root as the new root file system of the current process, and sets - * root/cwd of all processes which had them on the current root to new_root. - * - * Restrictions: - * The new_root and put_old must be directories, and must not be on the - * same file system as the current process root. The put_old must be - * underneath new_root, i.e. adding a non-zero number of /.. to the string - * pointed to by put_old must yield the same directory as new_root. No other - * file system may be mounted on put_old. After all, new_root is a mountpoint. - * - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives - * in this situation. - * - * Notes: - * - we don't move root/cwd if they are not at the root (reason: if something - * cared enough to change them, it's probably wrong to force them elsewhere) - * - it's okay to pick a root that isn't the root of a file system, e.g. - * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, - * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root - * first. 
- */ -SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, - const char __user *, put_old) +int path_pivot_root(struct path *new, struct path *old) { - struct path new __free(path_put) = {}; - struct path old __free(path_put) = {}; struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; int error; @@ -4535,28 +4507,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!may_mount()) return -EPERM; - error = user_path_at(AT_FDCWD, new_root, - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); - if (error) - return error; - - error = user_path_at(AT_FDCWD, put_old, - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); - if (error) - return error; - - error = security_sb_pivotroot(&old, &new); + error = security_sb_pivotroot(old, new); if (error) return error; get_fs_root(current->fs, &root); - LOCK_MOUNT(old_mp, &old); + LOCK_MOUNT(old_mp, old); old_mnt = old_mp.parent; if (IS_ERR(old_mnt)) return PTR_ERR(old_mnt); - new_mnt = real_mount(new.mnt); + new_mnt = real_mount(new->mnt); root_mnt = real_mount(root.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; @@ -4568,7 +4530,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) return -EINVAL; - if (d_unlinked(new.dentry)) + if (d_unlinked(new->dentry)) return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) return -EBUSY; /* loop, on the same file system */ @@ -4576,15 +4538,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) return -EINVAL; /* absolute root */ - if (!path_mounted(&new)) + if (!path_mounted(new)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) 
return -EINVAL; /* make certain new is below the root */ - if (!is_path_reachable(new_mnt, new.dentry, &root)) + if (!is_path_reachable(new_mnt, new->dentry, &root)) return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); @@ -4603,10 +4565,55 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); - chroot_fs_refs(&root, &new); + chroot_fs_refs(&root, new); return 0; } +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * The immutable nullfs filesystem is mounted as the true root of the VFS + * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this, + * allowing pivot_root() to work normally from initramfs. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. 
+ */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct path new __free(path_put) = {}; + struct path old __free(path_put) = {}; + int error; + + error = user_path_at(AT_FDCWD, new_root, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); + if (error) + return error; + + error = user_path_at(AT_FDCWD, put_old, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); + if (error) + return error; + + return path_pivot_root(&new, &old); +} + static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; @@ -5969,24 +5976,62 @@ struct mnt_namespace init_mnt_ns = { static void __init init_mount_tree(void) { - struct vfsmount *mnt; - struct mount *m; + struct vfsmount *mnt, *nullfs_mnt; + struct mount *mnt_root; struct path root; + /* + * We create two mounts: + * + * (1) nullfs with mount id 1 + * (2) mutable rootfs with mount id 2 + * + * with (2) mounted on top of (1). + */ + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); + if (IS_ERR(nullfs_mnt)) + panic("VFS: Failed to create nullfs"); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - m = real_mount(mnt); - init_mnt_ns.root = m; - init_mnt_ns.nr_mounts = 1; - mnt_add_to_ns(&init_mnt_ns, m); + VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); + + /* The namespace root is the nullfs mnt. */ + mnt_root = real_mount(nullfs_mnt); + init_mnt_ns.root = mnt_root; + + /* Mount mutable rootfs on top of nullfs. */ + root.mnt = nullfs_mnt; + root.dentry = nullfs_mnt->mnt_root; + + LOCK_MOUNT_EXACT(mp, &root); + if (unlikely(IS_ERR(mp.parent))) + panic("VFS: Failed to mount rootfs on nullfs"); + scoped_guard(mount_writer) + attach_mnt(real_mount(mnt), mp.parent, mp.mp); + + pr_info("VFS: Finished mounting rootfs on nullfs\n"); + + /* + * We've dropped all locks here but that's fine. 
Not just are we + * the only task that's running, there's no other mount + * namespace in existence and the initial mount namespace is + * completely empty until we add the mounts we just created. + */ + for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { + mnt_add_to_ns(&init_mnt_ns, p); + init_mnt_ns.nr_mounts++; + } + init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); - root.mnt = mnt; - root.dentry = mnt->mnt_root; - + /* The root and pwd always point to the mutable rootfs. */ + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); diff --git a/fs/nullfs.c b/fs/nullfs.c new file mode 100644 index 000000000000..fdbd3e5d3d71 --- /dev/null +++ b/fs/nullfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#include <linux/fs/super_types.h> +#include <linux/fs_context.h> +#include <linux/magic.h> + +static const struct super_operations nullfs_super_operations = { + .statfs = simple_statfs, +}; + +static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct inode *inode; + + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = NULL_FS_MAGIC; + s->s_op = &nullfs_super_operations; + s->s_export_op = NULL; + s->s_xattr = NULL; + s->s_time_gran = 1; + s->s_d_flags = 0; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + + /* nullfs is permanently empty... */ + make_empty_dir_inode(inode); + simple_inode_init_ts(inode); + inode->i_ino = 1; + /* ... and immutable. */ + inode->i_flags |= S_IMMUTABLE; + + s->s_root = d_make_root(inode); + if (!s->s_root) + return -ENOMEM; + + return 0; +} + +/* + * For now this is a single global instance. If needed we can make it + * mountable by userspace at which point we will need to make it + * multi-instance. 
+ */ +static int nullfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, nullfs_fs_fill_super); +} + +static const struct fs_context_operations nullfs_fs_context_ops = { + .get_tree = nullfs_fs_get_tree, +}; + +static int nullfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &nullfs_fs_context_ops; + fc->global = true; + fc->sb_flags = SB_NOUSER; + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; + return 0; +} + +struct file_system_type nullfs_fs_type = { + .name = "nullfs", + .init_fs_context = nullfs_init_fs_context, + .kill_sb = kill_anon_super, +}; |
