ext4: Add support for unprivileged mounts from user namespaces

Support unprivileged mounting of ext4 volumes from user namespaces.
This requires the following changes:

 - Perform all uid and gid conversions to/from disk relative to
   s_user_ns. In many cases this will already be handled by the vfs
   helper functions. This also requires updates to handle cases where
   ids may not map into s_user_ns.

 - Update most capability checks to check for capabilities in
   s_user_ns rather than init_user_ns. These mostly reflect changes to
   the filesystem that a user in s_user_ns could already make
   externally by virtue of having write access to the backing device.

 - Restrict unsafe options in either the mount options or the ext4
   superblock. Currently the only concerning option is errors=panic,
   and this is made to require CAP_SYS_ADMIN in init_user_ns.

 - Verify that unprivileged users have the required access to the
   journal device at the path passed via the journal_path mount
   option.

   Note that for the journal_path and the journal_dev mount options,
   and for external journal devices specified in the ext4 superblock,
   devcgroup restrictions will be enforced by __blkdev_get() (via
   blkdev_get_by_dev()), ensuring that the user has been granted
   appropriate access to the block device.

 - Set the FS_USERNS_MOUNT flag on the filesystem types supported by
   ext4.

sysfs attributes for ext4 mounts remain writable only by real root.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>

ext4: Add support for unprivileged mounts from user namespaces
Support unprivileged mounting of ext4 volumes from user namespaces.
This requires the following changes:

 - Perform all uid and gid conversions to/from disk relative to
   s_user_ns. In many cases this will already be handled by the vfs
   helper functions. This also requires updates to handle cases where
   ids may not map into s_user_ns.

 - Update most capability checks to check for capabilities in
   s_user_ns rather than init_user_ns. These mostly reflect changes to
   the filesystem that a user in s_user_ns could already make
   externally by virtue of having write access to the backing device.

 - Restrict unsafe options in either the mount options or the ext4
   superblock. Currently the only concerning option is errors=panic,
   and this is made to require CAP_SYS_ADMIN in init_user_ns.

 - Verify that unprivileged users have the required access to the
   journal device at the path passed via the journal_path mount
   option.

   Note that for the journal_path and the journal_dev mount options,
   and for external journal devices specified in the ext4 superblock,
   devcgroup restrictions will be enforced by __blkdev_get() (via
   blkdev_get_by_dev()), ensuring that the user has been granted
   appropriate access to the block device.

 - Set the FS_USERNS_MOUNT flag on the filesystem types supported by
   ext4.

sysfs attributes for ext4 mounts remain writable only by real root.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
f463fe5b · Seth Forshee · d033b7ff · f463fe5b · f463fe5b · f463fe5b
Commit f463fe5b authored Oct 18, 2014 by Seth Forshee
6 changed files
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -13,7 +13,7 @@
 * Convert from filesystem to in-memory representation.
 */
 static struct posix_acl *
-ext4_acl_from_disk(const void *value, size_t size)
+ext4_acl_from_disk(struct super_block *sb, const void *value, size_t size)
 {
 	const char *end = (char *)value + size;
 	int n, count;
@@ -57,16 +57,20 @@ ext4_acl_from_disk(const void *value, size_t size)
 			if ((char *)value > end)
 				goto fail;
 			acl->a_entries[n].e_uid =
-				make_kuid(&init_user_ns,
+				make_kuid(sb->s_user_ns,
 					  le32_to_cpu(entry->e_id));
+			if (!uid_valid(acl->a_entries[n].e_uid))
+				goto fail;
 			break;
 		case ACL_GROUP:
 			value = (char *)value + sizeof(ext4_acl_entry);
 			if ((char *)value > end)
 				goto fail;
 			acl->a_entries[n].e_gid =
-				make_kgid(&init_user_ns,
+				make_kgid(sb->s_user_ns,
 					  le32_to_cpu(entry->e_id));
+			if (!gid_valid(acl->a_entries[n].e_gid))
+				goto fail;
 			break;

 		default:
@@ -86,11 +90,14 @@ ext4_acl_from_disk(const void *value, size_t size)
 * Convert from in-memory to filesystem representation.
 */
 static void *
-ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+ext4_acl_to_disk(struct super_block *sb, const struct posix_acl *acl,
+		 size_t *size)
 {
 	ext4_acl_header *ext_acl;
 	char *e;
 	size_t n;
+	uid_t uid;
+	gid_t gid;

 	*size = ext4_acl_size(acl->a_count);
 	ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
@@ -106,13 +113,17 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
 		entry->e_perm = cpu_to_le16(acl_e->e_perm);
 		switch (acl_e->e_tag) {
 		case ACL_USER:
-			entry->e_id = cpu_to_le32(
-				from_kuid(&init_user_ns, acl_e->e_uid));
+			uid = from_kuid(sb->s_user_ns, acl_e->e_uid);
+			if (uid == (uid_t)-1)
+				goto fail;
+			entry->e_id = cpu_to_le32(uid);
 			e += sizeof(ext4_acl_entry);
 			break;
 		case ACL_GROUP:
-			entry->e_id = cpu_to_le32(
-				from_kgid(&init_user_ns, acl_e->e_gid));
+			gid = from_kgid(sb->s_user_ns, acl_e->e_gid);
+			if (gid == (gid_t)-1)
+				goto fail;
+			entry->e_id = cpu_to_le32(gid);
 			e += sizeof(ext4_acl_entry);
 			break;

@@ -165,7 +176,7 @@ ext4_get_acl(struct inode *inode, int type)
 		retval = ext4_xattr_get(inode, name_index, "", value, retval);
 	}
 	if (retval > 0)
-		acl = ext4_acl_from_disk(value, retval);
+		acl = ext4_acl_from_disk(inode->i_sb, value, retval);
 	else if (retval == -ENODATA || retval == -ENOSYS)
 		acl = NULL;
 	else
@@ -218,7 +229,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 		return -EINVAL;
 	}
 	if (acl) {
-		value = ext4_acl_to_disk(acl, &size);
+		value = ext4_acl_to_disk(inode->i_sb, acl, &size);
 		if (IS_ERR(value))
 			return (int)PTR_ERR(value);
 	}

--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -555,8 +555,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,

 	/* Hm, nope.  Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
-	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE) ||
+	    (!gid_eq(sbi->s_resgid, make_kgid(sbi->s_sb->s_user_ns, 0)) && in_group_p(sbi->s_resgid)) ||
+	    ns_capable(sbi->s_sb->s_user_ns, CAP_SYS_RESOURCE) ||
 	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {

 		if (free_clusters >= (nclusters + dirty_clusters +

--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -732,6 +732,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	if (!dir || !dir->i_nlink)
 		return ERR_PTR(-EPERM);

+	/* Supplied owner must be valid */
+	if (owner && (owner[0] == (uid_t)-1 || owner[1] == (uid_t)-1))
+		return ERR_PTR(-EOVERFLOW);
+
 	if ((ext4_encrypted_inode(dir) ||
 	     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
@@ -744,7 +748,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 			nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
 		encrypt = 1;
 	}
-
 	sb = dir->i_sb;
 	ngroups = ext4_get_groups_count(sb);
 	trace_ext4_request_inode(dir, mode);

--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -254,7 +254,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!ns_capable(sb->s_user_ns, CAP_LINUX_IMMUTABLE))
 				goto flags_out;
 		}

@@ -263,7 +263,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!ns_capable(sb->s_user_ns, CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
 		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
@@ -598,7 +598,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		struct fstrim_range range;
 		int ret = 0;

-		if (!capable(CAP_SYS_ADMIN))
+		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 			return -EPERM;

 		if (!blk_queue_discard(q))

--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -20,7 +20,7 @@ int ext4_resize_begin(struct super_block *sb)
 {
 	int ret = 0;

-	if (!capable(CAP_SYS_RESOURCE))
+	if (!ns_capable(sb->s_user_ns, CAP_SYS_RESOURCE))
 		return -EPERM;

 	/*

--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,7 @@
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <linux/cleancache.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>

 #include <linux/kthread.h>
@@ -91,7 +92,7 @@ static struct file_system_type ext2_fs_type = {
 	.name		= "ext2",
 	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("ext2");
 MODULE_ALIAS("ext2");
@@ -106,7 +107,7 @@ static struct file_system_type ext3_fs_type = {
 	.name		= "ext3",
 	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("ext3");
 MODULE_ALIAS("ext3");
@@ -1509,6 +1510,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		return -1;
 	}

+	if (token == Opt_err_panic && !capable(CAP_SYS_ADMIN)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Mount option \"%s\" not allowed for unprivileged mounts",
+			 opt);
+		return -1;
+	}
+
 	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
 		return -1;
 	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
@@ -1551,14 +1559,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	} else if (token == Opt_stripe) {
 		sbi->s_stripe = arg;
 	} else if (token == Opt_resuid) {
-		uid = make_kuid(current_user_ns(), arg);
+		uid = make_kuid(sb->s_user_ns, arg);
 		if (!uid_valid(uid)) {
 			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
 			return -1;
 		}
 		sbi->s_resuid = uid;
 	} else if (token == Opt_resgid) {
-		gid = make_kgid(current_user_ns(), arg);
+		gid = make_kgid(sb->s_user_ns, arg);
 		if (!gid_valid(gid)) {
 			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
 			return -1;
@@ -1597,6 +1605,19 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			return -1;
 		}

+		/*
+		 * Refuse access for unprivileged mounts if the user does
+		 * not have rw access to the journal device via the supplied
+		 * path.
+		 */
+		if (!capable(CAP_SYS_ADMIN) &&
+		    inode_permission(d_inode(path.dentry), MAY_READ|MAY_WRITE)) {
+			ext4_msg(sb, KERN_ERR,
+				 "error: Insufficient access to journal path %s",
+				 journal_path);
+			return -1;
+		}
+
 		journal_inode = d_inode(path.dentry);
 		if (!S_ISBLK(journal_inode->i_mode)) {
 			ext4_msg(sb, KERN_ERR, "error: journal path %s "
@@ -1827,14 +1848,14 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 		SEQ_OPTS_PRINT("%s", token2str(m->token));
 	}

-	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
+	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(sb->s_user_ns, EXT4_DEF_RESUID)) ||
 	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
 		SEQ_OPTS_PRINT("resuid=%u",
-				from_kuid_munged(&init_user_ns, sbi->s_resuid));
-	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
+				from_kuid_munged(sb->s_user_ns, sbi->s_resuid));
+	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(sb->s_user_ns, EXT4_DEF_RESGID)) ||
 	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
 		SEQ_OPTS_PRINT("resgid=%u",
-				from_kgid_munged(&init_user_ns, sbi->s_resgid));
+				from_kgid_munged(sb->s_user_ns, sbi->s_resgid));
 	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
 	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
 		SEQ_OPTS_PUTS("errors=remount-ro");
@@ -2606,7 +2627,7 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
 {
 	int len = count;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(sbi->s_sb->s_user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

 	if (len && buf[len-1] == '\n')
@@ -3590,19 +3611,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
 		set_opt(sb, WRITEBACK_DATA);

-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) {
+		if (!capable(CAP_SYS_ADMIN))
+			goto failed_mount;
 		set_opt(sb, ERRORS_PANIC);
-	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+	} else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) {
 		set_opt(sb, ERRORS_CONT);
-	else
+	} else {
 		set_opt(sb, ERRORS_RO);
+	}
 	/* block_validity enabled by default; disable with noblock_validity */
 	set_opt(sb, BLOCK_VALIDITY);
 	if (def_mount_opts & EXT4_DEFM_DISCARD)
 		set_opt(sb, DISCARD);

-	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
-	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+	sbi->s_resuid = make_kuid(sb->s_user_ns, le16_to_cpu(es->s_def_resuid));
+	if (!uid_valid(sbi->s_resuid))
+		sbi->s_resuid = make_kuid(sb->s_user_ns, EXT4_DEF_RESUID);
+	sbi->s_resgid = make_kgid(sb->s_user_ns, le16_to_cpu(es->s_def_resgid));
+	if (!gid_valid(sbi->s_resgid))
+		sbi->s_resgid = make_kgid(sb->s_user_ns, EXT4_DEF_RESGID);
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
@@ -4364,6 +4392,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_blkdev_remove(sbi);
 	brelse(bh);
 out_fail:
+	/* sb->s_user_ns will be put when sb is destroyed */
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
@@ -5582,7 +5611,7 @@ static struct file_system_type ext4_fs_type = {
 	.name		= "ext4",
 	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("ext4");