super.c 40.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
Linus Torvalds's avatar
Linus Torvalds committed
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
19
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
Linus Torvalds's avatar
Linus Torvalds committed
20 21 22 23
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

24
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
25 26 27 28 29 30
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>		/* for the emergency remount stuff */
#include <linux/idr.h>
Ingo Molnar's avatar
Ingo Molnar committed
31
#include <linux/mutex.h>
32
#include <linux/backing-dev.h>
33
#include <linux/rculist_bl.h>
34
#include <linux/cleancache.h>
Al Viro's avatar
Al Viro committed
35
#include <linux/fsnotify.h>
36
#include <linux/lockdep.h>
37
#include <linux/user_namespace.h>
38
#include "internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
39

40
static int thaw_super_locked(struct super_block *sb);
Linus Torvalds's avatar
Linus Torvalds committed
41

42 43
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
Linus Torvalds's avatar
Linus Torvalds committed
44

45 46 47 48 49 50
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
	"sb_writers",
	"sb_pagefaults",
	"sb_internal",
};

51 52 53 54 55 56 57
/*
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
 * take a passive reference to the superblock to avoid this from occurring.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	struct super_block *sb;
	long	fs_objects = 0;
	long	total_objects;
	long	freed = 0;
	long	dentries;
	long	inodes;

	sb = container_of(shrink, struct super_block, s_shrink);

	/*
	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
	 * to recurse into the FS that called us in clear_inode() and friends..
	 */
	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;

	/* Takes s_umount shared and checks SB_BORN; dropped via up_read below */
	if (!trylock_super(sb))
		return SHRINK_STOP;

	if (sb->s_op->nr_cached_objects)
		fs_objects = sb->s_op->nr_cached_objects(sb, sc);

	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
	/* +1 keeps the divisor non-zero for the mult_frac() scaling below */
	total_objects = dentries + inodes + fs_objects + 1;
	if (!total_objects)
		total_objects = 1;

	/* proportion the scan between the caches */
	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

	/*
	 * prune the dcache first as the icache is pinned by it, then
	 * prune the icache, followed by the filesystem specific caches
	 *
	 * Ensure that we always scan at least one object - memcg kmem
	 * accounting uses this to fully empty the caches.
	 */
	sc->nr_to_scan = dentries + 1;
	freed = prune_dcache_sb(sb, sc);
	sc->nr_to_scan = inodes + 1;
	freed += prune_icache_sb(sb, sc);

	if (fs_objects) {
		sc->nr_to_scan = fs_objects + 1;
		freed += sb->s_op->free_cached_objects(sb, sc);
	}

	up_read(&sb->s_umount);
	return freed;
}

/*
 * Lockless estimate of the number of reclaimable objects on this superblock,
 * scaled by vfs_pressure_ratio().  See the comment below for why no lock is
 * taken here.
 */
static unsigned long super_cache_count(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	struct super_block *sb;
	long	total_objects = 0;

	sb = container_of(shrink, struct super_block, s_shrink);

	/*
	 * We don't call trylock_super() here as it is a scalability bottleneck,
	 * so we're exposed to partial setup state. The shrinker rwsem does not
	 * protect filesystem operations backing list_lru_shrink_count() or
	 * s_op->nr_cached_objects(). Counts can change between
	 * super_cache_count and super_cache_scan, so we really don't need locks
	 * here.
	 *
	 * However, if we are currently mounting the superblock, the underlying
	 * filesystem might be in a state of partial construction and hence it
	 * is dangerous to access it.  trylock_super() uses a SB_BORN check to
	 * avoid this situation, so do the same here. The memory barrier is
	 * matched with the one in mount_fs() as we don't hold locks here.
	 */
	if (!(sb->s_flags & SB_BORN))
		return 0;
	smp_rmb();

	if (sb->s_op && sb->s_op->nr_cached_objects)
		total_objects = sb->s_op->nr_cached_objects(sb, sc);

	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

	total_objects = vfs_pressure_ratio(total_objects);
	return total_objects;
}

151 152 153 154 155 156 157
/*
 * Final stage of superblock destruction: tear down the per-level freeze
 * rwsems and free the structure.  Runs from a workqueue (process context),
 * scheduled by destroy_super_rcu() or called directly for superblocks that
 * were never exposed (see destroy_unused_super()).
 */
static void destroy_super_work(struct work_struct *work)
{
	struct super_block *s = container_of(work, struct super_block,
							destroy_work);
	int i;

	for (i = 0; i < SB_FREEZE_LEVELS; i++)
		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
	kfree(s);
}

/*
 * RCU callback for superblock freeing: defer the real work to a workqueue
 * so it runs in process context rather than RCU callback context.
 */
static void destroy_super_rcu(struct rcu_head *head)
{
	struct super_block *s = container_of(head, struct super_block, rcu);
	INIT_WORK(&s->destroy_work, destroy_super_work);
	schedule_work(&s->destroy_work);
}

169 170
/* Free a superblock that has never been seen by anyone */
static void destroy_unused_super(struct super_block *s)
{
	/* NULL-tolerant so error paths in alloc_super()/sget_userns() can
	 * call this unconditionally. */
	if (!s)
		return;
	/* alloc_super() returned it with s_umount held for write */
	up_write(&s->s_umount);
	list_lru_destroy(&s->s_dentry_lru);
	list_lru_destroy(&s->s_inode_lru);
	security_sb_free(s);
	put_user_ns(s->s_user_ns);
	kfree(s->s_subtype);
	free_prealloced_shrinker(&s->s_shrink);
	/* no delays needed */
	destroy_super_work(&s->destroy_work);
}

Linus Torvalds's avatar
Linus Torvalds committed
185 186
/**
 *	alloc_super	-	create new superblock
 *	@type:	filesystem type superblock should belong to
 *	@flags: the mount flags
 *	@user_ns: User namespace for the super_block
 *
 *	Allocates and initializes a new &struct super_block.  alloc_super()
 *	returns a pointer new superblock or %NULL if allocation had failed.
 *	On success the superblock is returned with s_umount held for write.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
				       struct user_namespace *user_ns)
{
	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
	static const struct super_operations default_op;
	int i;

	if (!s)
		return NULL;

	INIT_LIST_HEAD(&s->s_mounts);
	s->s_user_ns = get_user_ns(user_ns);
	init_rwsem(&s->s_umount);
	lockdep_set_class(&s->s_umount, &type->s_umount_key);
	/*
	 * sget() can have s_umount recursion.
	 *
	 * When it cannot find a suitable sb, it allocates a new
	 * one (this one), and tries again to find a suitable old
	 * one.
	 *
	 * In case that succeeds, it will acquire the s_umount
	 * lock of the old one. Since these are clearly distinct
	 * locks, and this object isn't exposed yet, there's no
	 * risk of deadlocks.
	 *
	 * Annotate this by putting this lock in a different
	 * subclass.
	 */
	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);

	if (security_sb_alloc(s))
		goto fail;

	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
					sb_writers_name[i],
					&type->s_writers_key[i]))
			goto fail;
	}
	init_waitqueue_head(&s->s_writers.wait_unfrozen);
	s->s_bdi = &noop_backing_dev_info;
	s->s_flags = flags;
	/* Mounts from a non-initial userns may not expose device nodes */
	if (s->s_user_ns != &init_user_ns)
		s->s_iflags |= SB_I_NODEV;
	INIT_HLIST_NODE(&s->s_instances);
	INIT_HLIST_BL_HEAD(&s->s_roots);
	mutex_init(&s->s_sync_lock);
	INIT_LIST_HEAD(&s->s_inodes);
	spin_lock_init(&s->s_inode_list_lock);
	INIT_LIST_HEAD(&s->s_inodes_wb);
	spin_lock_init(&s->s_inode_wblist_lock);

	if (list_lru_init_memcg(&s->s_dentry_lru))
		goto fail;
	if (list_lru_init_memcg(&s->s_inode_lru))
		goto fail;
	s->s_count = 1;
	atomic_set(&s->s_active, 1);
	mutex_init(&s->s_vfs_rename_mutex);
	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
	init_rwsem(&s->s_dquot.dqio_sem);
	s->s_maxbytes = MAX_NON_LFS;
	s->s_op = &default_op;
	s->s_time_gran = 1000000000;
	s->cleancache_poolid = CLEANCACHE_NO_POOL;

	s->s_shrink.seeks = DEFAULT_SEEKS;
	s->s_shrink.scan_objects = super_cache_scan;
	s->s_shrink.count_objects = super_cache_count;
	s->s_shrink.batch = 1024;
	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
	/* Registration itself happens in sget_userns() once the sb is live */
	if (prealloc_shrinker(&s->s_shrink))
		goto fail;
	return s;

fail:
	/* copes with a partially-constructed superblock */
	destroy_unused_super(s);
	return NULL;
}

/* Superblock refcounting  */

/*
 *	Drop a superblock's refcount.  The caller must hold sb_lock.
 */
static void __put_super(struct super_block *s)
{
	if (!--s->s_count) {
		list_del_init(&s->s_list);
		/* lru lists must already be torn down, see deactivate_locked_super() */
		WARN_ON(s->s_dentry_lru.node);
		WARN_ON(s->s_inode_lru.node);
		WARN_ON(!list_empty(&s->s_mounts));
		security_sb_free(s);
		put_user_ns(s->s_user_ns);
		kfree(s->s_subtype);
		/* actual freeing is RCU-deferred, then pushed to a workqueue */
		call_rcu(&s->rcu, destroy_super_rcu);
	}
}

/**
 *	put_super	-	drop a temporary reference to superblock
 *	@sb: superblock in question
 *
 *	Drops a temporary reference, frees superblock if there's no
 *	references left.
 */
static void put_super(struct super_block *sb)
{
	spin_lock(&sb_lock);
	__put_super(sb);
	spin_unlock(&sb_lock);
}


/**
 *	deactivate_locked_super	-	drop an active reference to superblock
 *	@s: superblock to deactivate
 *
 *	Drops an active reference to superblock, converting it into a temporary
 *	one if there is no other active references left.  In that case we
 *	tell fs driver to shut it down and drop the temporary reference we
 *	had just acquired.
 *
 *	Caller holds exclusive lock on superblock; that lock is released.
 */
void deactivate_locked_super(struct super_block *s)
{
	struct file_system_type *fs = s->s_type;
	if (atomic_dec_and_test(&s->s_active)) {
		cleancache_invalidate_fs(s);
		unregister_shrinker(&s->s_shrink);
		/* ->kill_sb() is expected to release s_umount itself */
		fs->kill_sb(s);

		/*
		 * Since list_lru_destroy() may sleep, we cannot call it from
		 * put_super(), where we hold the sb_lock. Therefore we destroy
		 * the lru lists right now.
		 */
		list_lru_destroy(&s->s_dentry_lru);
		list_lru_destroy(&s->s_inode_lru);

		put_filesystem(fs);
		put_super(s);
	} else {
		up_write(&s->s_umount);
	}
}

EXPORT_SYMBOL(deactivate_locked_super);
Linus Torvalds's avatar
Linus Torvalds committed
344

345
/**
 *	deactivate_super	-	drop an active reference to superblock
 *	@s: superblock to deactivate
 *
 *	Variant of deactivate_locked_super(), except that superblock is *not*
 *	locked by caller.  If we are going to drop the final active reference,
 *	lock will be acquired prior to that.
 */
void deactivate_super(struct super_block *s)
{
	/* fast path: drop s_active without the lock unless we may hit zero */
        if (!atomic_add_unless(&s->s_active, -1, 1)) {
		down_write(&s->s_umount);
		deactivate_locked_super(s);
	}
}

EXPORT_SYMBOL(deactivate_super);
362

Linus Torvalds's avatar
Linus Torvalds committed
363 364 365 366 367 368 369 370 371
/**
 *	grab_super - acquire an active reference
 *	@s: reference we are trying to make active
 *
 *	Tries to acquire an active reference.  grab_super() is used when we
 * 	had just found a superblock in super_blocks or fs_type->fs_supers
 *	and want to turn it into a full-blown active reference.  grab_super()
 *	is called with sb_lock held and drops it.  Returns 1 in case of
 *	success, 0 if we had failed (superblock contents was already dead or
 *	dying when grab_super() had been called).  Note that this is only
 *	called for superblocks not in rundown mode (== ones still on ->fs_supers
 *	of their type), so increment of ->s_count is OK here.
 */
static int grab_super(struct super_block *s) __releases(sb_lock)
{
	s->s_count++;
	spin_unlock(&sb_lock);
	down_write(&s->s_umount);
	if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
		/* active ref taken; drop the temporary s_count ref,
		 * returning with s_umount still held for write */
		put_super(s);
		return 1;
	}
	up_write(&s->s_umount);
	put_super(s);
	return 0;
}

390
/*
 *	trylock_super - try to grab ->s_umount shared
 *	@sb: reference we are trying to grab
 *
 *	Try to prevent fs shutdown.  This is used in places where we
 *	cannot take an active reference but we need to ensure that the
 *	filesystem is not shut down while we are working on it. It returns
 *	false if we cannot acquire s_umount or if we lose the race and
 *	filesystem already got into shutdown, and returns true with the s_umount
 *	lock held in read mode in case of success. On successful return,
 *	the caller must drop the s_umount lock when done.
 *
 *	Note that unlike get_super() et.al. this one does *not* bump ->s_count.
 *	The reason why it's safe is that we are OK with doing trylock instead
 *	of down_read().  There's a couple of places that are OK with that, but
 *	it's very much not a general-purpose interface.
 */
bool trylock_super(struct super_block *sb)
{
	if (down_read_trylock(&sb->s_umount)) {
		/* still hashed, has a root and finished mounting? */
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN))
			return true;
		up_read(&sb->s_umount);
	}

	return false;
}

Linus Torvalds's avatar
Linus Torvalds committed
419 420 421 422 423 424 425 426 427
/**
 *	generic_shutdown_super	-	common helper for ->kill_sb()
 *	@sb: superblock to kill
 *
 *	generic_shutdown_super() does all fs-independent work on superblock
 *	shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *	that need destruction out of superblock, call generic_shutdown_super()
 *	and release aforementioned objects.  Note: dentries and inodes _are_
 *	taken care of and do not need specific handling.
 *
 *	Upon calling this function, the filesystem may no longer alter or
 *	rearrange the set of dentries belonging to this super_block, nor may it
 *	change the attachments of dentries to inodes.
 */
void generic_shutdown_super(struct super_block *sb)
{
	const struct super_operations *sop = sb->s_op;

	if (sb->s_root) {
		shrink_dcache_for_umount(sb);
		sync_filesystem(sb);
		sb->s_flags &= ~SB_ACTIVE;

		fsnotify_unmount_inodes(sb);
		cgroup_writeback_umount();

		evict_inodes(sb);

		/* tear down the direct-IO completion workqueue, if any */
		if (sb->s_dio_done_wq) {
			destroy_workqueue(sb->s_dio_done_wq);
			sb->s_dio_done_wq = NULL;
		}

		if (sop->put_super)
			sop->put_super(sb);

		/* anything still on s_inodes at this point is a bug in the fs */
		if (!list_empty(&sb->s_inodes)) {
			printk("VFS: Busy inodes after unmount of %s. "
			   "Self-destruct in 5 seconds.  Have a nice day...\n",
			   sb->s_id);
		}
	}
	spin_lock(&sb_lock);
	/* should be initialized for __put_super_and_need_restart() */
	hlist_del_init(&sb->s_instances);
	spin_unlock(&sb_lock);
	up_write(&sb->s_umount);
	if (sb->s_bdi != &noop_backing_dev_info) {
		bdi_put(sb->s_bdi);
		sb->s_bdi = &noop_backing_dev_info;
	}
}

EXPORT_SYMBOL(generic_shutdown_super);

/**
 *	sget_userns -	find or create a superblock
 *	@type:	filesystem type superblock should belong to
 *	@test:	comparison callback
 *	@set:	setup callback
 *	@flags:	mount flags
 *	@user_ns: User namespace for the super_block
 *	@data:	argument to each of them
 *
 *	Returns an existing matching superblock with an active reference, or
 *	a newly allocated one (initialized via @set, s_umount held for write),
 *	or an ERR_PTR on failure.
 */
struct super_block *sget_userns(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			int flags, struct user_namespace *user_ns,
			void *data)
{
	struct super_block *s = NULL;
	struct super_block *old;
	int err;

	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) &&
	    !(type->fs_flags & FS_USERNS_MOUNT) &&
	    !capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);
retry:
	spin_lock(&sb_lock);
	if (test) {
		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
			if (!test(old, data))
				continue;
			/* a match in a different userns is not reusable */
			if (user_ns != old->s_user_ns) {
				spin_unlock(&sb_lock);
				destroy_unused_super(s);
				return ERR_PTR(-EBUSY);
			}
			/* grab_super() drops sb_lock; retry if sb was dying */
			if (!grab_super(old))
				goto retry;
			/* discard the speculative allocation, if any */
			destroy_unused_super(s);
			return old;
		}
	}
	if (!s) {
		spin_unlock(&sb_lock);
		s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
		if (!s)
			return ERR_PTR(-ENOMEM);
		/* re-check the list: someone may have raced us */
		goto retry;
	}

	err = set(s, data);
	if (err) {
		spin_unlock(&sb_lock);
		destroy_unused_super(s);
		return ERR_PTR(err);
	}
	s->s_type = type;
	strlcpy(s->s_id, type->name, sizeof(s->s_id));
	list_add_tail(&s->s_list, &super_blocks);
	hlist_add_head(&s->s_instances, &type->fs_supers);
	spin_unlock(&sb_lock);
	get_filesystem(type);
	/* shrinker storage was preallocated in alloc_super() */
	register_shrinker_prepared(&s->s_shrink);
	return s;
}

EXPORT_SYMBOL(sget_userns);

/**
 *	sget	-	find or create a superblock
 *	@type:	  filesystem type superblock should belong to
 *	@test:	  comparison callback
 *	@set:	  setup callback
 *	@flags:	  mount flags
 *	@data:	  argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			int flags,
			void *data)
{
	struct user_namespace *user_ns = current_user_ns();

	/* We don't yet pass the user namespace of the parent
	 * mount through to here so always use &init_user_ns
	 * until that changes.
	 */
	if (flags & SB_SUBMOUNT)
		user_ns = &init_user_ns;

	/* Ensure the requestor has permissions over the target filesystem */
	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	return sget_userns(type, test, set, flags, user_ns, data);
}

EXPORT_SYMBOL(sget);

/*
 * Release a superblock obtained via get_super() and friends: drop the
 * shared s_umount lock, then the temporary reference.
 */
void drop_super(struct super_block *sb)
{
	up_read(&sb->s_umount);
	put_super(sb);
}

EXPORT_SYMBOL(drop_super);

580 581 582 583 584 585 586
/*
 * Counterpart of get_super_exclusive_thawed(): drop the exclusive
 * s_umount lock, then the temporary reference.
 */
void drop_super_exclusive(struct super_block *sb)
{
	up_write(&sb->s_umount);
	put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
/*
 * Walk all hashed superblocks, calling @f on each one without taking
 * s_umount (callers' @f does its own locking).  A temporary s_count
 * reference keeps the current entry alive while sb_lock is dropped;
 * the previous entry's reference is released one step behind (@p).
 */
static void __iterate_supers(void (*f)(struct super_block *))
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		sb->s_count++;
		spin_unlock(&sb_lock);

		f(sb);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}
Al Viro's avatar
Al Viro committed
609 610 611 612 613 614 615 616 617 618
/**
 *	iterate_supers - call function for all active superblocks
 *	@f: function to call
 *	@arg: argument to pass to it
 *
 *	Scans the superblock list and calls given function, passing it
 *	locked superblock and given argument.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		/* pin sb so it survives dropping sb_lock */
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		/* only call @f on fully-born, still-mounted superblocks */
		if (sb->s_root && (sb->s_flags & SB_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}

Al Viro's avatar
Al Viro committed
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
/**
 *	iterate_supers_type - call function for superblocks of given type
 *	@type: fs type
 *	@f: function to call
 *	@arg: argument to pass to it
 *
 *	Scans the superblock list and calls given function, passing it
 *	locked superblock and given argument.
 */
void iterate_supers_type(struct file_system_type *type,
	void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
		/* pin sb so it survives dropping sb_lock */
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		if (sb->s_root && (sb->s_flags & SB_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

679
/*
 * Find the superblock mounted on @bdev and return it with s_umount held
 * (shared if !@excl, exclusive otherwise) and a temporary reference taken.
 * Returns NULL if no live superblock is found.
 */
static struct super_block *__get_super(struct block_device *bdev, bool excl)
{
	struct super_block *sb;

	if (!bdev)
		return NULL;

	spin_lock(&sb_lock);
rescan:
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_bdev == bdev) {
			sb->s_count++;
			spin_unlock(&sb_lock);
			if (!excl)
				down_read(&sb->s_umount);
			else
				down_write(&sb->s_umount);
			/* still alive? */
			if (sb->s_root && (sb->s_flags & SB_BORN))
				return sb;
			if (!excl)
				up_read(&sb->s_umount);
			else
				up_write(&sb->s_umount);
			/* nope, got unmounted */
			spin_lock(&sb_lock);
			__put_super(sb);
			/* list may have changed while sb_lock was dropped */
			goto rescan;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}

715
/**
 *	get_super - get the superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device given. %NULL is returned if no match is found.
 *	On success the superblock is returned with s_umount held shared and
 *	a reference taken; release with drop_super().
 */
struct super_block *get_super(struct block_device *bdev)
{
	return __get_super(bdev, false);
}
EXPORT_SYMBOL(get_super);

/*
 * Like __get_super(), but if the superblock is frozen, drop the lock,
 * wait for it to thaw, and retry the lookup from scratch.
 */
static struct super_block *__get_super_thawed(struct block_device *bdev,
					      bool excl)
{
	while (1) {
		struct super_block *s = __get_super(bdev, excl);
		if (!s || s->s_writers.frozen == SB_UNFROZEN)
			return s;
		/* can't sleep on the waitqueue holding s_umount */
		if (!excl)
			up_read(&s->s_umount);
		else
			up_write(&s->s_umount);
		wait_event(s->s_writers.wait_unfrozen,
			   s->s_writers.frozen == SB_UNFROZEN);
		put_super(s);
	}
}
744 745 746 747 748 749 750 751 752 753 754 755 756 757

/**
 *	get_super_thawed - get thawed superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device. The superblock is returned once it is thawed
 *	(or immediately if it was not frozen). %NULL is returned if no match
 *	is found.
 */
struct super_block *get_super_thawed(struct block_device *bdev)
{
	return __get_super_thawed(bdev, false);
}
EXPORT_SYMBOL(get_super_thawed);

760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
/**
 *	get_super_exclusive_thawed - get thawed superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device. The superblock is returned once it is thawed
 *	(or immediately if it was not frozen) and s_umount semaphore is held
 *	in exclusive mode. %NULL is returned if no match is found.
 */
struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
{
	return __get_super_thawed(bdev, true);
}
EXPORT_SYMBOL(get_super_exclusive_thawed);

775 776 777 778 779 780
/**
 * get_active_super - get an active reference to the superblock of a device
 * @bdev: device to get the superblock for
 *
 * Scans the superblock list and finds the superblock of the file system
 * mounted on the device given.  Returns the superblock with an active
 * reference or %NULL if none was found.
 */
struct super_block *get_active_super(struct block_device *bdev)
{
	struct super_block *sb;

	if (!bdev)
		return NULL;

restart:
	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_bdev == bdev) {
			/* grab_super() drops sb_lock; restart scan on failure */
			if (!grab_super(sb))
				goto restart;
			/* keep the active ref, but drop the write lock */
			up_write(&sb->s_umount);
			return sb;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}
805

806
/*
 * Find the superblock with s_dev == @dev; used by the ustat(2) path.
 * On success it is returned with s_umount held shared and a temporary
 * reference taken; NULL if no live match exists.
 */
struct super_block *user_get_super(dev_t dev)
{
	struct super_block *sb;

	spin_lock(&sb_lock);
rescan:
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_dev ==  dev) {
			sb->s_count++;
			spin_unlock(&sb_lock);
			down_read(&sb->s_umount);
			/* still alive? */
			if (sb->s_root && (sb->s_flags & SB_BORN))
				return sb;
			up_read(&sb->s_umount);
			/* nope, got unmounted */
			spin_lock(&sb_lock);
			__put_super(sb);
			goto rescan;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}

/**
 *	do_remount_sb - asks filesystem to change mount options.
 *	@sb:	superblock in question
 *	@sb_flags: revised superblock flags
 *	@data:	the rest of options
 *      @force: whether or not to force the change
 *
 *	Alters the mount options of a mounted file system.
 *
 *	NOTE(review): appears to require s_umount held for write — it
 *	drops and retakes it around group_pin_kill() below; confirm at
 *	call sites.
 */
int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
{
	int retval;
	int remount_ro;

	if (sb->s_writers.frozen != SB_UNFROZEN)
		return -EBUSY;

#ifdef CONFIG_BLOCK
	if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
		return -EACCES;
#endif

	remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);

	if (remount_ro) {
		if (!hlist_empty(&sb->s_pins)) {
			/* killing pins can sleep, so drop s_umount around it */
			up_write(&sb->s_umount);
			group_pin_kill(&sb->s_pins);
			down_write(&sb->s_umount);
			/* re-validate state after the lock was dropped */
			if (!sb->s_root)
				return 0;
			if (sb->s_writers.frozen != SB_UNFROZEN)
				return -EBUSY;
			remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
		}
	}
	shrink_dcache_sb(sb);

	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
	if (remount_ro) {
		if (force) {
			sb->s_readonly_remount = 1;
			smp_wmb();
		} else {
			retval = sb_prepare_remount_readonly(sb);
			if (retval)
				return retval;
		}
	}

	if (sb->s_op->remount_fs) {
		retval = sb->s_op->remount_fs(sb, &sb_flags, data);
		if (retval) {
			if (!force)
				goto cancel_readonly;
			/* If forced remount, go ahead despite any errors */
			WARN(1, "forced remount of a %s fs returned %i\n",
			     sb->s_type->name, retval);
		}
	}
	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
	/* Needs to be ordered wrt mnt_is_readonly() */
	smp_wmb();
	sb->s_readonly_remount = 0;

	/*
	 * Some filesystems modify their metadata via some other path than the
	 * bdev buffer cache (eg. use a private mapping, or directories in
	 * pagecache, etc). Also file data modifications go via their own
	 * mappings. So If we try to mount readonly then copy the filesystem
	 * from bdev, we could get stale data, so invalidate it to give a best
	 * effort at coherency.
	 */
	if (remount_ro && sb->s_bdev)
		invalidate_bdev(sb->s_bdev);
	return 0;

cancel_readonly:
	sb->s_readonly_remount = 0;
	return retval;
}

916
/*
 * Per-superblock worker for the emergency remount: force a writable,
 * block-device-backed, fully-born superblock to read-only.  Called with
 * no locks held; takes and releases s_umount itself.
 */
static void do_emergency_remount_callback(struct super_block *sb)
{
	down_write(&sb->s_umount);
	if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
	    !sb_rdonly(sb)) {
		/*
		 * What lock protects sb->s_flags??
		 */
		do_remount_sb(sb, SB_RDONLY, NULL, 1);	/* force=1: ignore errors */
	}
	up_write(&sb->s_umount);
}

static void do_emergency_remount(struct work_struct *work)
{
	__iterate_supers(do_emergency_remount_callback);
932
	kfree(work);
Linus Torvalds's avatar
Linus Torvalds committed
933 934 935 936 937
	printk("Emergency Remount complete\n");
}

void emergency_remount(void)
{
938 939 940 941 942 943 944
	struct work_struct *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_emergency_remount);
		schedule_work(work);
	}
Linus Torvalds's avatar
Linus Torvalds committed
945 946
}

947 948 949
/*
 * Per-superblock worker for the emergency thaw.  Takes s_umount; on the
 * thaw path ownership of s_umount is handed to thaw_super_locked(), which
 * drops it, otherwise we drop it ourselves.
 */
static void do_thaw_all_callback(struct super_block *sb)
{
	down_write(&sb->s_umount);
	if (sb->s_root && sb->s_flags & SB_BORN) {
		emergency_thaw_bdev(sb);
		thaw_super_locked(sb);	/* drops s_umount */
	} else {
		up_write(&sb->s_umount);
	}
}

/*
 * Workqueue handler scheduled by emergency_thaw_all(): thaw every frozen
 * superblock, then free the work item allocated by the scheduler.
 */
static void do_thaw_all(struct work_struct *work)
{
	__iterate_supers(do_thaw_all_callback);
	kfree(work);
	printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq
 */
void emergency_thaw_all(void)
{
	struct work_struct *w = kmalloc(sizeof(*w), GFP_ATOMIC);

	if (!w)
		return;		/* best effort under SysRq */
	INIT_WORK(w, do_thaw_all);
	schedule_work(w);
}

Linus Torvalds's avatar
Linus Torvalds committed
981 982 983 984 985
/*
 * Unnamed block devices are dummy devices used by virtual
 * filesystems which don't use real block-devices.  -- jrs
 */

static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects unnamed_dev_ida and unnamed_dev_start */
/* Many userspace utilities consider an FSID of 0 invalid.
 * Always return at least 1 from get_anon_bdev.
 */
static int unnamed_dev_start = 1;
Linus Torvalds's avatar
Linus Torvalds committed
992

993
/*
 * Allocate an anonymous device number (major 0) for a filesystem with no
 * real backing device.  On success stores the dev_t in *p and returns 0;
 * returns -ENOMEM or -EMFILE (minor space exhausted) on failure.
 */
int get_anon_bdev(dev_t *p)
{
	int dev;
	int error;

 retry:
	/* Preload IDA memory; the actual insert happens under the spinlock. */
	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
		return -ENOMEM;
	spin_lock(&unnamed_dev_lock);
	error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
	if (!error)
		unnamed_dev_start = dev + 1;	/* hint for the next allocation */
	spin_unlock(&unnamed_dev_lock);
	if (error == -EAGAIN)
		/* We raced and lost with another CPU. */
		goto retry;
	else if (error)
		return -EAGAIN;

	/* Anonymous devices only have minor numbers; major is always 0. */
	if (dev >= (1 << MINORBITS)) {
		spin_lock(&unnamed_dev_lock);
		ida_remove(&unnamed_dev_ida, dev);
		if (unnamed_dev_start > dev)
			unnamed_dev_start = dev;
		spin_unlock(&unnamed_dev_lock);
		return -EMFILE;
	}
	*p = MKDEV(0, dev & MINORMASK);
	return 0;
}
EXPORT_SYMBOL(get_anon_bdev);
Linus Torvalds's avatar
Linus Torvalds committed
1024

1025
/*
 * Release a device number previously obtained from get_anon_bdev(),
 * making its minor available for reuse.
 */
void free_anon_bdev(dev_t dev)
{
	int slot = MINOR(dev);
	spin_lock(&unnamed_dev_lock);
	ida_remove(&unnamed_dev_ida, slot);
	if (slot < unnamed_dev_start)
		unnamed_dev_start = slot;	/* reuse the lowest free slot first */
	spin_unlock(&unnamed_dev_lock);
}
EXPORT_SYMBOL(free_anon_bdev);

/* sget() "set" callback: give the new superblock an anonymous dev_t. */
int set_anon_super(struct super_block *s, void *data)
{
	return get_anon_bdev(&s->s_dev);
}

EXPORT_SYMBOL(set_anon_super);

/*
 * Tear down a superblock that uses an anonymous device number.  The dev_t
 * is captured first so it can be returned to the pool after shutdown.
 */
void kill_anon_super(struct super_block *sb)
{
	dev_t dev = sb->s_dev;
	generic_shutdown_super(sb);
	free_anon_bdev(dev);
}

EXPORT_SYMBOL(kill_anon_super);

/*
 * Variant of kill_anon_super() for filesystems that keep pinned dentries
 * (e.g. "litter" filesystems): unpin the whole tree first.
 */
void kill_litter_super(struct super_block *sb)
{
	struct dentry *root = sb->s_root;

	if (root)
		d_genocide(root);
	kill_anon_super(sb);
}

EXPORT_SYMBOL(kill_litter_super);

1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071
/* sget() match callback: a super matches if it was created for this ns. */
static int ns_test_super(struct super_block *sb, void *data)
{
	return data == sb->s_fs_info;
}

/* sget() set callback: stash the namespace, then grab an anonymous dev_t. */
static int ns_set_super(struct super_block *sb, void *data)
{
	sb->s_fs_info = data;

	return set_anon_super(sb, NULL);
}

1072 1073 1074
/*
 * Mount helper for namespace-keyed filesystems (one superblock per
 * namespace object @ns).  Returns the root dentry or an ERR_PTR.
 */
struct dentry *mount_ns(struct file_system_type *fs_type,
	int flags, void *data, void *ns, struct user_namespace *user_ns,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct super_block *sb;

	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
	 * over the namespace.
	 */
	if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
			 user_ns, ns);
	if (IS_ERR(sb))
		return ERR_CAST(sb);

	/* s_root is NULL only for a freshly allocated superblock. */
	if (!sb->s_root) {
		int err;
		err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(sb);
			return ERR_PTR(err);
		}

		sb->s_flags |= SB_ACTIVE;
	}

	return dget(sb->s_root);
}

EXPORT_SYMBOL(mount_ns);
1104

1105
#ifdef CONFIG_BLOCK
Linus Torvalds's avatar
Linus Torvalds committed
1106 1107 1108 1109
/*
 * sget() set callback for block-device-backed supers: record the bdev,
 * take its device number, and hold a reference on its bdi.
 */
static int set_bdev_super(struct super_block *s, void *data)
{
	s->s_bdev = data;
	s->s_dev = s->s_bdev->bd_dev;
	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);

	return 0;
}

/* sget() match callback: does this super sit on the given block device? */
static int test_bdev_super(struct super_block *s, void *data)
{
	return data == (void *)s->s_bdev;
}

Al Viro's avatar
Al Viro committed
1120
/*
 * Generic mount helper for block-device-backed filesystems: open the
 * device exclusively, find or create the matching superblock, and fill
 * a fresh one via @fill_super.  Returns the root dentry or an ERR_PTR.
 */
struct dentry *mount_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct block_device *bdev;
	struct super_block *s;
	fmode_t mode = FMODE_READ | FMODE_EXCL;
	int error = 0;

	if (!(flags & SB_RDONLY))
		mode |= FMODE_WRITE;

	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	/*
	 * once the super is inserted into the list by sget, s_umount
	 * will protect the lockfs code from trying to start a snapshot
	 * while we are mounting
	 */
	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (bdev->bd_fsfreeze_count > 0) {
		/* Refuse to mount a currently-frozen device. */
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		error = -EBUSY;
		goto error_bdev;
	}
	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
		 bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	if (IS_ERR(s))
		goto error_s;

	if (s->s_root) {
		/* Existing super: ro/rw mode must match the new request. */
		if ((flags ^ s->s_flags) & SB_RDONLY) {
			deactivate_locked_super(s);
			error = -EBUSY;
			goto error_bdev;
		}

		/*
		 * s_umount nests inside bd_mutex during
		 * __invalidate_device().  blkdev_put() acquires
		 * bd_mutex and can't be called under s_umount.  Drop
		 * s_umount temporarily.  This is safe as we're
		 * holding an active reference.
		 */
		up_write(&s->s_umount);
		blkdev_put(bdev, mode);
		down_write(&s->s_umount);
	} else {
		/* Fresh super: remember the open mode for kill_block_super(). */
		s->s_mode = mode;
		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
		sb_set_blocksize(s, block_size(bdev));
		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
		if (error) {
			deactivate_locked_super(s);
			goto error;
		}

		s->s_flags |= SB_ACTIVE;
		bdev->bd_super = s;
	}

	return dget(s->s_root);

error_s:
	error = PTR_ERR(s);
error_bdev:
	blkdev_put(bdev, mode);
error:
	return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);

Linus Torvalds's avatar
Linus Torvalds committed
1195 1196 1197
/*
 * Tear down a block-device-backed superblock and release the exclusive
 * device reference taken by mount_bdev().
 */
void kill_block_super(struct super_block *sb)
{
	struct block_device *bdev = sb->s_bdev;
	fmode_t mode = sb->s_mode;	/* saved: sb fields are dead after shutdown */

	bdev->bd_super = NULL;
	generic_shutdown_super(sb);
	sync_blockdev(bdev);
	/* mount_bdev() always opened with FMODE_EXCL; warn if that changed. */
	WARN_ON_ONCE(!(mode & FMODE_EXCL));
	blkdev_put(bdev, mode | FMODE_EXCL);
}

EXPORT_SYMBOL(kill_block_super);
1208
#endif
Linus Torvalds's avatar
Linus Torvalds committed
1209

Al Viro's avatar
Al Viro committed
1210
/*
 * Mount helper for filesystems with no backing device: always allocates
 * a brand-new superblock (no "test" callback) with an anonymous dev_t.
 * Returns the root dentry or an ERR_PTR.
 */
struct dentry *mount_nodev(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	int error;
	struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

	if (IS_ERR(s))
		return ERR_CAST(s);

	error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
	if (error) {
		deactivate_locked_super(s);
		return ERR_PTR(error);
	}
	s->s_flags |= SB_ACTIVE;
	return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);

Linus Torvalds's avatar
Linus Torvalds committed
1230 1231 1232 1233 1234
/* sget() match callback for singleton filesystems: any existing sb matches. */
static int compare_single(struct super_block *s, void *p)
{
	return 1;
}

Al Viro's avatar
Al Viro committed
1235
/*
 * Mount helper for single-instance filesystems: at most one superblock
 * ever exists.  A second mount reuses it and applies the new options via
 * do_remount_sb().  Returns the root dentry or an ERR_PTR.
 */
struct dentry *mount_single(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct super_block *s;
	int error;

	s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
	if (IS_ERR(s))
		return ERR_CAST(s);
	if (!s->s_root) {
		/* First mount: populate the fresh superblock. */
		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
		if (error) {
			deactivate_locked_super(s);
			return ERR_PTR(error);
		}
		s->s_flags |= SB_ACTIVE;
	} else {
		/* Already mounted: treat this as a remount (errors ignored). */
		do_remount_sb(s, flags, data, 0);
	}
	return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);

1259 1260
/*
 * Top-level mount entry: run LSM checks on the mount data, call the
 * filesystem's ->mount(), mark the result SB_BORN, and hand back its
 * root dentry with s_umount released.  Returns an ERR_PTR on failure.
 */
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct dentry *root;
	struct super_block *sb;
	char *secdata = NULL;
	int error = -ENOMEM;

	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
		/* Copy text mount options for the security module. */
		secdata = alloc_secdata();
		if (!secdata)
			goto out;

		error = security_sb_copy_data(data, secdata);
		if (error)
			goto out_free_secdata;
	}

	root = type->mount(type, flags, name, data);
	if (IS_ERR(root)) {
		error = PTR_ERR(root);
		goto out_free_secdata;
	}
	sb = root->d_sb;
	BUG_ON(!sb);
	WARN_ON(!sb->s_bdi);

	/*
	 * Write barrier is for super_cache_count(). We place it before setting
	 * SB_BORN as the data dependency between the two functions is the
	 * superblock structure contents that we just set up, not the SB_BORN
	 * flag.
	 */
	smp_wmb();
	sb->s_flags |= SB_BORN;

	error = security_sb_kern_mount(sb, flags, secdata);
	if (error)
		goto out_sb;

	/*
	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
	 * but s_maxbytes was an unsigned long long for many releases. Throw
	 * this warning for a little while to try and catch filesystems that
	 * violate this rule.
	 */
	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
		"negative value (%lld)\n", type->name, sb->s_maxbytes);

	up_write(&sb->s_umount);
	free_secdata(secdata);
	return root;
out_sb:
	dput(root);
	deactivate_locked_super(sb);
out_free_secdata:
	free_secdata(secdata);
out:
	return ERR_PTR(error);
}

1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336
/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().  The printf-style arguments form the bdi's
 * registered name.  Returns 0 or a negative errno.
 */
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
	struct backing_dev_info *bdi;
	int err;
	va_list args;

	bdi = bdi_alloc(GFP_KERNEL);
	if (!bdi)
		return -ENOMEM;

	bdi->name = sb->s_type->name;

	va_start(args, fmt);
	err = bdi_register_va(bdi, fmt, args);
	va_end(args);
	if (err) {
		bdi_put(bdi);
		return err;
	}
	/* The sb must still be using the default no-op bdi at this point. */
	WARN_ON(sb->s_bdi != &noop_backing_dev_info);
	sb->s_bdi = bdi;

	return 0;
}
EXPORT_SYMBOL(super_setup_bdi_name);

/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().
 */
int super_setup_bdi(struct super_block *sb)
{
	/* Monotonic counter makes each auto-generated bdi name unique. */
	static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

	return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
				    atomic_long_inc_return(&bdi_seq));
}
EXPORT_SYMBOL(super_setup_bdi);

1363 1364 1365 1366 1367 1368
/*
 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
 * instead.
 */
void __sb_end_write(struct super_block *sb, int level)
{
1369
	percpu_up_read(sb->s_writers.rw_sem + level-1);
1370 1371 1372
}
EXPORT_SYMBOL(__sb_end_write);

1373 1374 1375 1376 1377 1378 1379
/*
 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
 * instead.  Returns 1 on success; 0 only in trylock mode (wait == false)
 * when the freeze level could not be acquired.
 */
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
	bool force_trylock = false;
	int ret = 1;

#ifdef CONFIG_LOCKDEP
	/*
	 * We want lockdep to tell us about possible deadlocks with freezing
	 * but it's a bit tricky to properly instrument it. Getting freeze
	 * protection works as getting a read lock but there are subtle
	 * problems. XFS for example gets freeze protection on internal level
	 * twice in some cases, which is OK only because we already hold a
	 * freeze protection also on higher level. Due to these cases we have
	 * to use wait == false (trylock mode) which must not fail.
	 */
	if (wait) {
		int i;

		/* Already holding a lower freeze level? Then don't block. */
		for (i = 0; i < level - 1; i++)
			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
				force_trylock = true;
				break;
			}
	}
#endif
	if (wait && !force_trylock)
		percpu_down_read(sb->s_writers.rw_sem + level-1);
	else
		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);

	/* The forced trylock must succeed (see comment above). */
	WARN_ON(force_trylock && !ret);
	return ret;
}
EXPORT_SYMBOL(__sb_start_write);

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
1418
 * system.
1419 1420 1421
 */
static void sb_wait_write(struct super_block *sb, int level)
{
1422 1423
	percpu_down_write(sb->s_writers.rw_sem + level-1);
}
1424

1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440
/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
	int level = SB_FREEZE_LEVELS;

	while (--level >= 0)
		percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
1441 1442
{
	int level;
1443

1444 1445
	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
1446 1447 1448 1449 1450
}

static void sb_freeze_unlock(struct super_block *sb)
{
	int level;
1451

1452 1453
	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
		percpu_up_write(sb->s_writers.rw_sem + level);
1454 1455
}

1456
/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 */
int freeze_super(struct super_block *sb)
{
	int ret;

	atomic_inc(&sb->s_active);
	down_write(&sb->s_umount);
	if (sb->s_writers.frozen != SB_UNFROZEN) {
		/* Already frozen (or freezing); drop our active ref. */
		deactivate_locked_super(sb);
		return -EBUSY;
	}

	if (!(sb->s_flags & SB_BORN)) {
		up_write(&sb->s_umount);
		return 0;	/* sic - it's "nothing to do" */
	}

	if (sb_rdonly(sb)) {
		/* Nothing to do really... */
		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
		up_write(&sb->s_umount);
		return 0;
	}

	sb->s_writers.frozen = SB_FREEZE_WRITE;
	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
	up_write(&sb->s_umount);
	sb_wait_write(sb, SB_FREEZE_WRITE);
	down_write(&sb->s_umount);

	/* Now we go and block page faults... */
	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

	/* All writers are done so after syncing there won't be dirty data */
	sync_filesystem(sb);

	/* Now wait for internal filesystem counter */
	sb->s_writers.frozen = SB_FREEZE_FS;
	sb_wait_write(sb, SB_FREEZE_FS);

	if (sb->s_op->freeze_fs) {
		ret = sb->s_op->freeze_fs(sb);
		if (ret) {
			printk(KERN_ERR
				"VFS:Filesystem freeze failed\n");
			/* Roll back: unfreeze all levels and wake waiters. */
			sb->s_writers.frozen = SB_UNFROZEN;
			sb_freeze_unlock(sb);
			wake_up(&sb->s_writers.wait_unfrozen);
			deactivate_locked_super(sb);
			return ret;
		}
	}
	/*
	 * For debugging purposes so that fs can warn if it sees write activity
	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
	 */
	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
	lockdep_sb_freeze_release(sb);
	up_write(&sb->s_umount);
	return 0;
}
EXPORT_SYMBOL(freeze_super);

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super().
 */
/* Core of thaw_super(); called with s_umount held for write, drops it. */
static int thaw_super_locked(struct super_block *sb)
{
	int error;

	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
		up_write(&sb->s_umount);
		return -EINVAL;
	}

	if (sb_rdonly(sb)) {
		/* Read-only freeze took no write locks; nothing to unlock. */
		sb->s_writers.frozen = SB_UNFROZEN;
		goto out;
	}

	/* Reclaim lockdep ownership released by freeze_super(). */
	lockdep_sb_freeze_acquire(sb);

	if (sb->s_op->unfreeze_fs) {
		error = sb->s_op->unfreeze_fs(sb);
		if (error) {
			printk(KERN_ERR
				"VFS:Filesystem thaw failed\n");
			/* Stay frozen: give lockdep ownership back. */
			lockdep_sb_freeze_release(sb);
			up_write(&sb->s_umount);
			return error;
		}
	}

	sb->s_writers.frozen = SB_UNFROZEN;
	sb_freeze_unlock(sb);
out:
	wake_up(&sb->s_writers.wait_unfrozen);
	deactivate_locked_super(sb);	/* drops s_umount and the freeze ref */
	return 0;
}
1592 1593 1594 1595 1596 1597

/* Take s_umount and hand it to thaw_super_locked(), which drops it. */
int thaw_super(struct super_block *sb)
{
	down_write(&sb->s_umount);
	return thaw_super_locked(sb);
}
EXPORT_SYMBOL(thaw_super);