/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>		/* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include "internal.h"


static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);

static char *sb_writers_name[SB_FREEZE_LEVELS] = {
	"sb_writers",
	"sb_pagefaults",
	"sb_internal",
};

/*
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
 * take a passive reference to the superblock to avoid this from occurring.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	struct super_block *sb;
	long	fs_objects = 0;
	long	total_objects;
	long	freed = 0;
	long	dentries;
	long	inodes;

	sb = container_of(shrink, struct super_block, s_shrink);

	/*
	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
	 * to recurse into the FS that called us in clear_inode() and friends..
	 */
	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;

	if (!trylock_super(sb))
		return SHRINK_STOP;

	if (sb->s_op->nr_cached_objects)
		fs_objects = sb->s_op->nr_cached_objects(sb, sc);

	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
	total_objects = dentries + inodes + fs_objects + 1;
	if (!total_objects)
		total_objects = 1;

	/* proportion the scan between the caches */
	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

	/*
	 * prune the dcache first as the icache is pinned by it, then
	 * prune the icache, followed by the filesystem specific caches
	 *
	 * Ensure that we always scan at least one object - memcg kmem
	 * accounting uses this to fully empty the caches.
	 */
	sc->nr_to_scan = dentries + 1;
	freed = prune_dcache_sb(sb, sc);
	sc->nr_to_scan = inodes + 1;
	freed += prune_icache_sb(sb, sc);

	if (fs_objects) {
		sc->nr_to_scan = fs_objects + 1;
		freed += sb->s_op->free_cached_objects(sb, sc);
	}

	up_read(&sb->s_umount);
	return freed;
}

static unsigned long super_cache_count(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	struct super_block *sb;
	long	total_objects = 0;

	sb = container_of(shrink, struct super_block, s_shrink);

	/*
	 * Don't call trylock_super as it is a potential
	 * scalability bottleneck. The counts could get updated
	 * between super_cache_count and super_cache_scan anyway.
	 * Calling super_cache_count() with shrinker_rwsem held
	 * ensures the safety of the calls to list_lru_shrink_count() and
	 * s_op->nr_cached_objects().
	 */
	if (sb->s_op && sb->s_op->nr_cached_objects)
		total_objects = sb->s_op->nr_cached_objects(sb, sc);

	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

	total_objects = vfs_pressure_ratio(total_objects);
	return total_objects;
}

static void destroy_super_work(struct work_struct *work)
{
	struct super_block *s = container_of(work, struct super_block,
							destroy_work);
	int i;

	for (i = 0; i < SB_FREEZE_LEVELS; i++)
		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
	kfree(s);
}

static void destroy_super_rcu(struct rcu_head *head)
{
	struct super_block *s = container_of(head, struct super_block, rcu);
	INIT_WORK(&s->destroy_work, destroy_super_work);
	schedule_work(&s->destroy_work);
}

/**
 *	destroy_super	-	frees a superblock
 *	@s: superblock to free
 *
 *	Frees a superblock.
 */
static void destroy_super(struct super_block *s)
{
	list_lru_destroy(&s->s_dentry_lru);
	list_lru_destroy(&s->s_inode_lru);
	security_sb_free(s);
	WARN_ON(!list_empty(&s->s_mounts));
	put_user_ns(s->s_user_ns);
	kfree(s->s_subtype);
	kfree(s->s_options);
	call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 *	alloc_super	-	create new superblock
 *	@type:	filesystem type superblock should belong to
 *	@flags: the mount flags
 *	@user_ns: User namespace for the super_block
 *
 *	Allocates and initializes a new &struct super_block.  alloc_super()
 *	returns a pointer to a new superblock or %NULL if allocation failed.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
				       struct user_namespace *user_ns)
{
	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
	static const struct super_operations default_op;
	int i;

	if (!s)
		return NULL;

	INIT_LIST_HEAD(&s->s_mounts);
	s->s_user_ns = get_user_ns(user_ns);

	if (security_sb_alloc(s))
		goto fail;

	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
					sb_writers_name[i],
					&type->s_writers_key[i]))
			goto fail;
	}
	init_waitqueue_head(&s->s_writers.wait_unfrozen);
	s->s_bdi = &noop_backing_dev_info;
	s->s_flags = flags;
	if (s->s_user_ns != &init_user_ns)
		s->s_iflags |= SB_I_NODEV;
	INIT_HLIST_NODE(&s->s_instances);
	INIT_HLIST_BL_HEAD(&s->s_anon);
	mutex_init(&s->s_sync_lock);
	INIT_LIST_HEAD(&s->s_inodes);
	spin_lock_init(&s->s_inode_list_lock);
	INIT_LIST_HEAD(&s->s_inodes_wb);
	spin_lock_init(&s->s_inode_wblist_lock);

	if (list_lru_init_memcg(&s->s_dentry_lru))
		goto fail;
	if (list_lru_init_memcg(&s->s_inode_lru))
		goto fail;

	init_rwsem(&s->s_umount);
	lockdep_set_class(&s->s_umount, &type->s_umount_key);
	/*
	 * sget() can have s_umount recursion.
	 *
	 * When it cannot find a suitable sb, it allocates a new
	 * one (this one), and tries again to find a suitable old
	 * one.
	 *
	 * In case that succeeds, it will acquire the s_umount
	 * lock of the old one. Since these are clearly distinct
	 * locks, and this object isn't exposed yet, there's no
	 * risk of deadlocks.
	 *
	 * Annotate this by putting this lock in a different
	 * subclass.
	 */
	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
	s->s_count = 1;
	atomic_set(&s->s_active, 1);
	mutex_init(&s->s_vfs_rename_mutex);
	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
	mutex_init(&s->s_dquot.dqio_mutex);
	s->s_maxbytes = MAX_NON_LFS;
	s->s_op = &default_op;
	s->s_time_gran = 1000000000;
	s->cleancache_poolid = CLEANCACHE_NO_POOL;

	s->s_shrink.seeks = DEFAULT_SEEKS;
	s->s_shrink.scan_objects = super_cache_scan;
	s->s_shrink.count_objects = super_cache_count;
	s->s_shrink.batch = 1024;
	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
	return s;

fail:
	destroy_super(s);
	return NULL;
}

/* Superblock refcounting  */

/*
 *	Drop a superblock's refcount.  The caller must hold sb_lock.
 */
static void __put_super(struct super_block *sb)
{
	if (!--sb->s_count) {
		list_del_init(&sb->s_list);
		destroy_super(sb);
	}
}

/**
 *	put_super	-	drop a temporary reference to superblock
 *	@sb: superblock in question
 *
 *	Drops a temporary reference, frees superblock if there are no
 *	references left.
 */
static void put_super(struct super_block *sb)
{
	spin_lock(&sb_lock);
	__put_super(sb);
	spin_unlock(&sb_lock);
}


/**
 *	deactivate_locked_super	-	drop an active reference to superblock
 *	@s: superblock to deactivate
 *
 *	Drops an active reference to superblock, converting it into a temporary
 *	one if there are no other active references left.  In that case we
 *	tell fs driver to shut it down and drop the temporary reference we
 *	had just acquired.
 *
 *	Caller holds exclusive lock on superblock; that lock is released.
 */
void deactivate_locked_super(struct super_block *s)
{
	struct file_system_type *fs = s->s_type;
	if (atomic_dec_and_test(&s->s_active)) {
		cleancache_invalidate_fs(s);
		unregister_shrinker(&s->s_shrink);
		fs->kill_sb(s);

		/*
		 * Since list_lru_destroy() may sleep, we cannot call it from
		 * put_super(), where we hold the sb_lock. Therefore we destroy
		 * the lru lists right now.
		 */
		list_lru_destroy(&s->s_dentry_lru);
		list_lru_destroy(&s->s_inode_lru);

		put_filesystem(fs);
		put_super(s);
	} else {
		up_write(&s->s_umount);
	}
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 *	deactivate_super	-	drop an active reference to superblock
 *	@s: superblock to deactivate
 *
 *	Variant of deactivate_locked_super(), except that superblock is *not*
 *	locked by caller.  If we are going to drop the final active reference,
 *	lock will be acquired prior to that.
 */
void deactivate_super(struct super_block *s)
{
	if (!atomic_add_unless(&s->s_active, -1, 1)) {
		down_write(&s->s_umount);
		deactivate_locked_super(s);
	}
}

EXPORT_SYMBOL(deactivate_super);

/**
 *	grab_super - acquire an active reference
 *	@s: reference we are trying to make active
 *
 *	Tries to acquire an active reference.  grab_super() is used when we
 *	had just found a superblock in super_blocks or fs_type->fs_supers
 *	and want to turn it into a full-blown active reference.  grab_super()
 *	is called with sb_lock held and drops it.  Returns 1 in case of
 *	success, 0 if we had failed (superblock contents were already dead or
 *	dying when grab_super() had been called).  Note that this is only
 *	called for superblocks not in rundown mode (== ones still on ->fs_supers
 *	of their type), so increment of ->s_count is OK here.
 */
static int grab_super(struct super_block *s) __releases(sb_lock)
{
	s->s_count++;
	spin_unlock(&sb_lock);
	down_write(&s->s_umount);
	if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
		put_super(s);
		return 1;
	}
	up_write(&s->s_umount);
	put_super(s);
	return 0;
}

/*
 *	trylock_super - try to grab ->s_umount shared
 *	@sb: reference we are trying to grab
 *
 *	Try to prevent fs shutdown.  This is used in places where we
 *	cannot take an active reference but we need to ensure that the
 *	filesystem is not shut down while we are working on it. It returns
 *	false if we cannot acquire s_umount or if we lose the race and
 *	filesystem already got into shutdown, and returns true with the s_umount
 *	lock held in read mode in case of success. On successful return,
 *	the caller must drop the s_umount lock when done.
 *
 *	Note that unlike get_super() et al. this one does *not* bump ->s_count.
 *	The reason why it's safe is that we are OK with doing trylock instead
 *	of down_read().  There's a couple of places that are OK with that, but
 *	it's very much not a general-purpose interface.
 */
bool trylock_super(struct super_block *sb)
{
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & MS_BORN))
			return true;
		up_read(&sb->s_umount);
	}

	return false;
}
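
/*
 * Illustrative sketch (not part of the original file): a typical
 * trylock_super() caller, modelled on super_cache_scan() above.
 * The function name example_sb_walker is hypothetical.
 */
#if 0
static void example_sb_walker(struct super_block *sb)
{
	if (!trylock_super(sb))
		return;			/* fs is shutting down or lock is contended */
	/* ... inspect per-sb state while shutdown is blocked ... */
	up_read(&sb->s_umount);		/* trylock_super() took s_umount shared */
}
#endif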

/**
 *	generic_shutdown_super	-	common helper for ->kill_sb()
 *	@sb: superblock to kill
 *
 *	generic_shutdown_super() does all fs-independent work on superblock
 *	shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *	that need destruction out of superblock, call generic_shutdown_super()
 *	and release aforementioned objects.  Note: dentries and inodes _are_
 *	taken care of and do not need specific handling.
 *
 *	Upon calling this function, the filesystem may no longer alter or
 *	rearrange the set of dentries belonging to this super_block, nor may it
 *	change the attachments of dentries to inodes.
 */
void generic_shutdown_super(struct super_block *sb)
{
	const struct super_operations *sop = sb->s_op;

	if (sb->s_root) {
		shrink_dcache_for_umount(sb);
		sync_filesystem(sb);
		sb->s_flags &= ~MS_ACTIVE;

		fsnotify_unmount_inodes(sb);
		cgroup_writeback_umount();

		evict_inodes(sb);

		if (sb->s_dio_done_wq) {
			destroy_workqueue(sb->s_dio_done_wq);
			sb->s_dio_done_wq = NULL;
		}

		if (sop->put_super)
			sop->put_super(sb);

		if (!list_empty(&sb->s_inodes)) {
			printk("VFS: Busy inodes after unmount of %s. "
			   "Self-destruct in 5 seconds.  Have a nice day...\n",
			   sb->s_id);
		}
	}
	spin_lock(&sb_lock);
	/* should be initialized for __put_super_and_need_restart() */
	hlist_del_init(&sb->s_instances);
	spin_unlock(&sb_lock);
	up_write(&sb->s_umount);
	if (sb->s_iflags & SB_I_DYNBDI) {
		bdi_put(sb->s_bdi);
		sb->s_bdi = &noop_backing_dev_info;
		sb->s_iflags &= ~SB_I_DYNBDI;
	}
}

EXPORT_SYMBOL(generic_shutdown_super);

/**
 *	sget_userns -	find or create a superblock
 *	@type:	filesystem type superblock should belong to
 *	@test:	comparison callback
 *	@set:	setup callback
 *	@flags:	mount flags
 *	@user_ns: User namespace for the super_block
 *	@data:	argument to each of them
 */
struct super_block *sget_userns(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			int flags, struct user_namespace *user_ns,
			void *data)
{
	struct super_block *s = NULL;
	struct super_block *old;
	int err;

	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
	    !(type->fs_flags & FS_USERNS_MOUNT) &&
	    !capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);
retry:
	spin_lock(&sb_lock);
	if (test) {
		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
			if (!test(old, data))
				continue;
			if (user_ns != old->s_user_ns) {
				spin_unlock(&sb_lock);
				if (s) {
					up_write(&s->s_umount);
					destroy_super(s);
				}
				return ERR_PTR(-EBUSY);
			}
			if (!grab_super(old))
				goto retry;
			if (s) {
				up_write(&s->s_umount);
				destroy_super(s);
				s = NULL;
			}
			return old;
		}
	}
	if (!s) {
		spin_unlock(&sb_lock);
		s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
		if (!s)
			return ERR_PTR(-ENOMEM);
		goto retry;
	}

	err = set(s, data);
	if (err) {
		spin_unlock(&sb_lock);
		up_write(&s->s_umount);
		destroy_super(s);
		return ERR_PTR(err);
	}
	s->s_type = type;
	strlcpy(s->s_id, type->name, sizeof(s->s_id));
	list_add_tail(&s->s_list, &super_blocks);
	hlist_add_head(&s->s_instances, &type->fs_supers);
	spin_unlock(&sb_lock);
	get_filesystem(type);
	register_shrinker(&s->s_shrink);
	return s;
}

EXPORT_SYMBOL(sget_userns);

/**
 *	sget	-	find or create a superblock
 *	@type:	  filesystem type superblock should belong to
 *	@test:	  comparison callback
 *	@set:	  setup callback
 *	@flags:	  mount flags
 *	@data:	  argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			int flags,
			void *data)
{
	struct user_namespace *user_ns = current_user_ns();

	/* We don't yet pass the user namespace of the parent
	 * mount through to here so always use &init_user_ns
	 * until that changes.
	 */
	if (flags & MS_SUBMOUNT)
		user_ns = &init_user_ns;

	/* Ensure the requestor has permissions over the target filesystem */
	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	return sget_userns(type, test, set, flags, user_ns, data);
}

EXPORT_SYMBOL(sget);
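
/*
 * Illustrative sketch (not part of the original file): how a ->mount()
 * implementation typically drives sget().  Compare mount_nodev() and
 * mount_bdev() below; the example_* names here are hypothetical.
 */
#if 0
static int example_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;			/* stash private mount info */
	return set_anon_super(s, NULL);		/* allocate an anonymous dev_t */
}

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, void *data)
{
	struct super_block *s;

	/* NULL test callback: always allocate a fresh superblock */
	s = sget(fs_type, NULL, example_set_super, flags, data);
	if (IS_ERR(s))
		return ERR_CAST(s);
	/* ... fill the superblock, set MS_ACTIVE, then hand back the root ... */
	return dget(s->s_root);
}
#endif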

void drop_super(struct super_block *sb)
{
	up_read(&sb->s_umount);
	put_super(sb);
}

EXPORT_SYMBOL(drop_super);

void drop_super_exclusive(struct super_block *sb)
{
	up_write(&sb->s_umount);
	put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

/**
 *	iterate_supers - call function for all active superblocks
 *	@f: function to call
 *	@arg: argument to pass to it
 *
 *	Scans the superblock list and calls given function, passing it
 *	locked superblock and given argument.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		if (sb->s_root && (sb->s_flags & MS_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}
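
/*
 * Illustrative sketch (not part of the original file): a caller of
 * iterate_supers().  The callback receives each live superblock with
 * s_umount held shared.  The example_* names are hypothetical.
 */
#if 0
static void example_count_private_bdi(struct super_block *sb, void *arg)
{
	unsigned long *count = arg;

	if (sb->s_bdi != &noop_backing_dev_info)
		(*count)++;		/* just an example of per-sb work */
}

static unsigned long example_walk_all_supers(void)
{
	unsigned long count = 0;

	iterate_supers(example_count_private_bdi, &count);
	return count;
}
#endif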

/**
 *	iterate_supers_type - call function for superblocks of given type
 *	@type: fs type
 *	@f: function to call
 *	@arg: argument to pass to it
 *
 *	Scans the superblock list and calls given function, passing it
 *	locked superblock and given argument.
 */
void iterate_supers_type(struct file_system_type *type,
	void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		if (sb->s_root && (sb->s_flags & MS_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

static struct super_block *__get_super(struct block_device *bdev, bool excl)
{
	struct super_block *sb;

	if (!bdev)
		return NULL;

	spin_lock(&sb_lock);
rescan:
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_bdev == bdev) {
			sb->s_count++;
			spin_unlock(&sb_lock);
			if (!excl)
				down_read(&sb->s_umount);
			else
				down_write(&sb->s_umount);
			/* still alive? */
			if (sb->s_root && (sb->s_flags & MS_BORN))
				return sb;
			if (!excl)
				up_read(&sb->s_umount);
			else
				up_write(&sb->s_umount);
			/* nope, got unmounted */
			spin_lock(&sb_lock);
			__put_super(sb);
			goto rescan;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}

/**
 *	get_super - get the superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device given. %NULL is returned if no match is found.
 */
struct super_block *get_super(struct block_device *bdev)
{
	return __get_super(bdev, false);
}
EXPORT_SYMBOL(get_super);

static struct super_block *__get_super_thawed(struct block_device *bdev,
					      bool excl)
{
	while (1) {
		struct super_block *s = __get_super(bdev, excl);
		if (!s || s->s_writers.frozen == SB_UNFROZEN)
			return s;
		if (!excl)
			up_read(&s->s_umount);
		else
			up_write(&s->s_umount);
		wait_event(s->s_writers.wait_unfrozen,
			   s->s_writers.frozen == SB_UNFROZEN);
		put_super(s);
	}
}

/**
 *	get_super_thawed - get thawed superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device. The superblock is returned once it is thawed
 *	(or immediately if it was not frozen). %NULL is returned if no match
 *	is found.
 */
struct super_block *get_super_thawed(struct block_device *bdev)
{
	return __get_super_thawed(bdev, false);
}
EXPORT_SYMBOL(get_super_thawed);

/**
 *	get_super_exclusive_thawed - get thawed superblock of a device
 *	@bdev: device to get the superblock for
 *
 *	Scans the superblock list and finds the superblock of the file system
 *	mounted on the device. The superblock is returned once it is thawed
 *	(or immediately if it was not frozen) and s_umount semaphore is held
 *	in exclusive mode. %NULL is returned if no match is found.
 */
struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
{
	return __get_super_thawed(bdev, true);
}
EXPORT_SYMBOL(get_super_exclusive_thawed);

/**
 * get_active_super - get an active reference to the superblock of a device
 * @bdev: device to get the superblock for
 *
 * Scans the superblock list and finds the superblock of the file system
 * mounted on the device given.  Returns the superblock with an active
 * reference or %NULL if none was found.
 */
struct super_block *get_active_super(struct block_device *bdev)
{
	struct super_block *sb;

	if (!bdev)
		return NULL;

restart:
	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_bdev == bdev) {
			if (!grab_super(sb))
				goto restart;
			up_write(&sb->s_umount);
			return sb;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}

struct super_block *user_get_super(dev_t dev)
{
	struct super_block *sb;

	spin_lock(&sb_lock);
rescan:
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		if (sb->s_dev ==  dev) {
			sb->s_count++;
			spin_unlock(&sb_lock);
			down_read(&sb->s_umount);
			/* still alive? */
			if (sb->s_root && (sb->s_flags & MS_BORN))
				return sb;
			up_read(&sb->s_umount);
			/* nope, got unmounted */
			spin_lock(&sb_lock);
			__put_super(sb);
			goto rescan;
		}
	}
	spin_unlock(&sb_lock);
	return NULL;
}

/**
 *	do_remount_sb - asks filesystem to change mount options.
 *	@sb:	superblock in question
 *	@flags:	numeric part of options
 *	@data:	the rest of options
 *	@force: whether or not to force the change
 *
 *	Alters the mount options of a mounted file system.
 */
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
	int retval;
	int remount_ro;

	if (sb->s_writers.frozen != SB_UNFROZEN)
		return -EBUSY;

#ifdef CONFIG_BLOCK
	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
		return -EACCES;
#endif

	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);

	if (remount_ro) {
		if (!hlist_empty(&sb->s_pins)) {
			up_write(&sb->s_umount);
			group_pin_kill(&sb->s_pins);
			down_write(&sb->s_umount);
			if (!sb->s_root)
				return 0;
			if (sb->s_writers.frozen != SB_UNFROZEN)
				return -EBUSY;
			remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
		}
	}
	shrink_dcache_sb(sb);

	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
	if (remount_ro) {
		if (force) {
			sb->s_readonly_remount = 1;
			smp_wmb();
		} else {
			retval = sb_prepare_remount_readonly(sb);
			if (retval)
				return retval;
		}
	}

	if (sb->s_op->remount_fs) {
		retval = sb->s_op->remount_fs(sb, &flags, data);
		if (retval) {
			if (!force)
				goto cancel_readonly;
			/* If forced remount, go ahead despite any errors */
			WARN(1, "forced remount of a %s fs returned %i\n",
			     sb->s_type->name, retval);
		}
	}
	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
	/* Needs to be ordered wrt mnt_is_readonly() */
	smp_wmb();
	sb->s_readonly_remount = 0;

	/*
	 * Some filesystems modify their metadata via some other path than the
	 * bdev buffer cache (eg. use a private mapping, or directories in
	 * pagecache, etc). Also file data modifications go via their own
	 * mappings. So if we try to mount readonly then copy the filesystem
	 * from bdev, we could get stale data, so invalidate it to give a best
	 * effort at coherency.
	 */
	if (remount_ro && sb->s_bdev)
		invalidate_bdev(sb->s_bdev);
	return 0;

cancel_readonly:
	sb->s_readonly_remount = 0;
	return retval;
}

static void do_emergency_remount(struct work_struct *work)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		sb->s_count++;
		spin_unlock(&sb_lock);
		down_write(&sb->s_umount);
		if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
		    !(sb->s_flags & MS_RDONLY)) {
			/*
			 * What lock protects sb->s_flags??
			 */
			do_remount_sb(sb, MS_RDONLY, NULL, 1);
		}
		up_write(&sb->s_umount);
		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
	kfree(work);
	printk("Emergency Remount complete\n");
}

void emergency_remount(void)
{
	struct work_struct *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_emergency_remount);
		schedule_work(work);
	}
}

/*
 * Unnamed block devices are dummy devices used by virtual
 * filesystems which don't use real block-devices.  -- jrs
 */

static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
/* Many userspace utilities consider an FSID of 0 invalid.
 * Always return at least 1 from get_anon_bdev.
 */
static int unnamed_dev_start = 1;

int get_anon_bdev(dev_t *p)
{
	int dev;
	int error;

 retry:
	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
		return -ENOMEM;
	spin_lock(&unnamed_dev_lock);
	error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
	if (!error)
		unnamed_dev_start = dev + 1;
	spin_unlock(&unnamed_dev_lock);
	if (error == -EAGAIN)
		/* We raced and lost with another CPU. */
		goto retry;
	else if (error)
		return -EAGAIN;

	if (dev >= (1 << MINORBITS)) {
		spin_lock(&unnamed_dev_lock);
		ida_remove(&unnamed_dev_ida, dev);
		if (unnamed_dev_start > dev)
			unnamed_dev_start = dev;
		spin_unlock(&unnamed_dev_lock);
		return -EMFILE;
	}
	*p = MKDEV(0, dev & MINORMASK);
	return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

void free_anon_bdev(dev_t dev)
{
	int slot = MINOR(dev);
	spin_lock(&unnamed_dev_lock);
	ida_remove(&unnamed_dev_ida, slot);
	if (slot < unnamed_dev_start)
		unnamed_dev_start = slot;
	spin_unlock(&unnamed_dev_lock);
}
EXPORT_SYMBOL(free_anon_bdev);

int set_anon_super(struct super_block *s, void *data)
{
	return get_anon_bdev(&s->s_dev);
}

EXPORT_SYMBOL(set_anon_super);

void kill_anon_super(struct super_block *sb)
{
	dev_t dev = sb->s_dev;
	generic_shutdown_super(sb);
	free_anon_bdev(dev);
}

EXPORT_SYMBOL(kill_anon_super);

void kill_litter_super(struct super_block *sb)
{
	if (sb->s_root)
		d_genocide(sb->s_root);
	kill_anon_super(sb);
}

EXPORT_SYMBOL(kill_litter_super);

static int ns_test_super(struct super_block *sb, void *data)
{
	return sb->s_fs_info == data;
}

static int ns_set_super(struct super_block *sb, void *data)
{
	sb->s_fs_info = data;
	return set_anon_super(sb, NULL);
}

struct dentry *mount_ns(struct file_system_type *fs_type,
	int flags, void *data, void *ns, struct user_namespace *user_ns,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct super_block *sb;

	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
	 * over the namespace.
	 */
	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
			 user_ns, ns);
	if (IS_ERR(sb))
		return ERR_CAST(sb);

	if (!sb->s_root) {
		int err;
		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(sb);
			return ERR_PTR(err);
		}

		sb->s_flags |= MS_ACTIVE;
	}

	return dget(sb->s_root);
}

EXPORT_SYMBOL(mount_ns);

#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
	s->s_bdev = data;
	s->s_dev = s->s_bdev->bd_dev;

	/*
	 * We set the bdi here to the queue backing, file systems can
	 * overwrite this in ->fill_super()
	 */
	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
	return 0;
}

static int test_bdev_super(struct super_block *s, void *data)
{
	return (void *)s->s_bdev == data;
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct block_device *bdev;
	struct super_block *s;
	fmode_t mode = FMODE_READ | FMODE_EXCL;
	int error = 0;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;

	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	/*
	 * once the super is inserted into the list by sget, s_umount
	 * will protect the lockfs code from trying to start a snapshot
	 * while we are mounting
	 */
	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (bdev->bd_fsfreeze_count > 0) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		error = -EBUSY;
		goto error_bdev;
	}
	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
		 bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	if (IS_ERR(s))
		goto error_s;

	if (s->s_root) {
		if ((flags ^ s->s_flags) & MS_RDONLY) {
			deactivate_locked_super(s);
			error = -EBUSY;
			goto error_bdev;
		}

		/*
		 * s_umount nests inside bd_mutex during
		 * __invalidate_device().  blkdev_put() acquires
		 * bd_mutex and can't be called under s_umount.  Drop
		 * s_umount temporarily.  This is safe as we're
		 * holding an active reference.
		 */
		up_write(&s->s_umount);
		blkdev_put(bdev, mode);
		down_write(&s->s_umount);
	} else {
		s->s_mode = mode;
		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
		sb_set_blocksize(s, block_size(bdev));
		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
		if (error) {
			deactivate_locked_super(s);
			goto error;
		}

		s->s_flags |= MS_ACTIVE;
		bdev->bd_super = s;
	}

	return dget(s->s_root);

error_s:
	error = PTR_ERR(s);
error_bdev:
	blkdev_put(bdev, mode);
error:
	return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);
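
/*
 * Illustrative sketch (not part of the original file): the usual way a
 * block-device filesystem wires mount_bdev() and kill_block_super()
 * (defined just below) into its file_system_type.  The examplefs_* names
 * are hypothetical, and examplefs_fill_super is assumed to exist.
 */
#if 0
static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
#endif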

void kill_block_super(struct super_block *sb)
{
	struct block_device *bdev = sb->s_bdev;
	fmode_t mode = sb->s_mode;

	bdev->bd_super = NULL;
	generic_shutdown_super(sb);
	sync_blockdev(bdev);
	WARN_ON_ONCE(!(mode & FMODE_EXCL));
	blkdev_put(bdev, mode | FMODE_EXCL);
}

EXPORT_SYMBOL(kill_block_super);
#endif

struct dentry *mount_nodev(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	int error;
	struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

	if (IS_ERR(s))
		return ERR_CAST(s);

	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
	if (error) {
		deactivate_locked_super(s);
		return ERR_PTR(error);
	}
	s->s_flags |= MS_ACTIVE;
	return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);
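
/*
 * Illustrative sketch (not part of the original file): a memory-backed
 * filesystem typically pairs mount_nodev() with kill_litter_super()
 * (defined above).  The ramexample_* names are hypothetical, and
 * ramexample_fill_super is assumed to exist.
 */
#if 0
static struct dentry *ramexample_mount(struct file_system_type *fs_type,
				       int flags, const char *dev_name,
				       void *data)
{
	return mount_nodev(fs_type, flags, data, ramexample_fill_super);
}

static struct file_system_type ramexample_fs_type = {
	.name		= "ramexample",
	.mount		= ramexample_mount,
	.kill_sb	= kill_litter_super,	/* drops the dcache "litter" first */
};
#endif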

static int compare_single(struct super_block *s, void *p)
{
	return 1;
}

struct dentry *mount_single(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct super_block *s;
	int error;

	s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
	if (IS_ERR(s))
		return ERR_CAST(s);
	if (!s->s_root) {
		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
		if (error) {
			deactivate_locked_super(s);
			return ERR_PTR(error);
		}
		s->s_flags |= MS_ACTIVE;
	} else {
		do_remount_sb(s, flags, data, 0);
	}
	return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);

struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct dentry *root;
	struct super_block *sb;
	char *secdata = NULL;
	int error = -ENOMEM;

	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
		secdata = alloc_secdata();
		if (!secdata)
			goto out;

		error = security_sb_copy_data(data, secdata);
		if (error)
			goto out_free_secdata;
	}

	root = type->mount(type, flags, name, data);
	if (IS_ERR(root)) {
		error = PTR_ERR(root);
		goto out_free_secdata;
	}
	sb = root->d_sb;
	BUG_ON(!sb);
	WARN_ON(!sb->s_bdi);
	sb->s_flags |= MS_BORN;

	error = security_sb_kern_mount(sb, flags, secdata);
	if (error)
		goto out_sb;

	/*
	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
	 * but s_maxbytes was an unsigned long long for many releases. Throw
	 * this warning for a little while to try and catch filesystems that
	 * violate this rule.
	 */
	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
		"negative value (%lld)\n", type->name, sb->s_maxbytes);

	up_write(&sb->s_umount);
	free_secdata(secdata);
	return root;
out_sb:
	dput(root);
	deactivate_locked_super(sb);
out_free_secdata:
	free_secdata(secdata);
out:
	return ERR_PTR(error);
}

/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().
 */
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
	struct backing_dev_info *bdi;
	int err;
	va_list args;

	bdi = bdi_alloc(GFP_KERNEL);
	if (!bdi)
		return -ENOMEM;

	bdi->name = sb->s_type->name;

	va_start(args, fmt);
	err = bdi_register_va(bdi, NULL, fmt, args);
	va_end(args);
	if (err) {
		bdi_put(bdi);
		return err;
	}
	WARN_ON(sb->s_bdi != &noop_backing_dev_info);
	sb->s_bdi = bdi;
	sb->s_iflags |= SB_I_DYNBDI;

	return 0;
}
EXPORT_SYMBOL(super_setup_bdi_name);

/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().
 */
int super_setup_bdi(struct super_block *sb)
{
	static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

	return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
				    atomic_long_inc_return(&bdi_seq));
}
EXPORT_SYMBOL(super_setup_bdi);
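
/*
 * Illustrative sketch (not part of the original file): a ->fill_super()
 * that sets up a private bdi via super_setup_bdi().  The cleanup happens
 * automatically in generic_shutdown_super(), as noted above.  The
 * examplefs_* name is hypothetical.
 */
#if 0
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	int err;

	err = super_setup_bdi(sb);	/* sb->s_bdi now points at a private bdi */
	if (err)
		return err;
	/* ... allocate the root inode, set sb->s_root, etc. ... */
	return 0;
}
#endif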

/*
 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
 * instead.
 */
void __sb_end_write(struct super_block *sb, int level)
{
	percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);

/*
 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
	bool force_trylock = false;
	int ret = 1;

#ifdef CONFIG_LOCKDEP
	/*
	 * We want lockdep to tell us about possible deadlocks with freezing
	 * but it's a bit tricky to properly instrument it. Getting a freeze
	 * protection works as getting a read lock but there are subtle
	 * problems. XFS for example gets freeze protection on internal level
	 * twice in some cases, which is OK only because we already hold a
	 * freeze protection also on higher level. Due to these cases we have
	 * to use wait == false (trylock mode) which must not fail.
	 */
	if (wait) {
		int i;

		for (i = 0; i < level - 1; i++)
			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
				force_trylock = true;
				break;
			}
	}
#endif
	if (wait && !force_trylock)
		percpu_down_read(sb->s_writers.rw_sem + level-1);
	else
		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);

	WARN_ON(force_trylock && !ret);
	return ret;
}
EXPORT_SYMBOL(__sb_start_write);
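
/*
 * Illustrative sketch (not part of the original file): filesystems do not
 * call __sb_start_write()/__sb_end_write() directly; they use the
 * sb_start_write()/sb_end_write() wrappers (declared in linux/fs.h) around
 * modifications that must not race with a freeze.  The examplefs_* name is
 * hypothetical.
 */
#if 0
static int examplefs_modify(struct super_block *sb)
{
	sb_start_write(sb);	/* blocks while the fs is frozen */
	/* ... perform the modification ... */
	sb_end_write(sb);
	return 0;
}
#endif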

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
 * system.
 */
static void sb_wait_write(struct super_block *sb, int level)
{
	percpu_down_write(sb->s_writers.rw_sem + level-1);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
	int level;

	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
		percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
	int level;

	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

static void sb_freeze_unlock(struct super_block *sb)
{
	int level;

	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
		percpu_up_write(sb->s_writers.rw_sem + level);
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 */
int freeze_super(struct super_block *sb)
{
	int ret;

	atomic_inc(&sb->s_active);
	down_write(&sb->s_umount);
	if (sb->s_writers.frozen != SB_UNFROZEN) {
		deactivate_locked_super(sb);
		return -EBUSY;
	}

	if (!(sb->s_flags & MS_BORN)) {
		up_write(&sb->s_umount);
		return 0;	/* sic - it's "nothing to do" */
	}

	if (sb->s_flags & MS_RDONLY) {
		/* Nothing to do really... */
		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
		up_write(&sb->s_umount);
		return 0;
	}

	sb->s_writers.frozen = SB_FREEZE_WRITE;
	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
	up_write(&sb->s_umount);
	sb_wait_write(sb, SB_FREEZE_WRITE);
	down_write(&sb->s_umount);

	/* Now we go and block page faults... */
	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

	/* All writers are done so after syncing there won't be dirty data */
	sync_filesystem(sb);

	/* Now wait for internal filesystem counter */
	sb->s_writers.frozen = SB_FREEZE_FS;
	sb_wait_write(sb, SB_FREEZE_FS);

	if (sb->s_op->freeze_fs) {
		ret = sb->s_op->freeze_fs(sb);
		if (ret) {
			printk(KERN_ERR
				"VFS: Filesystem freeze failed\n");
			sb->s_writers.frozen = SB_UNFROZEN;
			sb_freeze_unlock(sb);
			wake_up(&sb->s_writers.wait_unfrozen);
			deactivate_locked_super(sb);
			return ret;
		}
	}
	/*
	 * For debugging purposes so that fs can warn if it sees write activity
	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
	 */
	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
	lockdep_sb_freeze_release(sb);
	up_write(&sb->s_umount);
	return 0;
}
EXPORT_SYMBOL(freeze_super);

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super().
 */
int thaw_super(struct super_block *sb)
{
	int error;

	down_write(&sb->s_umount);
	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
		up_write(&sb->s_umount);
		return -EINVAL;
	}

	if (sb->s_flags & MS_RDONLY) {
		sb->s_writers.frozen = SB_UNFROZEN;
		goto out;
	}

	lockdep_sb_freeze_acquire(sb);

	if (sb->s_op->unfreeze_fs) {
		error = sb->s_op->unfreeze_fs(sb);
		if (error) {
			printk(KERN_ERR
				"VFS: Filesystem thaw failed\n");
			lockdep_sb_freeze_release(sb);
			up_write(&sb->s_umount);
			return error;
		}
	}

	sb->s_writers.frozen = SB_UNFROZEN;
	sb_freeze_unlock(sb);
out:
	wake_up(&sb->s_writers.wait_unfrozen);
	deactivate_locked_super(sb);
	return 0;
}
EXPORT_SYMBOL(thaw_super);
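
/*
 * Illustrative sketch (not part of the original file): how in-kernel
 * snapshot-style code would pair freeze_super() and thaw_super() around a
 * consistent point-in-time image.  The example_snapshot name is hypothetical.
 */
#if 0
static int example_snapshot(struct super_block *sb)
{
	int err;

	err = freeze_super(sb);		/* walks SB_FREEZE_WRITE..SB_FREEZE_FS */
	if (err)
		return err;
	/* ... take the snapshot while the fs sits in SB_FREEZE_COMPLETE ... */
	return thaw_super(sb);		/* unblocks writers and page faults */
}
#endif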