swapfile.c 41.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

8
#include <linux/config.h>
Linus Torvalds's avatar
Linus Torvalds committed
9
#include <linux/mm.h>
10
#include <linux/mman.h>
Linus Torvalds's avatar
Linus Torvalds committed
11
#include <linux/slab.h>
Linus Torvalds's avatar
Linus Torvalds committed
12 13 14 15
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
16
#include <linux/namei.h>
Linus Torvalds's avatar
Linus Torvalds committed
17
#include <linux/shm.h>
18
#include <linux/blkdev.h>
Andrew Morton's avatar
Andrew Morton committed
19
#include <linux/writeback.h>
20
#include <linux/proc_fs.h>
21
#include <linux/seq_file.h>
22
#include <linux/init.h>
23
#include <linux/module.h>
24
#include <linux/rmap-locking.h>
25
#include <linux/security.h>
26
#include <linux/backing-dev.h>
Linus Torvalds's avatar
Linus Torvalds committed
27 28

#include <asm/pgtable.h>
29
#include <asm/tlbflush.h>
30
#include <linux/swapops.h>
Linus Torvalds's avatar
Linus Torvalds committed
31 32 33

/* Protects swap_list and the swap_info[] table below. */
spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
/* Highest swap_info[] slot ever activated; never decreases. */
unsigned int nr_swapfiles;
/* Total pages of swap across all active swap areas. */
int total_swap_pages;
/* Set by try_to_unuse() when a swap count hit SWAP_MAP_MAX. */
static int swap_overflow;

EXPORT_SYMBOL(total_swap_pages);

/* Message prefixes for the bad-swap-entry diagnostics in swap_info_get(). */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/* Priority-ordered list of swap areas; -1 means empty. */
struct swap_list_t swap_list = {-1, -1};

struct swap_info_struct swap_info[MAX_SWAPFILES];

/*
 * Array of backing blockdevs, for swap_unplug_fn.  We need this because the
 * bdev->unplug_fn can sleep and we cannot hold swap_list_lock while calling
 * the unplug_fn.  And swap_list_lock cannot be turned into a semaphore.
 */
static DECLARE_MUTEX(swap_bdevs_sem);
static struct block_device *swap_bdevs[MAX_SWAPFILES];

/* Allocate swap in runs of this many pages to keep disk seeks down. */
#define SWAPFILE_CLUSTER 256

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
/*
 * Caller holds swap_bdevs_sem.
 * Record @bdev in the first free slot of swap_bdevs[].
 */
static void install_swap_bdev(struct block_device *bdev)
{
	int slot;

	for (slot = 0; slot < MAX_SWAPFILES; slot++) {
		if (!swap_bdevs[slot]) {
			swap_bdevs[slot] = bdev;
			return;
		}
	}
	/* Table full: impossible, since at most MAX_SWAPFILES are active. */
	BUG();
}

/*
 * Caller holds swap_bdevs_sem.
 * Remove @bdev from swap_bdevs[], shifting the later entries down one
 * slot so the used entries stay contiguous at the front of the array.
 */
static void remove_swap_bdev(struct block_device *bdev)
{
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++) {
		if (swap_bdevs[i] == bdev) {
			/*
			 * Source and destination overlap (src = dst + 1),
			 * so this must be memmove(), not memcpy(): copying
			 * between overlapping regions with memcpy() is
			 * undefined behavior.
			 */
			memmove(&swap_bdevs[i], &swap_bdevs[i + 1],
				(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
			swap_bdevs[MAX_SWAPFILES - 1] = NULL;
			return;
		}
	}
	BUG();	/* bdev was never installed */
}

/*
 * Kick the request queues behind every active swap blockdev.
 * swap_bdevs[] is kept dense (see remove_swap_bdev), so stop at the
 * first NULL slot.  The @unused_bdi argument is ignored.
 */
void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
{
	int idx;

	down(&swap_bdevs_sem);
	for (idx = 0; idx < MAX_SWAPFILES && swap_bdevs[idx] != NULL; idx++) {
		struct backing_dev_info *bdi =
			swap_bdevs[idx]->bd_inode->i_mapping->backing_dev_info;

		bdi->unplug_io_fn(bdi);
	}
	up(&swap_bdevs_sem);
}

Linus Torvalds's avatar
Linus Torvalds committed
106
/*
 * Find a free slot in si->swap_map and claim it (set count to 1, update
 * the inuse/free accounting).  Returns the claimed offset, or 0 if the
 * map is full.  Caller holds the device lock.
 */
static inline int scan_swap_map(struct swap_info_struct *si)
{
	unsigned long offset;
	/* 
	 * We try to cluster swap pages by allocating them
	 * sequentially in swap.  Once we've allocated
	 * SWAPFILE_CLUSTER pages this way, however, we resort to
	 * first-free allocation, starting a new cluster.  This
	 * prevents us from scattering swap pages all over the entire
	 * swap partition, so that we reduce overall disk seek times
	 * between swap pages.  -- sct */
	if (si->cluster_nr) {
		/* Continue the current cluster from cluster_next. */
		while (si->cluster_next <= si->highest_bit) {
			offset = si->cluster_next++;
			if (si->swap_map[offset])
				continue;
			si->cluster_nr--;
			goto got_page;
		}
	}
	si->cluster_nr = SWAPFILE_CLUSTER;

	/* try to find an empty (even not aligned) cluster. */
	offset = si->lowest_bit;
 check_next_cluster:
	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
	{
		int nr;
		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
			if (si->swap_map[nr])
			{
				/* Occupied slot: restart scan just past it. */
				offset = nr+1;
				goto check_next_cluster;
			}
		/* We found a completly empty cluster, so start
		 * using it.
		 */
		goto got_page;
	}
	/* No luck, so now go finegrined as usual. -Andrea */
	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
		if (si->swap_map[offset])
			continue;
		si->lowest_bit = offset+1;
	got_page:
		/* Shrink the known-free window if we took its edge slot. */
		if (offset == si->lowest_bit)
			si->lowest_bit++;
		if (offset == si->highest_bit)
			si->highest_bit--;
		if (si->lowest_bit > si->highest_bit) {
			/* Map now full: mark window empty (lowest > highest). */
			si->lowest_bit = si->max;
			si->highest_bit = 0;
		}
		si->swap_map[offset] = 1;
		si->inuse_pages++;
		nr_swap_pages--;
		si->cluster_next = offset+1;
		return offset;
	}
	/* Nothing free anywhere: mark the window empty and fail. */
	si->lowest_bit = si->max;
	si->highest_bit = 0;
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
170
/*
 * Allocate one page of swap from the highest-priority swap area with
 * free space.  Returns the swap entry, or entry.val == 0 on failure
 * (no swap space left).
 */
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct * p;
	unsigned long offset;
	swp_entry_t entry;
	int type, wrapped = 0;

	entry.val = 0;	/* Out of memory */
	swap_list_lock();
	type = swap_list.next;
	if (type < 0)
		goto out;	/* no swap areas at all */
	if (nr_swap_pages <= 0)
		goto out;

	while (1) {
		p = &swap_info[type];
		/* Only allocate from areas that are fully enabled. */
		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
			swap_device_lock(p);
			offset = scan_swap_map(p);
			swap_device_unlock(p);
			if (offset) {
				entry = swp_entry(type,offset);
				/*
				 * Rotate swap_list.next among areas of equal
				 * priority; fall back to the head when the
				 * next area has a different priority.
				 */
				type = swap_info[type].next;
				if (type < 0 ||
					p->prio != swap_info[type].prio) {
						swap_list.next = swap_list.head;
				} else {
					swap_list.next = type;
				}
				goto out;
			}
		}
		/* This area was full: move on, wrapping at most once. */
		type = p->next;
		if (!wrapped) {
			if (type < 0 || p->prio != swap_info[type].prio) {
				type = swap_list.head;
				wrapped = 1;
			}
		} else
			if (type < 0)
				goto out;	/* out of swap space */
	}
out:
	swap_list_unlock();
	return entry;
}

Linus Torvalds's avatar
Linus Torvalds committed
218
/*
 * Validate @entry and return its swap area, with swap_list_lock and the
 * area's device lock held (release via swap_info_put).  Returns NULL,
 * after logging the reason, if the entry is malformed or unused.
 */
static struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = & swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (!p->swap_map[offset])
		goto bad_free;
	swap_list_lock();
	/* Freeing here may open space in a higher-priority area. */
	if (p->prio > swap_info[swap_list.next].prio)
		swap_list.next = type;
	swap_device_lock(p);
	return p;

bad_free:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
	goto out;
bad_offset:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}	

/*
 * Release the locks taken by swap_info_get(): device lock first,
 * then the global swap list lock.
 */
static void swap_info_put(struct swap_info_struct * p)
{
	swap_device_unlock(p);
	swap_list_unlock();
}

/*
 * Drop one reference on the swap slot at @offset in area @p, updating
 * the free-slot window and page accounting when the count reaches zero.
 * Slots pinned at SWAP_MAP_MAX are never decremented.  Returns the
 * resulting reference count.  Caller holds the locks from swap_info_get.
 */
static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
{
	int count = p->swap_map[offset];

	if (count >= SWAP_MAP_MAX)
		return count;	/* permanently pinned, leave untouched */

	p->swap_map[offset] = --count;
	if (count)
		return count;

	/* Slot just became free: widen the free window to include it. */
	if (offset < p->lowest_bit)
		p->lowest_bit = offset;
	if (offset > p->highest_bit)
		p->highest_bit = offset;
	nr_swap_pages++;
	p->inuse_pages--;
	return 0;
}
Linus Torvalds's avatar
Linus Torvalds committed
281

Linus Torvalds's avatar
Linus Torvalds committed
282 283 284 285 286 287 288 289 290 291
/*
 * Caller has made sure that the swapdevice corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *si = swap_info_get(entry);

	if (!si)
		return;	/* bad entry: already logged by swap_info_get */

	swap_entry_free(si, swp_offset(entry));
	swap_info_put(si);
}

Linus Torvalds's avatar
Linus Torvalds committed
297 298 299 300 301 302 303 304 305 306 307 308 309 310
/*
 * Check if we're the only user of a swap page,
 * when the page is locked.
 */
static int exclusive_swap_page(struct page *page)
{
	int retval = 0;
	struct swap_info_struct * p;
	swp_entry_t entry;

	/* For a swapcache page, page->index holds the swap entry. */
	entry.val = page->index;
	p = swap_info_get(entry);
	if (p) {
		/* Is the only swap cache user the cache itself? */
		if (p->swap_map[swp_offset(entry)] == 1) {
			/* Recheck the page count with the pagecache lock held.. */
			spin_lock_irq(&swapper_space.tree_lock);
			/* 2 == us + swapcache; discount any buffer reference. */
			if (page_count(page) - !!PagePrivate(page) == 2)
				retval = 1;
			spin_unlock_irq(&swapper_space.tree_lock);
		}
		swap_info_put(p);
	}
	return retval;
}

/*
 * We can use this swap cache entry directly
 * if there are no other references to it.
 *
 * Here "exclusive_swap_page()" does the real
 * work, but we opportunistically check whether
 * we need to get all the locks first..
 */
int can_share_swap_page(struct page *page)
{
	int retval = 0;

	if (!PageLocked(page))
		BUG();
	switch (page_count(page)) {
	case 3:
		/* Three refs only tolerable if one is a buffer ref. */
		if (!PagePrivate(page))
			break;
		/* Fallthrough */
	case 2:
		if (!PageSwapCache(page))
			break;
		retval = exclusive_swap_page(page);
		break;
	case 1:
		/* Sole reference: shareable unless it's a reserved page. */
		if (PageReserved(page))
			break;
		retval = 1;
	}
	return retval;
}

Linus Torvalds's avatar
Linus Torvalds committed
355 356 357 358 359 360 361 362 363 364
/*
 * Work out if there are any other processes sharing this
 * swap cache page. Free it if you can. Return success.
 */
int remove_exclusive_swap_page(struct page *page)
{
	int retval;
	struct swap_info_struct * p;
	swp_entry_t entry;

	BUG_ON(PagePrivate(page));
	BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_count(page) != 2) /* 2: us + cache */
		return 0;

	entry.val = page->index;
	p = swap_info_get(entry);
	if (!p)
		return 0;

	/* Is the only swap cache user the cache itself? */
	retval = 0;
	if (p->swap_map[swp_offset(entry)] == 1) {
		/* Recheck the page count with the pagecache lock held.. */
		spin_lock_irq(&swapper_space.tree_lock);
		if ((page_count(page) == 2) && !PageWriteback(page)) {
			__delete_from_swap_cache(page);
			/* Data now lives only in this page: keep it dirty. */
			SetPageDirty(page);
			retval = 1;
		}
		spin_unlock_irq(&swapper_space.tree_lock);
	}
	swap_info_put(p);

	if (retval) {
		/* Drop the swap slot and the cache's page reference. */
		swap_free(entry);
		page_cache_release(page);
	}

	return retval;
}

Linus Torvalds's avatar
Linus Torvalds committed
402 403 404 405 406 407 408 409 410 411 412
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
void free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct * p;
	struct page *page = NULL;

	p = swap_info_get(entry);
	if (p) {
		/* If only the swapcache still refers, grab its page. */
		if (swap_entry_free(p, swp_offset(entry)) == 1)
			page = find_trylock_page(&swapper_space, entry.val);
		swap_info_put(p);
	}
	if (page) {
		int one_user;

		BUG_ON(PagePrivate(page));
		page_cache_get(page);
		one_user = (page_count(page) == 2);
		/* Only cache user (+us), or swap space full? Free it! */
		if (!PageWriteback(page) && (one_user || vm_swap_full())) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		page_cache_release(page);
	}
}

/*
 * The swap entry has been read in advance, and we return 1 to indicate
 * that the page has been used or is no longer needed.
 *
 * Always set the resulting pte to be nowrite (the same as COW pages
 * after one process has exited).  We don't know just how many PTEs will
 * share this swap entry, so be cautious and let do_wp_page work out
 * what to do if a write is requested later.
 */
/* vma->vm_mm->page_table_lock is held */
static void
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	vma->vm_mm->rss++;
	get_page(page);		/* pte now holds a reference to the page */
	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
	swap_free(entry);	/* drop this pte's reference on the swap slot */
}

Andrew Morton's avatar
Andrew Morton committed
454 455
/* vma->vm_mm->page_table_lock is held */
/*
 * Scan one pmd's worth of ptes for @entry; replace it with @page if
 * found.  Returns 1 when the entry was found (scan can stop), 0 to
 * continue scanning.
 */
static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	pte_t * pte;
	unsigned long end;
	pte_t swp_pte = swp_entry_to_pte(entry);

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}
	pte = pte_offset_map(dir, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same(*pte, swp_pte))) {
			unuse_pte(vma, offset + address, pte,
					entry, page, pte_chainp);
			pte_unmap(pte);
			return 1;
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	/* pte was post-incremented; unmap via the last mapped slot. */
	pte_unmap(pte - 1);
	return 0;
}

Andrew Morton's avatar
Andrew Morton committed
494 495
/* vma->vm_mm->page_table_lock is held */
/*
 * Scan one pgd entry's worth of pmds for @entry.  Returns 1 when found
 * (propagated up from unuse_pmd), 0 to continue scanning.
 */
static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	if (address >= end)
		BUG();
	do {
		if (unuse_pmd(vma, pmd, address, end - address,
				offset, entry, page, pte_chainp))
			return 1;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

Andrew Morton's avatar
Andrew Morton committed
527 528 529
/* vma->vm_mm->page_table_lock is held */
/*
 * Walk one vma's page tables looking for @entry.  Returns 1 when the
 * entry was found and replaced, 0 otherwise.
 */
static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	unsigned long start = vma->vm_start, end = vma->vm_end;

	if (start >= end)
		BUG();
	do {
		if (unuse_pgd(vma, pgdir, start, end - start,
				entry, page, pte_chainp))
			return 1;
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (start && (start < end));
	return 0;
}

Andrew Morton's avatar
Andrew Morton committed
545
/*
 * Search @mm's address space for @entry and replace it with @page.
 * Returns 0 on completion (found or not), -ENOMEM if the pte_chain
 * needed for reverse mapping could not be allocated.
 */
static int unuse_process(struct mm_struct * mm,
			swp_entry_t entry, struct page* page)
{
	struct vm_area_struct* vma;
	struct pte_chain *pte_chain;

	/* Allocate before taking page_table_lock: allocation can sleep. */
	pte_chain = pte_chain_alloc(GFP_KERNEL);
	if (!pte_chain)
		return -ENOMEM;

	/*
	 * Go through process' page directory.
	 */
	spin_lock(&mm->page_table_lock);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
		if (unuse_vma(vma, pgd, entry, page, &pte_chain))
			break;
	}
	spin_unlock(&mm->page_table_lock);
	pte_chain_free(pte_chain);
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
569
/*
 * Scan swap_map from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
static int find_next_to_unuse(struct swap_info_struct *si, int prev)
{
	int max = si->max;
	int i = prev;
	int count;

	/*
	 * No need for swap_device_lock(si) here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_list_lock()).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				/* Full pass found nothing in use: done. */
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = si->swap_map[i];
		/* Skip free slots and bad blocks. */
		if (count && count != SWAP_MAP_BAD)
			break;
	}
	return i;
}

Linus Torvalds's avatar
Linus Torvalds committed
606 607 608 609 610 611 612 613
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering (now preserved by swap_out()),
	 * which clusters forked address spaces together, most recent
	 * child immediately after parent.  If we race with dup_mmap(),
	 * we very much want to resolve parent before child, otherwise
	 * we may miss some entries: using last mm would invert that.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * mmput() removes mm from mmlist before exit_mmap() and its
	 * zap_page_range().  That's not too bad, those entries are
	 * on their way out, and handled faster there than here.
	 * do_munmap() behaves similarly, taking the range out of mm's
	 * vma list before zap_page_range().  But unfortunately, when
	 * unmapping a part of a vma, it takes the whole out first,
	 * then reinserts what's left after (might even reschedule if
	 * open() method called) - so swap entries may be invisible
	 * to swapoff for a while, then reappear - but that is rare.
	 */
	while ((i = find_next_to_unuse(si, i))) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* 
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it. 
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry, without blocking.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swcount > 1) {
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				retval = unuse_process(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			/* References remain: walk every mm on the mmlist. */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (*swap_map > 1 && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Hold mm across the unlocked section below. */
				atomic_inc(&mm->mm_users);
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (swcount <= 1)
					;
				else if (mm == &init_mm) {
					set_start_mm = 1;
					shmem = shmem_unuse(entry, page);
				} else
					retval = unuse_process(mm, entry, page);
				if (set_start_mm && *swap_map < swcount) {
					/* This mm freed a reference: start
					 * the next search from here. */
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * How could swap count reach 0x7fff when the maximum
		 * pid is 0x7fff, and there's no way to repeat a swap
		 * page within an mm (except in shmem, where it's the
		 * shared object which takes the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
		 *
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		if (*swap_map == SWAP_MAP_MAX) {
			swap_device_lock(si);
			*swap_map = 1;
			swap_device_unlock(si);
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Note shmem_unuse already deleted a swappage from
		 * the swap cache, unless the move to filepage failed:
		 * in which case it left swappage in cache, lowered its
		 * swap count to pass quickly through the loops above,
		 * and now we must reincrement count to try again later.
		 */
		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}
		if (PageSwapCache(page)) {
			if (shmem)
				swap_duplicate(entry);
			else
				delete_from_swap_cache(page);
		}

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}

	mmput(start_mm);
	if (reset_overflow) {
		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
		swap_overflow = 0;
	}
	return retval;
}

842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset `offset'.
 */
sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
{
	/* Start at the cached extent: lookups tend to be sequential. */
	struct swap_extent *se = sis->curr_swap_extent;
	struct swap_extent *start_se = se;

	for ( ; ; ) {
		struct list_head *lh;

		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		/* Walk backwards, skipping over the list head sentinel. */
		lh = se->list.prev;
		if (lh == &sis->extent_list)
			lh = lh->prev;
		se = list_entry(lh, struct swap_extent, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}

/*
 * Release every extent attached to this swap area and reset the count.
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	for (;;) {
		struct swap_extent *ext;

		if (list_empty(&sis->extent_list))
			break;
		ext = list_entry(sis->extent_list.next,
				 struct swap_extent, list);
		list_del(&ext->list);
		kfree(ext);
	}
	sis->nr_extents = 0;
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in block order.
 *
 * This function rather assumes that it is called in ascending sector_t order.
 * It doesn't look for extent coalescing opportunities.
 */
static int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *ext;
	struct swap_extent *new_ext;
	struct list_head *pos;

	/*
	 * First pass: try to extend an existing extent which ends exactly
	 * where the new range begins, both on disk and in page space.
	 */
	for (pos = sis->extent_list.next; pos != &sis->extent_list;
	     pos = pos->next) {
		ext = list_entry(pos, struct swap_extent, list);
		if (ext->start_block + ext->nr_pages == start_block &&
		    ext->start_page + ext->nr_pages == start_page) {
			ext->nr_pages += nr_pages;	/* coalesce */
			return 0;
		}
	}

	/* Nothing to merge with: allocate a fresh extent */
	new_ext = kmalloc(sizeof(*new_ext), GFP_KERNEL);
	if (!new_ext)
		return -ENOMEM;
	new_ext->start_page = start_page;
	new_ext->nr_pages = nr_pages;
	new_ext->start_block = start_block;

	/*
	 * Walk backwards from the lowest block to find the insertion point
	 * which keeps the list sorted by start_block.
	 */
	for (pos = sis->extent_list.prev; pos != &sis->extent_list;
	     pos = pos->prev) {
		ext = list_entry(pos, struct swap_extent, list);
		if (ext->start_block > start_block)
			break;
	}
	list_add_tail(&new_ext->list, pos);
	sis->nr_extents++;
	return 0;
}

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For S_ISREG swapfiles we hold i_sem across the life of the swapon.  This
 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
 * which will scribble on the fs.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 *
 * Returns 0 on success, -EINVAL if the file has holes or yields no usable
 * pages, -ENOMEM if an extent cannot be allocated.  On success sis->max
 * and sis->highest_bit are trimmed to the number of mappable pages.
 */
static int setup_swap_extents(struct swap_info_struct *sis)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Blockdev: one extent mapping the whole device, 1:1 */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		goto done;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent list.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		/* bmap() returns 0 for an unmapped (hole) block */
		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		/* Verify the remaining fs-blocks of this page are contiguous */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1,
				first_block >> (PAGE_SHIFT - blkbits));
		if (ret)
			goto out;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = 0;
	if (page_no == 0)
		ret = -EINVAL;	/* no usable page-sized runs at all */
	sis->max = page_no;
	sis->highest_bit = page_no - 1;
done:
	/* Seed the lookup cache with the lowest-block extent */
	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
					struct swap_extent, list);
	goto out;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
out:
	return ret;
}

1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067
#if 0	/* We don't need this yet */
#include <linux/backing-dev.h>
/*
 * Report whether writeback to this page's backing device is congested.
 * For a swapcache page, page->mapping is the swapper space, so the swap
 * area's blockdev queue is consulted instead.
 * NOTE(review): compiled out with #if 0 -- retained for future use.
 */
int page_queue_congested(struct page *page)
{
	struct backing_dev_info *bdi;

	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */

	bdi = page->mapping->backing_dev_info;
	if (PageSwapCache(page)) {
		/* page->index of a swapcache page encodes the swap entry */
		swp_entry_t entry = { .val = page->index };
		struct swap_info_struct *sis;

		sis = get_swap_info_struct(swp_type(entry));
		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
	}
	return bdi_write_congested(bdi);
}
#endif

1068
/*
 * The swapoff system call.  Finds the active swap area backed by the
 * named file/device, takes it out of the allocation lists, migrates all
 * its in-use entries back into memory via try_to_unuse(), and then tears
 * the area down.  On unuse failure the area is re-inserted unchanged.
 * Returns 0 on success or a negative errno.
 */
asmlinkage long sys_swapoff(const char __user * specialfile)
{
	struct swap_info_struct * p = NULL;
	unsigned short *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	char * pathname;
	int i, type, prev;
	int err;
	
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pathname = getname(specialfile);
	err = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;

	victim = filp_open(pathname, O_RDWR, 0);
	putname(pathname);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	/* Match the victim against active swap areas by address_space */
	mapping = victim->f_mapping;
	prev = -1;
	swap_list_lock();
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		swap_list_unlock();
		goto out_dput;
	}
	/*
	 * The pages being swapped back in must be accounted as committed
	 * memory; fail if the overcommit policy cannot absorb them.
	 */
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		swap_list_unlock();
		goto out_dput;
	}
	/* Unlink this area from the priority-ordered allocation list */
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;	/* no new allocations from this area */
	swap_list_unlock();
	current->flags |= PF_SWAPOFF;
	err = try_to_unuse(type);	/* pull every entry back into memory */
	current->flags &= ~PF_SWAPOFF;
	if (err) {
		/* re-insert swap space back into swap_list */
		swap_list_lock();
		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
			if (p->prio >= swap_info[i].prio)
				break;
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = p - swap_info;
		else
			swap_info[prev].next = p - swap_info;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		swap_list_unlock();
		goto out_dput;
	}
	/* Teardown: lock order is swap_bdevs_sem -> swap_list -> device */
	down(&swap_bdevs_sem);
	swap_list_lock();
	swap_device_lock(p);
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	destroy_swap_extents(p);
	swap_device_unlock(p);
	swap_list_unlock();
	remove_swap_bdev(p->bdev);
	up(&swap_bdevs_sem);
	vfree(swap_map);
	if (S_ISBLK(mapping->host->i_mode)) {
		/* Restore the blockdev's original blocksize and release it */
		struct block_device *bdev = I_BDEV(mapping->host);
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	} else {
		/* Drop the i_sem held since swapon (S_ISREG case) */
		up(&mapping->host->i_sem);
	}
	filp_close(swap_file, NULL);
	err = 0;

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}

1179 1180 1181
#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take swap_list_lock (held until swap_stop) and
 * return the *pos'th in-use swap area, or NULL when past the end.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	loff_t remaining = *pos;
	int idx;

	swap_list_lock();

	for (idx = 0, si = swap_info; idx < nr_swapfiles; idx++, si++) {
		/* Skip slots which are unused or still being set up */
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (remaining-- == 0)
			return si;
	}

	return NULL;
}

/*
 * seq_file ->next: advance to the next in-use swap area after v,
 * bumping *pos; NULL when the table is exhausted.
 */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *ptr = v;
	/*
	 * One past the last slot which can be in use.  Typed pointer
	 * arithmetic replaces the original arithmetic on void *, which
	 * is a GCC extension and undefined in ISO C.
	 */
	struct swap_info_struct *endptr = swap_info + nr_swapfiles;

	for (++ptr; ptr < endptr; ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		++*pos;
		return ptr;
	}

	return NULL;
}

/* seq_file ->stop: drop the lock taken in swap_start() */
static void swap_stop(struct seq_file *swap, void *v)
{
	swap_list_unlock();
}

1219 1220 1221 1222
/*
 * seq_file ->show: emit one /proc/swaps line for this swap area.
 * NOTE(review): the header row is only printed when the entry shown is
 * swap_info[0]; if slot 0 is unused the header is skipped entirely.
 */
static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *ptr = v;
	struct file *file;
	int len;

	if (v == swap_info)
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");

	file = ptr->swap_file;
	/* Emit the swapfile's path, escaping whitespace and backslashes */
	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
	seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
		       len < 40 ? 40 - len : 1, " ",	/* pad path to 40 cols */
		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
				"partition" : "file\t",
		       ptr->pages << (PAGE_SHIFT - 10),		/* size in kB */
		       ptr->inuse_pages << (PAGE_SHIFT - 10),	/* used in kB */
		       ptr->prio);
	return 0;
}

1240
/* seq_file iterator hooks backing /proc/swaps */
static struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268

/* open() handler for /proc/swaps: attach the seq_file iterator */
static int swaps_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &swaps_op);
}

/* file_operations for /proc/swaps; read side is plain seq_file plumbing */
static struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

/*
 * Register /proc/swaps at boot.  Creation failure is deliberately
 * non-fatal: the system just runs without the proc entry.
 */
static int __init procswaps_init(void)
{
	struct proc_dir_entry *pde = create_proc_entry("swaps", 0, NULL);

	if (pde != NULL)
		pde->proc_fops = &proc_swaps_operations;
	return 0;
}
__initcall(procswaps_init);
1269 1270
#endif /* CONFIG_PROC_FS */

Linus Torvalds's avatar
Linus Torvalds committed
1271 1272 1273 1274 1275
/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 *
 * Claims a free swap_info slot, opens and validates the named file or
 * block device, reads and checks the swap header, builds the swap map
 * and extent list, then links the area into the priority-ordered
 * allocation list.  Returns 0 on success or a negative errno; on any
 * failure the partially-initialised slot is rolled back.
 */
asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
{
	struct swap_info_struct * p;
	char *name = NULL;
	struct block_device *bdev = NULL;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	unsigned int type;
	int i, prev;
	int error;
	static int least_priority;	/* default prios count down from -1 */
	union swap_header *swap_header = 0;
	int swap_header_version;
	int nr_good_pages = 0;
	unsigned long maxpages = 1;
	int swapfilesize;
	unsigned short *swap_map;
	struct page *page = NULL;
	struct inode *inode = NULL;
	int did_down = 0;		/* i_sem held (S_ISREG case)? */

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	swap_list_lock();
	/* Find the first unused swap_info slot */
	p = swap_info;
	for (type = 0 ; type < nr_swapfiles ; type++,p++)
		if (!(p->flags & SWP_USED))
			break;
	error = -EPERM;
	/*
	 * Test if adding another swap device is possible. There are
	 * two limiting factors: 1) the number of bits for the swap
	 * type swp_entry_t definition and 2) the number of bits for
	 * the swap type in the swap ptes as defined by the different
	 * architectures. To honor both limitations a swap entry
	 * with swap offset 0 and swap type ~0UL is created, encoded
	 * to a swap pte, decoded to a swp_entry_t again and finally
	 * the swap type part is extracted. This will mask all bits
	 * from the initial ~0UL that can't be encoded in either the
	 * swp_entry_t or the architecture definition of a swap pte.
	 */
	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
		swap_list_unlock();
		goto out;
	}
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	/* Initialise the claimed slot; SWP_USED reserves it against races */
	INIT_LIST_HEAD(&p->extent_list);
	p->flags = SWP_USED;
	p->nr_extents = 0;
	p->swap_file = NULL;
	p->old_block_size = 0;
	p->swap_map = NULL;
	p->lowest_bit = 0;
	p->highest_bit = 0;
	p->cluster_nr = 0;
	p->inuse_pages = 0;
	p->sdev_lock = SPIN_LOCK_UNLOCKED;
	p->next = -1;
	if (swap_flags & SWAP_FLAG_PREFER) {
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
	} else {
		p->prio = --least_priority;
	}
	swap_list_unlock();
	name = getname(specialfile);
	error = PTR_ERR(name);
	if (IS_ERR(name)) {
		name = NULL;
		goto bad_swap_2;
	}
	swap_file = filp_open(name, O_RDWR, 0);
	error = PTR_ERR(swap_file);
	if (IS_ERR(swap_file)) {
		swap_file = NULL;
		goto bad_swap_2;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	/* Reject a file/device which already backs another swap area */
	error = -EBUSY;
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = &swap_info[i];

		if (i == type || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping)
			goto bad_swap;
	}

	error = -EINVAL;
	if (S_ISBLK(inode->i_mode)) {
		/* Claim the blockdev and switch it to PAGE_SIZE blocks */
		bdev = I_BDEV(inode);
		error = bd_claim(bdev, sys_swapon);
		if (error < 0) {
			bdev = NULL;
			goto bad_swap;
		}
		p->old_block_size = block_size(bdev);
		error = set_blocksize(bdev, PAGE_SIZE);
		if (error < 0)
			goto bad_swap;
		p->bdev = bdev;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		/* Hold i_sem for the life of the swapon (see extents comment) */
		down(&inode->i_sem);
		did_down = 1;
	} else {
		goto bad_swap;
	}

	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_cache_page(mapping, 0,
			(filler_t *)mapping->a_ops->readpage, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	wait_on_page_locked(page);
	if (!PageUptodate(page))
		goto bad_swap;
	kmap(page);	/* unmapped at out: */
	swap_header = page_address(page);

	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
		swap_header_version = 1;
	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
		swap_header_version = 2;
	else {
		printk("Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}
	
	switch (swap_header_version) {
	case 1:
		printk(KERN_ERR "version 0 swap is no longer supported. "
			"Use mkswap -v1 %s\n", name);
		error = -EINVAL;
		goto bad_swap;
	case 2:
		/* Check the swap header's sub-version and the size of
                   the swap file and bad block lists */
		if (swap_header->info.version != 1) {
			printk(KERN_WARNING
			       "Unable to handle swap header version %d\n",
			       swap_header->info.version);
			error = -EINVAL;
			goto bad_swap;
		}

		p->lowest_bit  = 1;
		/*
		 * Find out how many pages are allowed for a single swap
		 * device. There are two limiting factors: 1) the number of
		 * bits for the swap offset in the swp_entry_t type and
		 * 2) the number of bits in the a swap pte as defined by
		 * the different architectures. In order to find the
		 * largest possible bit mask a swap entry with swap type 0
		 * and swap offset ~0UL is created, encoded to a swap pte,
		 * decoded to a swp_entry_t again and finally the swap
		 * offset is extracted. This will mask all the bits from
		 * the initial ~0UL mask that can't be encoded in either
		 * the swp_entry_t or the architecture definition of a
		 * swap pte.
		 */
		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
		if (maxpages > swap_header->info.last_page)
			maxpages = swap_header->info.last_page;
		p->highest_bit = maxpages - 1;

		error = -EINVAL;
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			goto bad_swap;
		
		/* OK, set up the swap map and apply the bad block list */
		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
			error = -ENOMEM;
			goto bad_swap;
		}

		error = 0;
		memset(p->swap_map, 0, maxpages * sizeof(short));
		for (i=0; i<swap_header->info.nr_badpages; i++) {
			int page = swap_header->info.badpages[i];
			if (page <= 0 || page >= swap_header->info.last_page)
				error = -EINVAL;
			else
				p->swap_map[page] = SWAP_MAP_BAD;
		}
		nr_good_pages = swap_header->info.last_page -
				swap_header->info.nr_badpages -
				1 /* header page */;
		if (error) 
			goto bad_swap;
	}
	
	if (swapfilesize && maxpages > swapfilesize) {
		printk(KERN_WARNING
		       "Swap area shorter than signature indicates\n");
		error = -EINVAL;
		goto bad_swap;
	}
	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}
	p->swap_map[0] = SWAP_MAP_BAD;	/* never allocate the header page */
	p->max = maxpages;
	p->pages = nr_good_pages;

	error = setup_swap_extents(p);
	if (error)
		goto bad_swap;

	/* Activate: lock order is swap_bdevs_sem -> swap_list -> device */
	down(&swap_bdevs_sem);
	swap_list_lock();
	swap_device_lock(p);
	p->flags = SWP_ACTIVE;
	nr_swap_pages += nr_good_pages;
	total_swap_pages += nr_good_pages;
	printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
		nr_good_pages<<(PAGE_SHIFT-10), name,
		p->prio, p->nr_extents);

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio) {
			break;
		}
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
	swap_device_unlock(p);
	swap_list_unlock();
	install_swap_bdev(p->bdev);
	up(&swap_bdevs_sem);
	error = 0;
	goto out;
bad_swap:
	if (bdev) {
		/* Undo the blockdev claim and blocksize change */
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	}
bad_swap_2:
	/* Roll the slot back to the unused state */
	swap_list_lock();
	swap_map = p->swap_map;
	p->swap_file = NULL;
	p->swap_map = NULL;
	p->flags = 0;
	if (!(swap_flags & SWAP_FLAG_PREFER))
		++least_priority;	/* give back the auto-assigned prio */
	swap_list_unlock();
	destroy_swap_extents(p);
	if (swap_map)
		vfree(swap_map);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	if (error && did_down)
		up(&inode->i_sem);
	return error;
}

void si_swapinfo(struct sysinfo *val)
{
	unsigned int i;
Linus Torvalds's avatar
Linus Torvalds committed
1567
	unsigned long nr_to_be_unused = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1568

Linus Torvalds's avatar
Linus Torvalds committed
1569
	swap_list_lock();
Linus Torvalds's avatar
Linus Torvalds committed
1570
	for (i = 0; i < nr_swapfiles; i++) {
Hugh Dickins's avatar
Hugh Dickins committed
1571 1572
		if (!(swap_info[i].flags & SWP_USED) ||
		     (swap_info[i].flags & SWP_WRITEOK))
Linus Torvalds's avatar
Linus Torvalds committed
1573
			continue;
1574
		nr_to_be_unused += swap_info[i].inuse_pages;
Linus Torvalds's avatar
Linus Torvalds committed
1575
	}
Linus Torvalds's avatar
Linus Torvalds committed
1576 1577 1578
	val->freeswap = nr_swap_pages + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	swap_list_unlock();
Linus Torvalds's avatar
Linus Torvalds committed
1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 * "permanent", but will be reclaimed by the next swapoff.
 *
 * Returns 1 if the entry was valid and its count was raised (or pinned
 * at SWAP_MAP_MAX), 0 for a free/bad offset or an out-of-range type.
 */
int swap_duplicate(swp_entry_t entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;
	int result = 0;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = type + swap_info;
	offset = swp_offset(entry);

	swap_device_lock(p);
	/* Only in-range, currently-used entries may be duplicated */
	if (offset < p->max && p->swap_map[offset]) {
		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
			p->swap_map[offset]++;
			result = 1;
		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
			/* Count would overflow: pin at SWAP_MAP_MAX instead */
			if (swap_overflow++ < 5)
				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
			p->swap_map[offset] = SWAP_MAP_MAX;
			result = 1;
		}
	}
	swap_device_unlock(p);
out:
	return result;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}

1620 1621
/*
 * Return the swap_info slot for the given swap type.  No validation:
 * callers are expected to pass a type which pins a live swap area.
 */
struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
	return &swap_info[type];
}

/*
 * swap_device_lock prevents swap_map being freed. Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 *
 * Compute the readahead window around `entry': sets *offset to the start
 * of the naturally-aligned 2^page_cluster cluster containing it and
 * returns the number of consecutive in-use entries from there (0 when
 * readahead is disabled).
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	int ret = 0, i = 1 << page_cluster;
	unsigned long toff;
	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;

	if (!page_cluster)	/* no readahead */
		return 0;
	/* Round the target offset down to its cluster boundary */
	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
	if (!toff)		/* first page is swap header */
		toff++, i--;
	*offset = toff;

	swap_device_lock(swapdev);
	do {
		/* Don't read-ahead past the end of the swap area */
		if (toff >= swapdev->max)
			break;
		/* Don't read in free or bad pages */
		if (!swapdev->swap_map[toff])
			break;
		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
			break;
		toff++;
		ret++;
	} while (--i);
	swap_device_unlock(swapdev);
	return ret;
}