Commit e2f6721a authored by Linus Torvalds

v2.4.9.14 -> v2.4.9.15

  - Jan Harkes: make Coda work with arbitrary host filesystems, not
  just filesystems that use generic_file_read/write
  - Al Viro: block device cleanups
  - Hugh Dickins: swap device lock fixes - fix swap readahead race
  - me, Andrea: more reference bit cleanups
parent 269f8f70
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 10
EXTRAVERSION =-pre14
EXTRAVERSION =-pre15
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
......
@@ -491,7 +491,6 @@ static void __exit rd_cleanup (void)
bdev->bd_cache_openers--;
truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
blkdev_put(bdev, BDEV_FILE);
bdput(bdev);
}
destroy_buffers(MKDEV(MAJOR_NR, i));
}
@@ -780,7 +779,7 @@ static void __init rd_load_image(kdev_t device, int offset, int unit)
if (i && (i % devblocks == 0)) {
printk("done disk #%d.\n", i/devblocks);
rotate = 0;
if (blkdev_close(inode, &infile) != 0) {
if (infile.f_op->release(inode, &infile) != 0) {
printk("Error closing the disk.\n");
goto noclose_input;
}
......
@@ -103,6 +103,7 @@ int raw_open(struct inode *inode, struct file *filp)
if (!bdev)
goto out;
atomic_inc(&bdev->bd_count);
rdev = to_kdev_t(bdev->bd_dev);
err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW);
if (err)
......
@@ -270,6 +270,8 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
return 0;
}
EXPORT_SYMBOL(tty_register_ldisc);
/* Set the discipline of a tty line. */
static int tty_set_ldisc(struct tty_struct *tty, int ldisc)
{
......
@@ -279,6 +279,7 @@ static void __init probedisk(int major, int minor,int device)
int i;
struct highpoint_raid_conf *prom;
static unsigned char block[4096];
struct block_device *bdev;
if (maxsectors(major,minor)==0)
return;
@@ -301,12 +302,12 @@ static void __init probedisk(int major, int minor,int device)
if (i>8)
return;
raid[device].disk[i].bdev = bdget(MKDEV(major,minor));
if (raid[device].disk[i].bdev != NULL) {
bdev = bdget(MKDEV(major,minor));
if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) {
int j=0;
struct gendisk *gd;
/* This is supposed to prevent others from stealing our underlying disks */
blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
raid[device].disk[i].bdev = bdev;
/* now blank the /proc/partitions table for the wrong partition table,
so that scripts don't accidentally mount it and crash the kernel */
/* XXX: the 0 is an utter hack --hch */
@@ -408,12 +409,12 @@ static void __exit hptraid_exit (void)
{
int i,device;
for (device = 0; device<16; device++) {
for (i=0;i<8;i++)
if (raid[device].disk[i].bdev) {
blkdev_put(raid[device].disk[i].bdev, BDEV_RAW);
bdput(raid[device].disk[i].bdev);
raid[device].disk[i].bdev = NULL;
}
for (i=0;i<8;i++) {
struct block_device *bdev = raid[device].disk[i].bdev;
raid[device].disk[i].bdev = NULL;
if (bdev)
blkdev_put(bdev, BDEV_RAW);
}
if (raid[device].sectors)
ataraid_release_device(device);
}
......
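The hptraid fix above (and the matching pdcraid fix below) changes the ordering in both directions: probe now opens the device fully before publishing the pointer into the raid[] array, and exit clears the pointer before calling blkdev_put(). A compressed user-space sketch of that discipline, with illustrative names only, not driver code:

#include <stdlib.h>

struct dev { int opened; };

static struct dev *get_dev(void)        /* bdget() + blkdev_get() stand-in */
{
        struct dev *d = calloc(1, sizeof(*d));
        if (d)
                d->opened = 1;
        return d;
}

static void put_dev(struct dev *d)      /* blkdev_put() stand-in */
{
        free(d);
}

struct slot { struct dev *dev; };       /* raid[device].disk[i] stand-in */

/* Probe: acquire fully first, publish the pointer only on success. */
static int probe(struct slot *s)
{
        struct dev *d = get_dev();
        if (!d)
                return -1;      /* the slot never saw a half-opened device */
        s->dev = d;
        return 0;
}

/* Exit: unpublish first, then release, so no one can use a dead pointer. */
static void release(struct slot *s)
{
        struct dev *d = s->dev;

        s->dev = NULL;
        if (d)
                put_dev(d);
}

int main(void)
{
        struct slot s = { NULL };

        if (probe(&s) == 0)
                release(&s);
        return 0;
}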
@@ -311,12 +311,12 @@ static void __init probedisk(int major, int minor,int device)
for (i=0;(i<prom->raid.total_disks)&&(i<8);i++) {
if ( (prom->raid.disk[i].channel== prom->raid.channel) &&
(prom->raid.disk[i].device == prom->raid.device) ) {
raid[device].disk[i].bdev = bdget(MKDEV(major,minor));
if (raid[device].disk[i].bdev != NULL) {
struct block_device *bdev = bdget(MKDEV(major,minor));
if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) {
struct gendisk *gd;
int j;
/* This is supposed to prevent others from stealing our underlying disks */
blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
raid[device].disk[i].bdev = bdev;
gd=get_gendisk(major);
if (gd!=NULL) {
for (j=1+(minor<<gd->minor_shift);j<((minor+1)<<gd->minor_shift);j++)
@@ -418,13 +418,12 @@ static void __exit pdcraid_exit (void)
{
int i,device;
for (device = 0; device<16; device++) {
for (i=0;i<8;i++)
if (raid[device].disk[i].bdev) {
blkdev_put(raid[device].disk[i].bdev, BDEV_RAW);
bdput(raid[device].disk[i].bdev);
raid[device].disk[i].bdev = NULL;
}
for (i=0;i<8;i++) {
struct block_device *bdev = raid[device].disk[i].bdev;
raid[device].disk[i].bdev = NULL;
if (bdev)
blkdev_put(bdev, BDEV_RAW);
}
if (raid[device].sectors)
ataraid_release_device(device);
}
......
@@ -649,11 +649,11 @@ static int lock_rdev (mdk_rdev_t *rdev)
static void unlock_rdev (mdk_rdev_t *rdev)
{
if (!rdev->bdev)
MD_BUG();
blkdev_put(rdev->bdev, BDEV_RAW);
bdput(rdev->bdev);
struct block_device *bdev = rdev->bdev;
rdev->bdev = NULL;
if (!bdev)
MD_BUG();
blkdev_put(bdev, BDEV_RAW);
}
void md_autodetect_dev (kdev_t dev);
......
@@ -404,7 +404,6 @@ static int get_inode(struct block_device *bdev)
if (!inode)
return -ENOMEM;
inode->i_rdev = to_kdev_t(bdev->bd_dev);
atomic_inc(&bdev->bd_count); /* will go away */
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
bdev->bd_inode = inode;
@@ -437,6 +436,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
}
}
@@ -522,17 +522,58 @@ struct block_device *bdget(dev_t dev)
void bdput(struct block_device *bdev)
{
if (atomic_dec_and_test(&bdev->bd_count)) {
if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
struct list_head *p;
if (bdev->bd_openers)
BUG();
if (bdev->bd_cache_openers)
BUG();
spin_lock(&bdev_lock);
list_del(&bdev->bd_hash);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
struct inode *inode;
inode = list_entry(p, struct inode, i_devices);
list_del_init(p);
inode->i_bdev = NULL;
}
spin_unlock(&bdev_lock);
destroy_bdev(bdev);
}
}
int bd_acquire(struct inode *inode)
{
struct block_device *bdev;
spin_lock(&bdev_lock);
if (inode->i_bdev) {
atomic_inc(&inode->i_bdev->bd_count);
spin_unlock(&bdev_lock);
return 0;
}
spin_unlock(&bdev_lock);
bdev = bdget(kdev_t_to_nr(inode->i_rdev));
if (!bdev)
return -ENOMEM;
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
inode->i_bdev = bdev;
list_add(&inode->i_devices, &bdev->bd_inodes);
} else if (inode->i_bdev != bdev)
BUG();
spin_unlock(&bdev_lock);
return 0;
}
/* Call when you free inode */
void bd_forget(struct inode *inode)
{
spin_lock(&bdev_lock);
if (inode->i_bdev) {
list_del_init(&inode->i_devices);
inode->i_bdev = NULL;
}
spin_unlock(&bdev_lock);
}
static struct {
const char *name;
@@ -706,13 +747,15 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
}
unlock_kernel();
up(&bdev->bd_sem);
if (ret)
bdput(bdev);
return ret;
}
int blkdev_open(struct inode * inode, struct file * filp)
{
int ret = -ENXIO;
struct block_device *bdev = inode->i_bdev;
int ret;
struct block_device *bdev;
/*
* Preserve backwards compatibility and allow large file access
@@ -722,13 +765,15 @@ int blkdev_open(struct inode * inode, struct file * filp)
*/
filp->f_flags |= O_LARGEFILE;
bd_acquire(inode);
bdev = inode->i_bdev;
down(&bdev->bd_sem);
if (get_inode(bdev)) {
up(&bdev->bd_sem);
return -ENOMEM;
}
ret = get_inode(bdev);
if (ret)
goto out;
ret = -ENXIO;
lock_kernel();
if (!bdev->bd_op)
bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev));
@@ -749,7 +794,10 @@ int blkdev_open(struct inode * inode, struct file * filp)
}
}
unlock_kernel();
out:
up(&bdev->bd_sem);
if (ret)
bdput(bdev);
return ret;
}
@@ -777,6 +825,7 @@ int blkdev_put(struct block_device *bdev, int kind)
}
unlock_kernel();
up(&bdev->bd_sem);
bdput(bdev);
return ret;
}
@@ -841,6 +890,7 @@ int blkdev_close(struct inode * inode, struct file * filp)
}
unlock_kernel();
up(&bdev->bd_sem);
bdput(bdev);
return ret;
}
......
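The bdput() change above swaps a bare atomic_dec_and_test() for atomic_dec_and_lock(&bd_count, &bdev_lock): the final reference is dropped with bdev_lock already held, so unhashing the device and clearing the i_bdev back-pointers cannot race with a concurrent bdget(). A user-space sketch of what atomic_dec_and_lock() provides (illustrative names, not the kernel implementation):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
        atomic_int count;
        /* hash linkage and the bd_inodes list would live here */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 with *lock held iff this call dropped the last reference,
 * mimicking atomic_dec_and_lock(): the lock is only touched when the
 * count is about to reach zero. */
static int dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
        int old = atomic_load(count);

        while (old > 1) {               /* fast path, lock-free */
                if (atomic_compare_exchange_weak(count, &old, old - 1))
                        return 0;
        }
        pthread_mutex_lock(lock);       /* slow path */
        if (atomic_fetch_sub(count, 1) == 1)
                return 1;               /* caller unlocks after teardown */
        pthread_mutex_unlock(lock);
        return 0;
}

static void put_obj(struct obj *o)
{
        if (dec_and_lock(&o->count, &table_lock)) {
                /* unhash, detach inodes, etc., all under table_lock */
                pthread_mutex_unlock(&table_lock);
                free(o);
        }
}

Note also that blkdev_get() and blkdev_open() now drop their reference themselves on failure, and blkdev_put()/blkdev_close() call bdput() on success, so callers no longer pair each put with a separate bdput().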
@@ -31,28 +31,65 @@
int use_coda_close;
static ssize_t
coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
coda_file_read(struct file *file, char *buf, size_t count, loff_t *ppos)
{
struct inode *inode = file->f_dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
struct file *cfile;
cfile = cii->c_container;
if (!cfile) BUG();
if (!cfile->f_op || !cfile->f_op->read)
return -EINVAL;
return cfile->f_op->read(cfile, buf, count, ppos);
}
static ssize_t
coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
{
struct inode *cinode, *inode = file->f_dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
ssize_t n;
struct file *cfile;
ssize_t ret;
int flags;
cfile = cii->c_container;
if (!cfile) BUG();
if (!cfile->f_op || cfile->f_op->write != generic_file_write)
BUG();
if (!cfile->f_op || !cfile->f_op->write)
return -EINVAL;
cinode = cfile->f_dentry->d_inode;
down(&cinode->i_sem);
down(&inode->i_sem);
flags = cfile->f_flags;
cfile->f_flags |= file->f_flags & (O_APPEND | O_SYNC);
n = generic_file_write(file, buf, count, ppos);
ret = cfile->f_op->write(cfile, buf, count, ppos);
cfile->f_flags = flags;
inode->i_size = cinode->i_size;
up(&inode->i_sem);
return ret;
}
static int
coda_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
struct file *cfile;
cfile = cii->c_container;
if (!cfile) BUG();
up(&cinode->i_sem);
if (!cfile->f_op || !cfile->f_op->mmap)
return -ENODEV;
return n;
return cfile->f_op->mmap(cfile, vma);
}
int coda_open(struct inode *i, struct file *f)
@@ -237,9 +274,9 @@ int coda_fsync(struct file *file, struct dentry *dentry, int datasync)
struct file_operations coda_file_operations = {
llseek: generic_file_llseek,
read: generic_file_read,
read: coda_file_read,
write: coda_file_write,
mmap: generic_file_mmap,
mmap: coda_file_mmap,
open: coda_open,
flush: coda_flush,
release: coda_release,
......
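The Coda change above replaces hardwired generic_file_read/write/mmap calls with delegation through the container file's own f_op, which is what makes arbitrary host filesystems work. The shape of that delegation, as a self-contained user-space sketch (all names illustrative):

#include <stddef.h>
#include <sys/types.h>

struct file;

struct file_operations {
        ssize_t (*read)(struct file *, char *, size_t);
};

struct file {
        const struct file_operations *f_op;
};

/* Forward to whatever the container file implements instead of calling
 * one hard-coded implementation; mirrors coda_file_read()'s
 * cfile->f_op->read(cfile, buf, count, ppos) above. */
static ssize_t coda_style_read(struct file *cfile, char *buf, size_t count)
{
        if (!cfile->f_op || !cfile->f_op->read)
                return -1;              /* -EINVAL in the kernel version */
        return cfile->f_op->read(cfile, buf, count);
}

The write path in the diff additionally copies O_APPEND/O_SYNC from the Coda file into the container file's f_flags around the call and propagates i_size back to the Coda inode.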
@@ -414,7 +414,7 @@ MODULE_AUTHOR("Peter J. Braam <braam@cs.cmu.edu>");
static int __init init_coda(void)
{
int status;
printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.14, coda@cs.cmu.edu\n");
printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.15, coda@cs.cmu.edu\n");
status = init_coda_psdev();
if ( status ) {
......
@@ -2291,9 +2291,16 @@ static int devfs_statfs (struct super_block *sb, struct statfs *buf)
return 0;
} /* End Function devfs_statfs */
static void devfs_clear_inode(struct inode *inode)
{
if (S_ISBLK(inode->i_mode))
bdput(inode->i_bdev);
}
static struct super_operations devfs_sops =
{
put_inode: force_delete,
clear_inode: devfs_clear_inode,
statfs: devfs_statfs,
};
@@ -2351,9 +2358,7 @@ static struct inode *get_vfs_inode (struct super_block *sb,
{
inode->i_rdev = MKDEV (de->u.fcb.u.device.major,
de->u.fcb.u.device.minor);
inode->i_bdev = bdget ( kdev_t_to_nr (inode->i_rdev) );
inode->i_mapping->a_ops = &def_blk_aops;
if (inode->i_bdev)
if (bd_acquire(inode) == 0)
{
if (!inode->i_bdev->bd_op && de->u.fcb.ops)
inode->i_bdev->bd_op = de->u.fcb.ops;
......
@@ -207,7 +207,6 @@ void init_special_inode(struct inode *inode, umode_t mode, int rdev)
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = to_kdev_t(rdev);
inode->i_bdev = bdget(rdev);
} else if (S_ISFIFO(mode))
inode->i_fop = &def_fifo_fops;
else if (S_ISSOCK(mode))
......
@@ -106,6 +106,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_dirty_buffers);
INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1);
sema_init(&inode->i_zombie, 1);
spin_lock_init(&inode->i_data.i_shared_lock);
@@ -516,11 +517,9 @@ void clear_inode(struct inode *inode)
DQUOT_DROP(inode);
if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode)
inode->i_sb->s_op->clear_inode(inode);
if (inode->i_bdev) {
bdput(inode->i_bdev);
inode->i_bdev = NULL;
}
if (inode->i_cdev) {
if (inode->i_bdev)
bd_forget(inode);
else if (inode->i_cdev) {
cdput(inode->i_cdev);
inode->i_cdev = NULL;
}
......
@@ -47,9 +47,10 @@ get_drive_geometry(int kdev,struct hd_geometry *geo)
{
struct block_device *bdev = bdget(kdev_t_to_nr(kdev));
int rc = blkdev_get(bdev, 0, 1, BDEV_FILE);
if ( rc == 0 )
if ( rc == 0 ) {
rc = ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo);
blkdev_put(bdev,BDEV_FILE);
blkdev_put(bdev, BDEV_FILE);
}
return rc;
}
@@ -58,9 +59,10 @@ get_drive_info(int kdev,dasd_information_t *info)
{
struct block_device *bdev = bdget(kdev_t_to_nr(kdev));
int rc = blkdev_get(bdev, 0, 1, BDEV_FILE);
if ( rc == 0 )
if ( rc == 0 ) {
rc = ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)(info));
blkdev_put(bdev,BDEV_FILE);
blkdev_put(bdev, BDEV_FILE);
}
return rc;
}
......
@@ -925,6 +925,7 @@ static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
error = -EACCES;
if (nd.mnt->mnt_flags & MNT_NODEV)
goto out;
bd_acquire(inode);
bdev = inode->i_bdev;
bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
if (bdops) bdev->bd_op = bdops;
@@ -982,8 +983,6 @@ static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
if (!fs_type->read_super(s, data, 0))
goto out_fail;
unlock_super(s);
/* tell bdcache that we are going to keep this one */
atomic_inc(&bdev->bd_count);
get_filesystem(fs_type);
path_release(&nd);
return s;
@@ -1128,10 +1127,9 @@ static void kill_super(struct super_block *sb)
sb->s_type = NULL;
unlock_super(sb);
unlock_kernel();
if (bdev) {
if (bdev)
blkdev_put(bdev, BDEV_FS);
bdput(bdev);
} else
else
put_unnamed_dev(dev);
spin_lock(&sb_lock);
list_del(&sb->s_list);
@@ -1718,6 +1716,7 @@ void __init mount_root(void)
if (!ROOT_DEV)
panic("I have no root and I want to scream");
retry:
bdev = bdget(kdev_t_to_nr(ROOT_DEV));
if (!bdev)
panic(__FUNCTION__ ": unable to allocate root device");
@@ -1729,7 +1728,7 @@ void __init mount_root(void)
retval = blkdev_get(bdev, mode, 0, BDEV_FS);
if (retval == -EROFS) {
root_mountflags |= MS_RDONLY;
retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
goto retry;
}
if (retval) {
/*
@@ -1977,6 +1976,7 @@ int __init change_root(kdev_t new_root_dev,const char *put_old)
int blivet;
struct block_device *ramdisk = old_rootmnt->mnt_sb->s_bdev;
atomic_inc(&ramdisk->bd_count);
blivet = blkdev_get(ramdisk, FMODE_READ, 0, BDEV_FS);
printk(KERN_NOTICE "Trying to unmount old root ... ");
if (!blivet) {
......
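blkdev_get() dropping its reference on failure (see the fs/block_dev.c hunks above) is what makes the mount_root() retry correct: after -EROFS the old reference is gone, so the code jumps back and runs bdget() again instead of reusing bdev. A toy model of this "reference consumed on failure" convention, illustrative names only:

#include <stdlib.h>

struct ref { int unused; };

static struct ref *ref_get(void)
{
        return calloc(1, sizeof(struct ref));
}

static void ref_put(struct ref *r)
{
        free(r);
}

/* Fails the first, read-write attempt; consumes the reference on
 * failure, like the new blkdev_get(). */
static int try_open(struct ref *r, int readonly)
{
        if (!readonly) {
                ref_put(r);             /* callee drops the ref on error */
                return -1;              /* stands in for -EROFS */
        }
        return 0;
}

int main(void)
{
        int readonly = 0;
        struct ref *r;
retry:
        r = ref_get();
        if (!r)
                return 1;
        if (try_open(r, readonly) != 0) {
                readonly = 1;           /* root_mountflags |= MS_RDONLY */
                goto retry;             /* must take a fresh reference */
        }
        ref_put(r);     /* demo cleanup; the kernel keeps the root open */
        return 0;
}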
@@ -415,6 +415,7 @@ struct block_device {
int bd_cache_openers;
const struct block_device_operations *bd_op;
struct semaphore bd_sem; /* open/close mutex */
struct list_head bd_inodes;
};
struct inode {
@@ -452,6 +453,7 @@ struct inode {
int i_mapping_overload;
struct dquot *i_dquot[MAXQUOTAS];
/* These three should probably be a union */
struct list_head i_devices;
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct char_device *i_cdev;
@@ -1046,6 +1048,8 @@ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
extern int unregister_blkdev(unsigned int, const char *);
extern struct block_device *bdget(dev_t);
extern int bd_acquire(struct inode *inode);
extern void bd_forget(struct inode *inode);
extern void bdput(struct block_device *);
extern struct char_device *cdget(dev_t);
extern void cdput(struct char_device *);
......
@@ -274,7 +274,6 @@ typedef struct page {
#define PG_active 6
#define PG_inactive 7
#define PG_slab 8
#define PG_swap_cache 9
#define PG_skip 10
#define PG_highmem 11
#define PG_checked 12 /* kill me in 2.5.<early>. */
@@ -326,18 +325,9 @@ static inline void set_page_dirty(struct page * page)
#define SetPageDecrAfter(page) set_bit(PG_decr_after, &(page)->flags)
#define PageTestandClearDecrAfter(page) test_and_clear_bit(PG_decr_after, &(page)->flags)
#define PageSlab(page) test_bit(PG_slab, &(page)->flags)
#define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags)
#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags)
#define PageSetSwapCache(page) set_bit(PG_swap_cache, &(page)->flags)
#define PageTestandSetSwapCache(page) test_and_set_bit(PG_swap_cache, &(page)->flags)
#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
#define PageClearSwapCache(page) clear_bit(PG_swap_cache, &(page)->flags)
#define PageTestandClearSwapCache(page) test_and_clear_bit(PG_swap_cache, &(page)->flags)
#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
#define PageActive(page) test_bit(PG_active, &(page)->flags)
#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
@@ -465,6 +455,9 @@ extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void swapin_readahead(swp_entry_t);
extern struct address_space swapper_space;
#define PageSwapCache(page) ((page)->mapping == &swapper_space)
static inline int is_page_cache_freeable(struct page * page)
{
return page_count(page) - !!page->buffers == 1;
@@ -476,15 +469,13 @@ static inline int is_page_cache_freeable(struct page * page)
*/
static inline int exclusive_swap_page(struct page *page)
{
unsigned int count;
if (!PageLocked(page))
BUG();
if (!PageSwapCache(page))
return 0;
count = page_count(page) - !!page->buffers; /* 2: us + swap cache */
count += swap_count(page); /* +1: just swap cache */
return count == 3; /* =3: total */
if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */
return 0;
return swap_count(page) == 1; /* 1: just cache */
}
extern void __free_pte(pte_t);
@@ -565,11 +556,10 @@ extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO \
| __GFP_FS | __GFP_HIGHMEM)
#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM)
#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_KSWAPD ( __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
......
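PG_swap_cache and its eight flag macros go away because swap-cache membership is now implied by page->mapping pointing at swapper_space: one source of truth instead of a flag bit that must be kept in sync. The shape of the new test, as a standalone sketch with stand-in types:

#include <stdio.h>

struct address_space { int unused; };
static struct address_space swapper_space;      /* the swap cache's mapping */

struct page { struct address_space *mapping; };

/* One source of truth: no separate bit to set, clear, or leave stale. */
static int page_swap_cache(const struct page *page)
{
        return page->mapping == &swapper_space;
}

int main(void)
{
        struct page p = { &swapper_space };
        printf("%d\n", page_swap_cache(&p));    /* prints 1 */
        return 0;
}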
@@ -84,7 +84,6 @@ extern unsigned int nr_free_buffer_pages(void);
extern int nr_active_pages;
extern int nr_inactive_pages;
extern atomic_t nr_async_pages;
extern struct address_space swapper_space;
extern atomic_t page_cache_size;
extern atomic_t buffermem_pages;
extern spinlock_t pagecache_lock;
@@ -122,35 +121,27 @@ extern void rw_swap_page_nolock(int, swp_entry_t, char *);
/* linux/mm/swap_state.c */
extern void show_swap_cache_info(void);
extern void add_to_swap_cache(struct page *, swp_entry_t);
extern int swap_check_entry(unsigned long);
extern void __delete_from_swap_cache(struct page *page);
extern void delete_from_swap_cache(struct page *page);
extern void free_page_and_swap_cache(struct page *page);
extern struct page * lookup_swap_cache(swp_entry_t);
extern struct page * read_swap_cache_async(swp_entry_t);
/* linux/mm/oom_kill.c */
extern void oom_kill(void);
/*
* Make these inline later once they are working properly.
*/
extern void __delete_from_swap_cache(struct page *page);
extern void delete_from_swap_cache(struct page *page);
extern void delete_from_swap_cache_nolock(struct page *page);
extern void free_page_and_swap_cache(struct page *page);
/* linux/mm/swapfile.c */
extern unsigned int nr_swapfiles;
extern struct swap_info_struct swap_info[];
extern int is_swap_partition(kdev_t);
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t __get_swap_page(unsigned short);
extern swp_entry_t get_swap_page(void);
extern void get_swaphandle_info(swp_entry_t, unsigned long *, kdev_t *,
struct inode **);
extern int swap_duplicate(swp_entry_t);
extern int swap_count(struct page *);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
#define get_swap_page() __get_swap_page(1)
extern void __swap_free(swp_entry_t, unsigned short);
#define swap_free(entry) __swap_free((entry), 1)
extern void swap_free(swp_entry_t);
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
......
@@ -1704,6 +1704,7 @@ struct page * filemap_nopage(struct vm_area_struct * area,
* and possibly copy it over to another page..
*/
old_page = page;
mark_page_accessed(page);
if (no_share) {
struct page *new_page = alloc_page(GFP_HIGHUSER);
@@ -2553,7 +2554,6 @@ struct page *__read_cache_page(struct address_space *mapping,
}
if (cached_page)
page_cache_release(cached_page);
mark_page_accessed(page);
return page;
}
@@ -2571,7 +2571,10 @@ struct page *read_cache_page(struct address_space *mapping,
retry:
page = __read_cache_page(mapping, index, filler, data);
if (IS_ERR(page) || Page_Uptodate(page))
if (IS_ERR(page))
goto out;
mark_page_accessed(page);
if (Page_Uptodate(page))
goto out;
lock_page(page);
@@ -2835,6 +2838,7 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
unlock:
kunmap(page);
/* Mark it unlocked again and drop the page.. */
SetPageReferenced(page);
UnlockPage(page);
page_cache_release(page);
......
@@ -85,8 +85,6 @@ void __free_pte(pte_t pte)
if (page->mapping) {
if (pte_dirty(pte))
set_page_dirty(page);
if (pte_young(pte))
mark_page_accessed(page);
}
free_page_and_swap_cache(page);
@@ -939,10 +937,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
break;
/* Recheck swapcachedness once the page is locked */
can_reuse = exclusive_swap_page(old_page);
#if 1
if (can_reuse)
delete_from_swap_cache_nolock(old_page);
#endif
delete_from_swap_cache(old_page);
UnlockPage(old_page);
if (!can_reuse)
break;
@@ -1088,23 +1084,19 @@ void swapin_readahead(swp_entry_t entry)
unsigned long offset;
/*
* Get the number of handles we should do readahead io to. Also,
* grab temporary references on them, releasing them as io completes.
* Get the number of handles we should do readahead io to.
*/
num = valid_swaphandles(entry, &offset);
for (i = 0; i < num; offset++, i++) {
/* Don't block on I/O for read-ahead */
if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
* (1 << page_cluster)) {
while (i++ < num)
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
if (atomic_read(&nr_async_pages) >=
pager_daemon.swap_cluster << page_cluster)
break;
}
/* Ok, do the async read-ahead now */
new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
if (new_page != NULL)
page_cache_release(new_page);
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
if (!new_page)
break;
page_cache_release(new_page);
}
return;
}
@@ -1164,11 +1156,12 @@ static int do_swap_page(struct mm_struct * mm,
pte = mk_pte(page, vma->vm_page_prot);
swap_free(entry);
mark_page_accessed(page);
if (exclusive_swap_page(page)) {
if (vma->vm_flags & VM_WRITE)
pte = pte_mkwrite(pte);
pte = pte_mkdirty(pte);
delete_from_swap_cache_nolock(page);
delete_from_swap_cache(page);
}
UnlockPage(page);
......
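The swapin_readahead() rework above stops taking temporary swap references for the whole window (valid_swaphandles() no longer bumps swap_map counts, see mm/swapfile.c below), so the early exits need no compensating swap_free() loop. Roughly, in user-space form with made-up limits:

#include <stdio.h>

#define MAX_ASYNC 32

static int nr_async;                    /* nr_async_pages stand-in */
static int alloc_budget = 5;            /* makes the 6th allocation fail */

static int start_async_read(unsigned long slot)
{
        if (alloc_budget-- <= 0)
                return -1;              /* alloc_page() failed */
        nr_async++;
        printf("reading swap slot %lu\n", slot);
        return 0;
}

static void readahead(unsigned long start, int num)
{
        unsigned long slot = start;
        int i;

        for (i = 0; i < num; slot++, i++) {
                if (nr_async >= MAX_ASYNC)
                        break;          /* don't block on readahead I/O */
                if (start_async_read(slot) != 0)
                        break;          /* nothing was reserved up front,
                                         * so there is nothing to undo */
        }
}

int main(void)
{
        readahead(1024, 8);             /* prints slots 1024..1028 */
        return 0;
}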
@@ -234,45 +234,55 @@ static int shmem_writepage(struct page * page)
int error;
struct shmem_inode_info *info;
swp_entry_t *entry, swap;
struct address_space *mapping;
unsigned long index;
struct inode *inode;
if (!PageLocked(page))
BUG();
inode = page->mapping->host;
mapping = page->mapping;
index = page->index;
inode = mapping->host;
info = &inode->u.shmem_i;
swap = __get_swap_page(2);
error = -ENOMEM;
if (!swap.val) {
activate_page(page);
SetPageDirty(page);
goto out;
}
spin_lock(&info->lock);
entry = shmem_swp_entry(info, page->index);
if (IS_ERR(entry)) /* this had been allocted on page allocation */
entry = shmem_swp_entry(info, index);
if (IS_ERR(entry)) /* this had been allocated on page allocation */
BUG();
shmem_recalc_inode(page->mapping->host);
error = -EAGAIN;
shmem_recalc_inode(inode);
if (entry->val)
BUG();
*entry = swap;
error = 0;
/* Remove the from the page cache */
/* Remove it from the page cache */
lru_cache_del(page);
remove_inode_page(page);
swap_list_lock();
swap = get_swap_page();
if (!swap.val) {
swap_list_unlock();
/* Add it back to the page cache */
add_to_page_cache_locked(page, mapping, index);
activate_page(page);
SetPageDirty(page);
error = -ENOMEM;
goto out;
}
/* Add it to the swap cache */
add_to_swap_cache(page, swap);
page_cache_release(page);
info->swapped++;
swap_list_unlock();
spin_unlock(&info->lock);
set_page_dirty(page);
info->swapped++;
*entry = swap;
error = 0;
out:
spin_unlock(&info->lock);
UnlockPage(page);
page_cache_release(page);
return error;
}
@@ -356,7 +366,7 @@ static struct page * shmem_getpage_locked(struct inode * inode, unsigned long id
swap_free(*entry);
*entry = (swp_entry_t) {0};
delete_from_swap_cache_nolock(page);
delete_from_swap_cache(page);
flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1);
page->flags = flags | (1 << PG_dirty);
add_to_page_cache_locked(page, mapping, idx);
......
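shmem_writepage() now calls get_swap_page() under swap_list_lock, holds it across add_to_swap_cache(), and on failure re-inserts the page into the page cache, so the page is always findable in exactly one of the two caches. The visibility argument in miniature (user-space, illustrative):

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int page_cache[16], swap_cache[16];

/* Move slot i from one table to the other in a single critical section,
 * so any reader that also takes list_lock always finds the value in
 * exactly one of them; on failure nothing moves at all. */
static int move_to_swap(int i, int have_swap_slot)
{
        int ret = 0;

        pthread_mutex_lock(&list_lock);
        if (!have_swap_slot) {          /* get_swap_page() failed */
                ret = -1;               /* page stays in page_cache */
        } else {
                swap_cache[i] = page_cache[i];
                page_cache[i] = 0;
        }
        pthread_mutex_unlock(&list_lock);
        return ret;
}

int main(void)
{
        page_cache[3] = 42;
        return move_to_swap(3, 1);      /* returns 0; slot moved */
}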
@@ -54,7 +54,6 @@ void deactivate_page_nolock(struct page * page)
del_page_from_active_list(page);
add_page_to_inactive_list(page);
}
ClearPageReferenced(page);
}
void deactivate_page(struct page * page)
@@ -73,7 +72,6 @@ void activate_page_nolock(struct page * page)
del_page_from_inactive_list(page);
add_page_to_active_list(page);
}
SetPageReferenced(page);
}
void activate_page(struct page * page)
......
@@ -23,17 +23,11 @@
*/
static int swap_writepage(struct page *page)
{
/* One for the page cache, one for this user, one for page->buffers */
if (page_count(page) > 2 + !!page->buffers)
goto in_use;
if (swap_count(page) > 1)
goto in_use;
delete_from_swap_cache_nolock(page);
UnlockPage(page);
return 0;
in_use:
if (exclusive_swap_page(page)) {
delete_from_swap_cache(page);
UnlockPage(page);
return 0;
}
rw_swap_page(WRITE, page);
return 0;
}
@@ -75,8 +69,6 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry)
#endif
if (!PageLocked(page))
BUG();
if (PageTestandSetSwapCache(page))
BUG();
if (page->mapping)
BUG();
@@ -92,51 +84,42 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry)
*/
void __delete_from_swap_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
swp_entry_t entry;
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
#endif
if (mapping != &swapper_space)
if (!PageLocked(page))
BUG();
if (!PageSwapCache(page) || !PageLocked(page))
if (!PageSwapCache(page))
BUG();
entry.val = page->index;
PageClearSwapCache(page);
ClearPageDirty(page);
__remove_inode_page(page);
swap_free(entry);
}
/*
* This will never put the page into the free list, the caller has
* a reference on the page.
* This must be called only on pages that have
* been verified to be in the swap cache and locked.
* It will never put the page into the free list,
* the caller has a reference on the page.
*/
void delete_from_swap_cache_nolock(struct page *page)
void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
if (!PageLocked(page))
BUG();
if (block_flushpage(page, 0))
lru_cache_del(page);
entry.val = page->index;
spin_lock(&pagecache_lock);
__delete_from_swap_cache(page);
spin_unlock(&pagecache_lock);
page_cache_release(page);
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache and locked.
*/
void delete_from_swap_cache(struct page *page)
{
lock_page(page);
delete_from_swap_cache_nolock(page);
UnlockPage(page);
swap_free(entry);
page_cache_release(page);
}
/*
@@ -156,7 +139,7 @@ void free_page_and_swap_cache(struct page *page)
*/
if (PageSwapCache(page) && !TryLockPage(page)) {
if (exclusive_swap_page(page))
delete_from_swap_cache_nolock(page);
delete_from_swap_cache(page);
UnlockPage(page);
}
page_cache_release(page);
@@ -213,19 +196,24 @@ struct page * read_swap_cache_async(swp_entry_t entry)
new_page = alloc_page(GFP_HIGHUSER);
if (!new_page)
goto out; /* Out of memory */
if (TryLockPage(new_page))
BUG();
/*
* Check the swap cache again, in case we stalled above.
* The BKL is guarding against races between this check
* swap_list_lock is guarding against races between this check
* and where the new page is added to the swap cache below.
* It is also guarding against race where try_to_swap_out
* allocates entry with get_swap_page then adds to cache.
*/
swap_list_lock();
found_page = __find_get_page(&swapper_space, entry.val, hash);
if (found_page)
goto out_free_page;
/*
* Make sure the swap entry is still in use. It could have gone
* while caller waited for BKL, or while allocating page above,
* since caller dropped page_table_lock, while allocating page above,
* or while allocating page in prior call via swapin_readahead.
*/
if (!swap_duplicate(entry)) /* Account for the swap cache */
@@ -234,13 +222,15 @@ struct page * read_swap_cache_async(swp_entry_t entry)
/*
* Add it to the swap cache and read its contents.
*/
if (TryLockPage(new_page))
BUG();
add_to_swap_cache(new_page, entry);
swap_list_unlock();
rw_swap_page(READ, new_page);
return new_page;
out_free_page:
swap_list_unlock();
UnlockPage(new_page);
page_cache_release(new_page);
out:
return found_page;
......
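The read_swap_cache_async() race fix is the classic get-or-create pattern: allocate first (allocation may block), then re-check for a concurrent insertion and insert only if still absent, all under the lock the writers use (swap_list_lock here, replacing reliance on the BKL). Self-contained toy version with a one-entry cache, illustrative names:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

struct page { long key; };
static struct page *cached;             /* toy one-entry cache */

static struct page *get_or_create(long key)
{
        struct page *newp = malloc(sizeof(*newp));      /* may block */

        if (!newp)
                return NULL;
        pthread_mutex_lock(&cache_lock);
        if (cached && cached->key == key) {
                struct page *found = cached;    /* lost the race: reuse */
                pthread_mutex_unlock(&cache_lock);
                free(newp);
                return found;
        }
        newp->key = key;
        cached = newp;                  /* insert while still locked */
        pthread_mutex_unlock(&cache_lock);
        return newp;
}

int main(void)
{
        struct page *p = get_or_create(7);
        return (p && get_or_create(7) == p) ? 0 : 1;
}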
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/shm.h>
#include <linux/compiler.h>
#include <asm/pgtable.h>
@@ -33,7 +34,7 @@ struct swap_info_struct swap_info[MAX_SWAPFILES];
#define SWAPFILE_CLUSTER 256
static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
static inline int scan_swap_map(struct swap_info_struct *si)
{
unsigned long offset;
/*
@@ -86,7 +87,8 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun
si->lowest_bit = si->max;
si->highest_bit = 0;
}
si->swap_map[offset] = count;
/* Initial count 1 for user reference + 1 for swap cache */
si->swap_map[offset] = 2;
nr_swap_pages--;
si->cluster_next = offset+1;
return offset;
@@ -96,7 +98,12 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun
return 0;
}
swp_entry_t __get_swap_page(unsigned short count)
/*
* Callers of get_swap_page must hold swap_list_lock across the call,
* and across the following add_to_swap_cache, to guard against races
* with read_swap_cache_async.
*/
swp_entry_t get_swap_page(void)
{
struct swap_info_struct * p;
unsigned long offset;
@@ -104,20 +111,17 @@ swp_entry_t __get_swap_page(unsigned short count)
int type, wrapped = 0;
entry.val = 0; /* Out of memory */
if (count >= SWAP_MAP_MAX)
goto bad_count;
swap_list_lock();
type = swap_list.next;
if (type < 0)
goto out;
if (nr_swap_pages == 0)
if (nr_swap_pages <= 0)
goto out;
while (1) {
p = &swap_info[type];
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
swap_device_lock(p);
offset = scan_swap_map(p, count);
offset = scan_swap_map(p);
swap_device_unlock(p);
if (offset) {
entry = SWP_ENTRY(type,offset);
@@ -142,21 +146,14 @@ swp_entry_t __get_swap_page(unsigned short count)
goto out; /* out of swap space */
}
out:
swap_list_unlock();
return entry;
bad_count:
printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
count, __builtin_return_address(0));
return entry;
}
/*
* Caller has made sure that the swapdevice corresponding to entry
* is still around or has not been recycled.
*/
void __swap_free(swp_entry_t entry, unsigned short count)
void swap_free(swp_entry_t entry)
{
struct swap_info_struct * p;
unsigned long offset, type;
@@ -180,9 +177,7 @@ void __swap_free(swp_entry_t entry, unsigned short count)
swap_list.next = type;
swap_device_lock(p);
if (p->swap_map[offset] < SWAP_MAP_MAX) {
if (p->swap_map[offset] < count)
goto bad_count;
if (!(p->swap_map[offset] -= count)) {
if (!--(p->swap_map[offset])) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -207,11 +202,6 @@ void __swap_free(swp_entry_t entry, unsigned short count)
bad_free:
printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
goto out;
bad_count:
swap_device_unlock(p);
swap_list_unlock();
printk(KERN_ERR "swap_free: Bad count %hd current count %hd\n", count, p->swap_map[offset]);
goto out;
}
/*
......@@ -229,9 +219,9 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
{
pte_t pte = *dir;
if (pte_to_swp_entry(pte).val != entry.val)
if (likely(pte_to_swp_entry(pte).val != entry.val))
return;
if (pte_none(pte) || pte_present(pte))
if (unlikely(pte_none(pte) || pte_present(pte)))
return;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -458,7 +448,7 @@ static int try_to_unuse(unsigned int type)
*/
lock_page(page);
if (PageSwapCache(page))
delete_from_swap_cache_nolock(page);
delete_from_swap_cache(page);
SetPageDirty(page);
UnlockPage(page);
flush_page_to_ram(page);
@@ -567,14 +557,8 @@ asmlinkage long sys_swapoff(const char * specialfile)
for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
p = swap_info + type;
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
if (p->swap_file) {
if (p->swap_file == nd.dentry)
break;
} else {
if (S_ISBLK(nd.dentry->d_inode->i_mode)
&& (p->swap_device == nd.dentry->d_inode->i_rdev))
break;
}
if (p->swap_file == nd.dentry)
break;
}
prev = type;
}
@@ -616,19 +600,21 @@ asmlinkage long sys_swapoff(const char * specialfile)
goto out_dput;
}
if (p->swap_device)
blkdev_put(nd.dentry->d_inode->i_bdev, BDEV_SWAP);
blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
path_release(&nd);
swap_list_lock();
nd.dentry = p->swap_file;
p->swap_file = NULL;
swap_device_lock(p);
nd.mnt = p->swap_vfsmnt;
nd.dentry = p->swap_file;
p->swap_vfsmnt = NULL;
p->swap_file = NULL;
p->swap_device = 0;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
swap_device_unlock(p);
swap_list_unlock();
vfree(swap_map);
err = 0;
@@ -711,6 +697,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
unsigned long maxpages = 1;
int swapfilesize;
struct block_device *bdev = NULL;
unsigned short *swap_map;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -760,6 +747,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
p->swap_device = dev;
set_blocksize(dev, PAGE_SIZE);
bd_acquire(swap_inode);
bdev = swap_inode->i_bdev;
bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
if (bdops) bdev->bd_op = bdops;
@@ -772,29 +760,24 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (!dev || (blk_size[MAJOR(dev)] &&
!blk_size[MAJOR(dev)][MINOR(dev)]))
goto bad_swap;
error = -EBUSY;
for (i = 0 ; i < nr_swapfiles ; i++) {
if (i == type)
continue;
if (dev == swap_info[i].swap_device)
goto bad_swap;
}
swapfilesize = 0;
if (blk_size[MAJOR(dev)])
swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
>> (PAGE_SHIFT - 10);
} else if (S_ISREG(swap_inode->i_mode)) {
error = -EBUSY;
for (i = 0 ; i < nr_swapfiles ; i++) {
if (i == type || !swap_info[i].swap_file)
continue;
if (swap_inode == swap_info[i].swap_file->d_inode)
goto bad_swap;
}
} else if (S_ISREG(swap_inode->i_mode))
swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
} else
else
goto bad_swap;
error = -EBUSY;
for (i = 0 ; i < nr_swapfiles ; i++) {
struct swap_info_struct *q = &swap_info[i];
if (i == type || !q->swap_file)
continue;
if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
goto bad_swap;
}
swap_header = (void *) __get_free_page(GFP_USER);
if (!swap_header) {
printk("Unable to start swapping: out of memory :-)\n");
@@ -900,6 +883,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
}
p->swap_map[0] = SWAP_MAP_BAD;
swap_list_lock();
swap_device_lock(p);
p->max = maxpages;
p->flags = SWP_WRITEOK;
p->pages = nr_good_pages;
@@ -922,6 +906,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
} else {
swap_info[prev].next = p - swap_info;
}
swap_device_unlock(p);
swap_list_unlock();
error = 0;
goto out;
@@ -929,11 +914,10 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (bdev)
blkdev_put(bdev, BDEV_SWAP);
bad_swap_2:
if (p->swap_map)
vfree(p->swap_map);
swap_list_lock();
swap_map = p->swap_map;
nd.mnt = p->swap_vfsmnt;
nd.dentry = p->swap_file;
swap_list_lock();
p->swap_device = 0;
p->swap_file = NULL;
p->swap_vfsmnt = NULL;
......@@ -942,6 +926,8 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (!(swap_flags & SWAP_FLAG_PREFER))
++least_priority;
swap_list_unlock();
if (swap_map)
vfree(swap_map);
path_release(&nd);
out:
if (swap_header)
@@ -987,43 +973,31 @@ int swap_duplicate(swp_entry_t entry)
unsigned long offset, type;
int result = 0;
/* Swap entry 0 is illegal */
if (!entry.val)
goto out;
type = SWP_TYPE(entry);
if (type >= nr_swapfiles)
goto bad_file;
p = type + swap_info;
offset = SWP_OFFSET(entry);
if (offset >= p->max)
goto bad_offset;
if (!p->swap_map[offset])
goto bad_unused;
/*
* Entry is valid, so increment the map count.
*/
swap_device_lock(p);
if (p->swap_map[offset] < SWAP_MAP_MAX)
p->swap_map[offset]++;
else {
if (swap_overflow++ < 5)
printk(KERN_WARNING "swap_dup: swap entry overflow\n");
p->swap_map[offset] = SWAP_MAP_MAX;
if (offset < p->max && p->swap_map[offset]) {
if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
p->swap_map[offset]++;
result = 1;
} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
if (swap_overflow++ < 5)
printk(KERN_WARNING "swap_dup: swap entry overflow\n");
p->swap_map[offset] = SWAP_MAP_MAX;
result = 1;
}
}
swap_device_unlock(p);
result = 1;
out:
return result;
bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
bad_offset:
/* Don't report: can happen in read_swap_cache_async after swapoff */
goto out;
bad_unused:
/* Don't report: can happen in read_swap_cache_async after blocking */
goto out;
}
/*
@@ -1068,7 +1042,7 @@ int swap_count(struct page *page)
}
/*
* Kernel_lock protects against swap device deletion.
* Prior swap_duplicate protects against swap device deletion.
*/
void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
kdev_t *dev, struct inode **swapf)
@@ -1108,8 +1082,8 @@ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
}
/*
* Kernel_lock protects against swap device deletion. Grab an extra
* reference on the swaphandle so that it dos not become unused.
* swap_device_lock prevents swap_map being freed. Don't grab an extra
* reference on the swaphandle, it doesn't matter if it becomes unused.
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
@@ -1117,20 +1091,23 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
unsigned long toff;
struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
*offset = SWP_OFFSET(entry);
toff = *offset = (*offset >> page_cluster) << page_cluster;
if (!page_cluster) /* no readahead */
return 0;
toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
if (!toff) /* first page is swap header */
toff++, i--;
*offset = toff;
swap_device_lock(swapdev);
do {
/* Don't read-ahead past the end of the swap area */
if (toff >= swapdev->max)
break;
/* Don't read in bad or busy pages */
/* Don't read in free or bad pages */
if (!swapdev->swap_map[toff])
break;
if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
break;
swapdev->swap_map[toff]++;
toff++;
ret++;
} while (--i);
......
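For reference, the readahead window computed by valid_swaphandles() is the 2^page_cluster-slot cluster containing the faulting offset, aligned downward, with slot 0 skipped because it holds the swap header. The arithmetic as a runnable standalone check:

#include <stdio.h>

int main(void)
{
        unsigned long offset = 1027;    /* faulting swap offset */
        int page_cluster = 3;           /* window of 1 << 3 = 8 slots */
        int num = 1 << page_cluster;
        unsigned long toff;

        toff = (offset >> page_cluster) << page_cluster;        /* 1024 */
        if (!toff) {                    /* slot 0 is the swap header */
                toff++;
                num--;
        }
        printf("read ahead slots %lu..%lu\n", toff, toff + num - 1);
        return 0;
}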
@@ -52,14 +52,9 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct*
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
flush_tlb_page(vma, address);
mark_page_accessed(page);
return 0;
}
/* Don't bother with it if the page is otherwise active */
if (PageActive(page))
return 0;
if (TryLockPage(page))
return 0;
@@ -85,8 +80,8 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct*
entry.val = page->index;
if (pte_dirty(pte))
set_page_dirty(page);
set_swap_pte:
swap_duplicate(entry);
set_swap_pte:
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
@@ -130,16 +125,18 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct*
* we have the swap cache set up to associate the
* page with that swap entry.
*/
swap_list_lock();
entry = get_swap_page();
if (!entry.val)
goto out_unlock_restore; /* No swap space left */
/* Add it to the swap cache and mark it dirty */
add_to_swap_cache(page, entry);
set_page_dirty(page);
goto set_swap_pte;
if (entry.val) {
/* Add it to the swap cache and mark it dirty */
add_to_swap_cache(page, entry);
swap_list_unlock();
set_page_dirty(page);
goto set_swap_pte;
}
out_unlock_restore:
/* No swap space left */
swap_list_unlock();
set_pte(page_table, pte);
UnlockPage(page);
return 0;
@@ -243,9 +240,9 @@ static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vm
struct mm_struct *swap_mm = &init_mm;
/*
* Returns non-zero if we scanned all `count' pages
* Returns remaining count of pages to be swapped out by followup call.
*/
static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone)
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -255,11 +252,12 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone
* and ptes.
*/
spin_lock(&mm->page_table_lock);
*race = 1;
if (swap_mm != mm)
goto out_unlock;
*race = 0;
address = mm->swap_address;
if (address == TASK_SIZE || swap_mm != mm) {
/* We raced: don't count this mm but try again */
++*mmcounter;
goto out_unlock;
}
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
@@ -267,31 +265,26 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone
for (;;) {
count = swap_out_vma(mm, vma, address, count, classzone);
if (!count)
goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
if (!count)
goto out_unlock;
address = vma->vm_start;
}
}
/* Reset to 0 when we reach the end of address space */
mm->swap_address = 0;
spin_lock(&mmlist_lock);
swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
spin_unlock(&mmlist_lock);
/* Indicate that we reached the end of address space */
mm->swap_address = TASK_SIZE;
out_unlock:
spin_unlock(&mm->page_table_lock);
return count;
}
static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
int counter, race;
int counter;
struct mm_struct *mm;
/* Then, look at the other mm's */
@@ -304,9 +297,10 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_
spin_lock(&mmlist_lock);
mm = swap_mm;
if (mm == &init_mm) {
while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
mm->swap_address = 0;
mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
if (mm == &init_mm)
if (mm == swap_mm)
goto empty;
swap_mm = mm;
}
@@ -315,13 +309,13 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_
atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
nr_pages = swap_out_mm(mm, nr_pages, &race, classzone);
nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
mmput(mm);
if (!nr_pages)
return 1;
} while (race || --counter >= 0);
} while (--counter >= 0);
return 0;
@@ -330,15 +324,15 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_
return 0;
}
static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask));
static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)
static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
{
struct list_head * entry;
int __max_scan = *max_scan;
spin_lock(&pagemap_lru_lock);
while (__max_scan && this_max_scan && (entry = lru->prev) != lru) {
while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
struct page * page;
swp_entry_t swap;
if (unlikely(current->need_resched)) {
spin_unlock(&pagemap_lru_lock);
......@@ -353,18 +347,16 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
if (unlikely(!PageInactive(page) && !PageActive(page)))
BUG();
this_max_scan--;
list_del(entry);
list_add(entry, lru);
list_add(entry, &inactive_list);
if (PageTestandClearReferenced(page))
continue;
max_scan--;
if (unlikely(!memclass(page->zone, classzone)))
continue;
__max_scan--;
/* Racy check to avoid trylocking when not worthwhile */
if (!page->buffers && page_count(page) != 1)
continue;
@@ -479,14 +471,24 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
}
/* point of no return */
if (likely(!PageSwapCache(page)))
if (likely(!PageSwapCache(page))) {
swap.val = 0;
__remove_inode_page(page);
else
} else {
swap.val = page->index;
__delete_from_swap_cache(page);
}
spin_unlock(&pagecache_lock);
__lru_cache_del(page);
if (unlikely(swap.val != 0)) {
/* must drop lru lock if getting swap_list lock */
spin_unlock(&pagemap_lru_lock);
swap_free(swap);
spin_lock(&pagemap_lru_lock);
}
UnlockPage(page);
/* effectively free the page here */
@@ -498,7 +500,6 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
}
spin_unlock(&pagemap_lru_lock);
*max_scan = __max_scan;
return nr_pages;
}
@@ -509,14 +510,10 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
* We move them the other way when we see the
* reference bit on the page.
*/
static void balance_inactive(int nr_pages)
static void refill_inactive(int nr_pages)
{
struct list_head * entry;
/* If we have more inactive pages than active don't do anything */
if (nr_active_pages < nr_inactive_pages)
return;
spin_lock(&pagemap_lru_lock);
entry = active_list.prev;
while (nr_pages-- && entry != &active_list) {
@@ -541,14 +538,17 @@ static void balance_inactive(int nr_pages)
static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
int max_scan = (nr_inactive_pages + nr_active_pages / DEF_PRIORITY) / priority;
int max_scan = nr_inactive_pages / priority;
nr_pages -= kmem_cache_reap(gfp_mask);
if (nr_pages <= 0)
return 0;
balance_inactive(nr_pages);
nr_pages = shrink_cache(&inactive_list, &max_scan, nr_inactive_pages, nr_pages, classzone, gfp_mask);
/* Do we want to age the active list? */
if (nr_inactive_pages < nr_active_pages*2)
refill_inactive(nr_pages);
nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
if (nr_pages <= 0)
return 0;
......
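The shrink_cache()/refill_inactive() changes implement a plain second-chance policy: scan from the old end of the inactive list, rotate each page, and spare any page whose referenced bit was set, clearing the bit so it is reclaimable on the next pass. A toy user-space version of that eviction loop (max_scan bookkeeping omitted), illustrative only:

#include <stdio.h>
#include <stddef.h>

struct page {
        int referenced;
        struct page *next;
};

/* Head of the list is the old end of the inactive queue.  A referenced
 * page is spared: the bit is cleared and the page rotates to the young
 * end, so it is evicted on the next encounter unless touched again. */
static struct page *reclaim_one(struct page **head)
{
        struct page *p;
        struct page **tail;

        while ((p = *head) != NULL) {
                *head = p->next;
                p->next = NULL;
                if (!p->referenced)
                        return p;       /* oldest unreferenced page */
                p->referenced = 0;      /* second chance */
                for (tail = head; *tail; tail = &(*tail)->next)
                        ;
                *tail = p;              /* rotate to the young end */
        }
        return NULL;
}

int main(void)
{
        struct page a = { 1, NULL };    /* old but recently referenced */
        struct page b = { 0, NULL };
        struct page *head = &a;

        a.next = &b;
        printf("victim is %s\n", reclaim_one(&head) == &b ? "b" : "a");
        return 0;
}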