Commit d9be9136 authored by Andrew Morton, committed by Richard Henderson

[PATCH] turn i_shared_lock into a semaphore

i_shared_lock is held for a very long time during vmtruncate() and causes
high scheduling latencies when truncating a file which is mmapped.  I've seen
100 milliseconds.

So turn it into a semaphore.  It nests inside mmap_sem.

This change is also needed by the shared pagetable patch, which needs to
unshare ptes on the vmtruncate path: lots of pagetable pages need to be
allocated, and those allocations use __GFP_WAIT, so they may sleep.

The patch also makes unmap_vma() static.
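For readers skimming the diff, a minimal illustrative sketch (not part of the patch; the helper name and the mapping pointer are hypothetical) of why the lock type matters: the holder of a semaphore may sleep, so allocations that use __GFP_WAIT become legal inside the critical section, which a spinlock forbids.

#include <linux/fs.h>		/* struct address_space, i_shared_sem */
#include <linux/slab.h>		/* kmalloc()/kfree() */
#include <asm/semaphore.h>	/* down()/up() */

/* Hypothetical helper, for illustration only. */
static void touch_shared_list(struct address_space *mapping)
{
	void *buf;

	/* Before this patch: spin_lock(&mapping->i_shared_lock);
	 * sleeping was forbidden until the matching spin_unlock(). */
	down(&mapping->i_shared_sem);

	/* GFP_KERNEL includes __GFP_WAIT, so this call may sleep;
	 * that is legal under a semaphore but not under a spinlock. */
	buf = kmalloc(1024, GFP_KERNEL);
	kfree(buf);

	up(&mapping->i_shared_sem);
}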
parent b473e48b
@@ -66,10 +66,9 @@ in some cases it is not really needed. Eg, vm_start is modified by
 expand_stack(), it is hard to come up with a destructive scenario without
 having the vmlist protection in this case.
-The page_table_lock nests with the inode i_shared_lock and the kmem cache
-c_spinlock spinlocks. This is okay, since code that holds i_shared_lock
-never asks for memory, and the kmem code asks for pages after dropping
-c_spinlock. The page_table_lock also nests with pagecache_lock and
+The page_table_lock nests with the inode i_shared_sem and the kmem cache
+c_spinlock spinlocks. This is okay, since the kmem code asks for pages after
+dropping c_spinlock. The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
 held.
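As a hedged sketch of the nesting described above (the function and its arguments are hypothetical, not code from this patch): the sleeping lock is taken first and the spinlock inside it, and nothing allocates memory while the spinlock is held.

#include <linux/fs.h>		/* struct address_space */
#include <linux/sched.h>	/* struct mm_struct, page_table_lock */
#include <linux/spinlock.h>
#include <asm/semaphore.h>

/* Hypothetical illustration of the documented nesting order. */
static void nesting_example(struct mm_struct *mm, struct address_space *mapping)
{
	down(&mapping->i_shared_sem);		/* may sleep; taken first */
	spin_lock(&mm->page_table_lock);	/* nests inside the semaphore */

	/* ... touch the vma lists and page tables here; no __GFP_WAIT
	 * allocation is attempted while the spinlock is held ... */

	spin_unlock(&mm->page_table_lock);
	up(&mapping->i_shared_sem);
}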
@@ -1219,7 +1219,7 @@ static int __init init_blkmtd(void)
 	INIT_LIST_HEAD(&mtd_rawdevice->as.dirty_pages);
 	INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
 	mtd_rawdevice->as.host = NULL;
-	spin_lock_init(&(mtd_rawdevice->as.i_shared_lock));
+	init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
 	mtd_rawdevice->as.a_ops = &blkmtd_aops;
 	INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
@@ -297,7 +297,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 		goto do_expand;
 	inode->i_size = offset;
-	spin_lock(&mapping->i_shared_lock);
+	down(&mapping->i_shared_sem);
 	if (list_empty(&mapping->i_mmap) && list_empty(&mapping->i_mmap_shared))
 		goto out_unlock;
 	if (!list_empty(&mapping->i_mmap))
@@ -306,7 +306,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 		hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
 out_unlock:
-	spin_unlock(&mapping->i_shared_lock);
+	up(&mapping->i_shared_sem);
 	truncate_hugepages(mapping, offset);
 	return 0;
@@ -171,7 +171,7 @@ void inode_init_once(struct inode *inode)
 	sema_init(&inode->i_sem, 1);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.page_lock);
-	spin_lock_init(&inode->i_data.i_shared_lock);
+	init_MUTEX(&inode->i_data.i_shared_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
@@ -319,7 +319,7 @@ struct address_space {
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head i_mmap;		/* list of private mappings */
 	struct list_head i_mmap_shared;		/* list of private mappings */
-	spinlock_t i_shared_lock;		/* and spinlock protecting it */
+	struct semaphore i_shared_sem;		/* and sem protecting it */
 	unsigned long dirtied_when;		/* jiffies of first page dirtying */
 	int gfp_mask;				/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
@@ -529,7 +529,6 @@ extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned lon
 	struct vm_area_struct **pprev);
 extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	unsigned long addr, int new_below);
-extern void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area);
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none. Assume start_addr < end_addr. */
@@ -262,9 +262,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 				atomic_dec(&inode->i_writecount);
 			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&inode->i_mapping->i_shared_lock);
+			down(&inode->i_mapping->i_shared_sem);
 			list_add_tail(&tmp->shared, &mpnt->shared);
-			spin_unlock(&inode->i_mapping->i_shared_lock);
+			up(&inode->i_mapping->i_shared_sem);
 		}
 		/*
@@ -55,11 +55,14 @@
 /*
  * Lock ordering:
  *
- *  ->i_shared_lock		(vmtruncate)
+ *  ->i_shared_sem		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
  *          ->mapping->page_lock
+ *  ->mmap_sem
+ *    ->i_shared_sem		(various places)
+ *
  * ->inode_lock
  *   ->sb_lock			(fs/fs-writeback.c)
  *     ->mapping->page_lock	(__sync_single_inode)
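A small sketch of the ordering entry added above (hypothetical function; assumes a file-backed vma and the usual mm pointer): on the mmap side, i_shared_sem is only ever taken while mmap_sem is already held, never the other way around. The vmtruncate path takes i_shared_sem on its own, which is consistent with this ordering.

#include <linux/fs.h>
#include <linux/sched.h>	/* struct mm_struct, mmap_sem */
#include <asm/semaphore.h>

/* Hypothetical mmap-side path: i_shared_sem nests inside mmap_sem. */
static void mmap_side_example(struct mm_struct *mm, struct address_space *mapping)
{
	down_write(&mm->mmap_sem);	/* outer lock: the mm's rw_semaphore */
	down(&mapping->i_shared_sem);	/* inner lock: the mapping's semaphore */

	/* ... link or unlink a vma on mapping->i_mmap / i_mmap_shared ... */

	up(&mapping->i_shared_sem);
	up_write(&mm->mmap_sem);
}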
@@ -968,7 +968,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
 	if (inode->i_size < offset)
 		goto do_expand;
 	inode->i_size = offset;
-	spin_lock(&mapping->i_shared_lock);
+	down(&mapping->i_shared_sem);
 	if (list_empty(&mapping->i_mmap) && list_empty(&mapping->i_mmap_shared))
 		goto out_unlock;
@@ -979,7 +979,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
 		vmtruncate_list(&mapping->i_mmap_shared, pgoff);
 out_unlock:
-	spin_unlock(&mapping->i_shared_lock);
+	up(&mapping->i_shared_sem);
 	truncate_inode_pages(mapping, offset);
 	goto out_truncate;
@@ -132,7 +132,9 @@ int vm_enough_memory(long pages)
 	return 0;
 }
-/* Remove one vm structure from the inode's i_mapping address space. */
+/*
+ * Remove one vm structure from the inode's i_mapping address space.
+ */
 static void remove_shared_vm_struct(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
@@ -140,11 +142,11 @@ static void remove_shared_vm_struct(struct vm_area_struct *vma)
 	if (file) {
 		struct inode *inode = file->f_dentry->d_inode;
-		spin_lock(&inode->i_mapping->i_shared_lock);
+		down(&inode->i_mapping->i_shared_sem);
 		if (vma->vm_flags & VM_DENYWRITE)
 			atomic_inc(&inode->i_writecount);
 		list_del_init(&vma->shared);
-		spin_unlock(&inode->i_mapping->i_shared_lock);
+		up(&inode->i_mapping->i_shared_sem);
 	}
 }
@@ -346,12 +348,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 	if (mapping)
-		spin_lock(&mapping->i_shared_lock);
+		down(&mapping->i_shared_sem);
 	spin_lock(&mm->page_table_lock);
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
-		spin_unlock(&mapping->i_shared_lock);
+		up(&mapping->i_shared_sem);
 	mm->map_count++;
 	validate_mm(mm);
@@ -955,7 +957,7 @@ static void free_pgtables(mmu_gather_t *tlb, struct vm_area_struct *prev,
  * By the time this function is called, the area struct has been
  * removed from the process mapping list.
  */
-void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
+static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 {
 	size_t len = area->vm_end - area->vm_start;
@@ -1339,7 +1341,7 @@ void exit_mmap(struct mm_struct * mm)
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap ring. If vm_file is non-NULL
- * then the i_shared_lock must be held here.
+ * then i_shared_sem is taken here.
  */
 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
@@ -363,4 +363,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+	init_MUTEX(&swapper_space.i_shared_sem);
 }
@@ -42,7 +42,6 @@ struct address_space swapper_space = {
 	.host = &swapper_inode,
 	.a_ops = &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-	.i_shared_lock = SPIN_LOCK_UNLOCKED,
 	.i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.private_lock = SPIN_LOCK_UNLOCKED,