Commit e3db7691 authored by Trond Myklebust's avatar Trond Myklebust Committed by Linus Torvalds

[PATCH] NFS: Fix race in nfs_release_page()

    NFS: Fix race in nfs_release_page()

    invalidate_inode_pages2() may find the dirty bit has been set on a page
    owing to the fact that the page may still be mapped after it was locked.
    Only after the call to unmap_mapping_range() are we sure that the page
    can no longer be dirtied.
    In order to fix this, NFS has hooked the releasepage() method and tries
    to write the page out between the call to unmap_mapping_range() and the
    call to remove_mapping(). This, however leads to deadlocks in the page
    reclaim code, where the page may be locked without holding a reference
    to the inode or dentry.

    Fix is to add a new address_space_operation, launder_page(), which will
    attempt to write out a dirty page without releasing the page lock.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

    Also, the bare SetPageDirty() can skew all sorts of accounting, leading to
    other nasties.

[akpm@osdl.org: cleanup]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 07031e14
@@ -171,6 +171,7 @@ prototypes:
int (*releasepage) (struct page *, int);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
locking rules:
All except set_page_dirty may block
@@ -188,6 +189,7 @@ bmap: yes
invalidatepage: no yes
releasepage: no yes
direct_IO: no
launder_page: no yes
->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
@@ -281,6 +283,12 @@ buffers from the page in preparation for freeing it. It returns zero to
indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
the kernel assumes that the fs has no private interest in the buffers.
->launder_page() may be called prior to releasing a page if
it is still found to be dirty. It returns zero if the page was successfully
cleaned, or an error value if not. Note that in order to prevent the page
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.
Note: currently almost all instances of address_space methods are
using BKL for internal serialization and that's one of the worst sources
of contention. Normally they are calling library functions (in fs/buffer.c)
...
@@ -315,14 +315,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
-	/*
-	 * Avoid deadlock on nfs_wait_on_request().
-	 */
-	if (!(gfp & __GFP_FS))
-		return 0;
-	/* Hack... Force nfs_wb_page() to write out the page */
-	SetPageDirty(page);
-	return !nfs_wb_page(page->mapping->host, page);
+	/* If PagePrivate() is set, then the page is not freeable */
+	return 0;
+}
+
+static int nfs_launder_page(struct page *page)
+{
+	return nfs_wb_page(page->mapping->host, page);
 }
const struct address_space_operations nfs_file_aops = {
@@ -338,6 +337,7 @@ const struct address_space_operations nfs_file_aops = {
#ifdef CONFIG_NFS_DIRECTIO
.direct_IO = nfs_direct_IO,
#endif
.launder_page = nfs_launder_page,
};

static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
...
@@ -426,6 +426,7 @@ struct address_space_operations {
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
};

struct backing_dev_info;
...
@@ -341,6 +341,15 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 0;
}
/*
 * do_launder_page - write back a dirty page prior to invalidating it
 *
 * Per the documentation added by this commit, ->launder_page() is invoked
 * with the page still locked, and the lock is held across the writeback so
 * the page cannot be mapped back in and redirtied in the meantime.
 *
 * Returns 0 when nothing needs doing (page already clean, page no longer
 * belongs to @mapping, or the fs provides no launder_page method), otherwise
 * the return value of ->launder_page() — zero on successful cleaning, or an
 * error value if the writeback failed.
 */
static int do_launder_page(struct address_space *mapping, struct page *page)
{
/* Clean pages need no writeback before being invalidated. */
if (!PageDirty(page))
return 0;
/* Recheck ownership: the page may have been truncated away under us. */
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
return mapping->a_ops->launder_page(page);
}
/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
@@ -405,7 +414,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 						  PAGE_CACHE_SIZE, 0);
 			}
 		}
-		if (!invalidate_complete_page2(mapping, page))
+		ret = do_launder_page(mapping, page);
+		if (ret == 0 && !invalidate_complete_page2(mapping, page))
 			ret = -EIO;
 		unlock_page(page);
 	}
...
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment