Commit 918798e7 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] invalidate_inode_pages2() mmap coherency fix

- When invalidating pages, take care to shoot down any ptes which map them
  as well.

  This ensures that the next mmap access to the page will generate a major
  fault, so NFS's server-side modifications are picked up.  (A userspace
  sketch of this effect follows the list below.)

  This also allows us to call invalidate_complete_page() on all pages, so
  filesystems such as ext3 get a chance to invalidate the buffer_heads.

- Don't mark in-pagetable pages as non-uptodate any more.  That broke a
  previous guarantee that mapped-into-user-process pages are always uptodate.

- Check the return value of invalidate_complete_page().  It can fail if
  someone redirties a page after generic_file_direct_IO() writes it back.
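
For illustration only (not part of the patch): a minimal userspace sketch of
the coherency behaviour the first point describes.  It assumes a filesystem
with a 512-byte logical block size that supports O_DIRECT, a hypothetical
"testfile" in the current directory, and omits error checking.

/*
 * coherency-demo.c - illustrative sketch, not from this patch.
 * Build: cc -o coherency-demo coherency-demo.c
 */
#define _GNU_SOURCE                     /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define BLK 512

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
        int dfd = open("testfile", O_RDWR | O_DIRECT);
        char *map;
        void *buf;

        ftruncate(fd, BLK);

        /* Fault the first page in through a shared mapping. */
        map = mmap(NULL, BLK, PROT_READ, MAP_SHARED, fd, 0);
        printf("before O_DIRECT write: \"%.16s\"\n", map);

        /* Rewrite the block via direct IO, bypassing the pagecache. */
        posix_memalign(&buf, BLK, BLK);
        memset(buf, 0, BLK);
        strcpy(buf, "written directly");
        pwrite(dfd, buf, BLK, 0);

        /*
         * With this change the page was unmapped during invalidation, so
         * this access takes a fresh fault and observes the new contents.
         */
        printf("after O_DIRECT write:  \"%.16s\"\n", map);

        munmap(map, BLK);
        free(buf);
        close(dfd);
        close(fd);
        return 0;
}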

But we still have a problem.  If invalidate_inode_pages2() calls
unmap_mapping_range(), that can cause zap_pte_range() to dirty the pagecache
pages.  That will redirty the page's buffers and will cause
invalidate_complete_page() to fail.
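
For reference, the dirtying mechanism: when zap_pte_range() tears down a pte
whose hardware dirty bit is set, it propagates that bit into the struct page.
A simplified fragment of the 2.6-era mm/memory.c logic:

        if (pte_dirty(pte))             /* hardware dirty bit in the pte */
                set_page_dirty(page);   /* redirties the pagecache page */

so any pte shootdown performed while invalidating can redirty pages.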

So, in generic_file_direct_IO() we do a complete pte shootdown on the file
up-front, prior to writing back dirty pagecache.  This is only done for
O_DIRECT writes.  It _could_ be done for O_DIRECT reads too, providing full
mmap-vs-direct-IO coherency for both O_DIRECT reads and O_DIRECT writes, but
permitting the pte shootdown on O_DIRECT reads trivially allows people to nuke
other people's mapped pagecache.

NFS also uses invalidate_inode_pages2() for handling server-side modification
notifications.  But in the NFS case the clear_page_dirty() in
invalidate_inode_pages2() is sufficient, because NFS doesn't have to worry
about the "dirty buffers against a clean page" problem. (I think)

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent ba1f08f1
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1345,7 +1345,7 @@ static inline void invalidate_remote_inode(struct inode *inode)
             S_ISLNK(inode->i_mode))
                 invalidate_inode_pages(inode->i_mapping);
 }
-extern void invalidate_inode_pages2(struct address_space *mapping);
+extern int invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2247,7 +2247,8 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 EXPORT_SYMBOL(generic_file_writev);
 
 /*
- * Called under i_sem for writes to S_ISREG files
+ * Called under i_sem for writes to S_ISREG files.  Returns -EIO if something
+ * went wrong during pagecache shootdown.
  */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
@@ -2257,12 +2258,23 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
         struct address_space *mapping = file->f_mapping;
         ssize_t retval;
 
+        /*
+         * If it's a write, unmap all mmappings of the file up-front.  This
+         * will cause any pte dirty bits to be propagated into the pageframes
+         * for the subsequent filemap_write_and_wait().
+         */
+        if (rw == WRITE && mapping_mapped(mapping))
+                unmap_mapping_range(mapping, 0, -1, 0);
+
         retval = filemap_write_and_wait(mapping);
         if (retval == 0) {
                 retval = mapping->a_ops->direct_IO(rw, iocb, iov,
                                                 offset, nr_segs);
-                if (rw == WRITE && mapping->nrpages)
-                        invalidate_inode_pages2(mapping);
+                if (rw == WRITE && mapping->nrpages) {
+                        int err = invalidate_inode_pages2(mapping);
+                        if (err)
+                                retval = err;
+                }
         }
         return retval;
 }
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -65,6 +65,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
  * be marked dirty at any time too.  So we re-check the dirtiness inside
  * ->tree_lock.  That provides exclusion against the __set_page_dirty
  * functions.
+ *
+ * Returns non-zero if the page was successfully invalidated.
  */
 static int
 invalidate_complete_page(struct address_space *mapping, struct page *page)
@@ -240,50 +242,67 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
 EXPORT_SYMBOL(invalidate_inode_pages);
 
 /**
- * invalidate_inode_pages2 - remove all unmapped pages from an address_space
+ * invalidate_inode_pages2 - remove all pages from an address_space
  * @mapping - the address_space
  *
- * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
- * where the page is seen to be mapped into process pagetables.  In that case,
- * the page is marked clean but is left attached to its address_space.
- *
- * The page is also marked not uptodate so that a subsequent pagefault will
- * perform I/O to bring the page's contents back into sync with its backing
- * store.
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
  *
- * FIXME: invalidate_inode_pages2() is probably trivially livelockable.
+ * Returns -EIO if any pages could not be invalidated.
  */
-void invalidate_inode_pages2(struct address_space *mapping)
+int invalidate_inode_pages2(struct address_space *mapping)
 {
         struct pagevec pvec;
         pgoff_t next = 0;
         int i;
+        int ret = 0;
+        int did_full_unmap = 0;
 
         pagevec_init(&pvec, 0);
-        while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-                for (i = 0; i < pagevec_count(&pvec); i++) {
+        while (!ret && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+                for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
                         struct page *page = pvec.pages[i];
+                        int was_dirty;
 
                         lock_page(page);
-                        if (page->mapping == mapping) {  /* truncate race? */
-                                wait_on_page_writeback(page);
-                                next = page->index + 1;
-                                if (page_mapped(page)) {
-                                        clear_page_dirty(page);
-                                        ClearPageUptodate(page);
-                                } else {
-                                        if (!invalidate_complete_page(mapping,
-                                                                      page)) {
-                                                clear_page_dirty(page);
-                                                ClearPageUptodate(page);
-                                        }
-                                }
+                        if (page->mapping != mapping) {  /* truncate race? */
+                                unlock_page(page);
+                                continue;
+                        }
+                        wait_on_page_writeback(page);
+                        next = page->index + 1;
+                        while (page_mapped(page)) {
+                                if (!did_full_unmap) {
+                                        /*
+                                         * Zap the rest of the file in one hit.
+                                         * FIXME: invalidate_inode_pages2()
+                                         * should take start/end offsets.
+                                         */
+                                        unmap_mapping_range(mapping,
+                                                page->index << PAGE_CACHE_SHIFT,
+                                                -1, 0);
+                                        did_full_unmap = 1;
+                                } else {
+                                        /*
+                                         * Just zap this page
+                                         */
+                                        unmap_mapping_range(mapping,
+                                          page->index << PAGE_CACHE_SHIFT,
+                                          (page->index << PAGE_CACHE_SHIFT)+1,
+                                          0);
+                                }
+                        }
+                        was_dirty = test_clear_page_dirty(page);
+                        if (!invalidate_complete_page(mapping, page)) {
+                                if (was_dirty)
+                                        set_page_dirty(page);
+                                ret = -EIO;
+                        }
                         unlock_page(page);
                 }
                 pagevec_release(&pvec);
                 cond_resched();
         }
+        return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
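
As a usage note: with the void-to-int change, callers that invalidate on
remote-change notification can now propagate failures.  A hypothetical sketch
(remote_data_changed() is an invented placeholder; this is not NFS's actual
code):

static int revalidate_pagecache(struct inode *inode)
{
        int ret = 0;

        if (remote_data_changed(inode))         /* invented placeholder */
                ret = invalidate_inode_pages2(inode->i_mapping);
        return ret;                             /* 0 or -EIO */
}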