Commit 918798e7 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] invalidate_inode_pages2() mmap coherency fix

- When invalidating pages, take care to shoot down any ptes which map them
  as well.

  This ensures that the next mmap access to the page will generate a major
  fault, so NFS's server-side modifications are picked up.

  This also allows us to call invalidate_complete_page() on all pages, so
  filesytems such as ext3 get a chance to invalidate the buffer_heads.

- Don't mark in-pagetable pages as non-uptodate any more.  That broke a
  previous guarantee that mapped-into-user-process pages are always uptodate.

- Check the return value of invalidate_complete_page().  It can fail if
  someone redirties a page after generic_file_direct_IO() write it back.

But we still have a problem.  If invalidate_inode_pages2() calls
unmap_mapping_range(), that can cause zap_pte_range() to dirty the pagecache
pages.  That will redirty the page's buffers and will cause
invalidate_complete_page() to fail.

So, in generic_file_direct_IO() we do a complete pte shootdown on the file
up-front, prior to writing back dirty pagecache.  This is only done for
O_DIRECT writes.  It _could_ be done for O_DIRECT reads too, providing full
mmap-vs-direct-IO coherency for both O_DIRECT reads and O_DIRECT writes, but
permitting the pte shootdown on O_DIRECT reads trivially allows people to nuke
other people's mapped pagecache.

NFS also uses invalidate_inode_pages2() for handling server-side modification
notifications.  But in the NFS case the clear_page_dirty() in
invalidate_inode_pages2() is sufficient, because NFS doesn't have to worry
about the "dirty buffers against a clean page" problem. (I think)
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent ba1f08f1
......@@ -1345,7 +1345,7 @@ static inline void invalidate_remote_inode(struct inode *inode)
S_ISLNK(inode->i_mode))
invalidate_inode_pages(inode->i_mapping);
}
extern void invalidate_inode_pages2(struct address_space *mapping);
extern int invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
......
......@@ -2247,7 +2247,8 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
EXPORT_SYMBOL(generic_file_writev);
/*
* Called under i_sem for writes to S_ISREG files
* Called under i_sem for writes to S_ISREG files. Returns -EIO if something
* went wrong during pagecache shootdown.
*/
ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
......@@ -2257,12 +2258,23 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct address_space *mapping = file->f_mapping;
ssize_t retval;
/*
* If it's a write, unmap all mmappings of the file up-front. This
* will cause any pte dirty bits to be propagated into the pageframes
* for the subsequent filemap_write_and_wait().
*/
if (rw == WRITE && mapping_mapped(mapping))
unmap_mapping_range(mapping, 0, -1, 0);
retval = filemap_write_and_wait(mapping);
if (retval == 0) {
retval = mapping->a_ops->direct_IO(rw, iocb, iov,
offset, nr_segs);
if (rw == WRITE && mapping->nrpages)
invalidate_inode_pages2(mapping);
if (rw == WRITE && mapping->nrpages) {
int err = invalidate_inode_pages2(mapping);
if (err)
retval = err;
}
}
return retval;
}
......
......@@ -65,6 +65,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
* be marked dirty at any time too. So we re-check the dirtiness inside
* ->tree_lock. That provides exclusion against the __set_page_dirty
* functions.
*
* Returns non-zero if the page was successfully invalidated.
*/
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
......@@ -240,50 +242,67 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
EXPORT_SYMBOL(invalidate_inode_pages);
/**
* invalidate_inode_pages2 - remove all unmapped pages from an address_space
* invalidate_inode_pages2 - remove all pages from an address_space
* @mapping - the address_space
*
* invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
* where the page is seen to be mapped into process pagetables. In that case,
* the page is marked clean but is left attached to its address_space.
*
* The page is also marked not uptodate so that a subsequent pagefault will
* perform I/O to bringthe page's contents back into sync with its backing
* store.
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
* FIXME: invalidate_inode_pages2() is probably trivially livelockable.
* Returns -EIO if any pages could not be invalidated.
*/
void invalidate_inode_pages2(struct address_space *mapping)
int invalidate_inode_pages2(struct address_space *mapping)
{
struct pagevec pvec;
pgoff_t next = 0;
int i;
int ret = 0;
int did_full_unmap = 0;
pagevec_init(&pvec, 0);
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
while (!ret && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
int was_dirty;
lock_page(page);
if (page->mapping == mapping) { /* truncate race? */
wait_on_page_writeback(page);
next = page->index + 1;
if (page_mapped(page)) {
clear_page_dirty(page);
ClearPageUptodate(page);
if (page->mapping != mapping) { /* truncate race? */
unlock_page(page);
continue;
}
wait_on_page_writeback(page);
next = page->index + 1;
while (page_mapped(page)) {
if (!did_full_unmap) {
/*
* Zap the rest of the file in one hit.
* FIXME: invalidate_inode_pages2()
* should take start/end offsets.
*/
unmap_mapping_range(mapping,
page->index << PAGE_CACHE_SHIFT,
-1, 0);
did_full_unmap = 1;
} else {
if (!invalidate_complete_page(mapping,
page)) {
clear_page_dirty(page);
ClearPageUptodate(page);
}
/*
* Just zap this page
*/
unmap_mapping_range(mapping,
page->index << PAGE_CACHE_SHIFT,
(page->index << PAGE_CACHE_SHIFT)+1,
0);
}
}
was_dirty = test_clear_page_dirty(page);
if (!invalidate_complete_page(mapping, page)) {
if (was_dirty)
set_page_dirty(page);
ret = -EIO;
}
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment