Commit 46f65ec1, authored by Hugh Dickins, committed by Linus Torvalds

tmpfs: convert shmem_unuse_inode to radix-swap

Convert shmem_unuse_inode() to use a lockless gang lookup of the radix
tree, searching for matching swap.

This is somewhat slower than the old method: because of repeated radix
tree descents, because of copying entries up, but probably most because
the old method noted and skipped once a vector page was cleared of swap.
Perhaps we can devise a use of radix tree tagging to achieve that later.

shmem_add_to_page_cache() uses shmem_radix_tree_replace() to compensate
for the lockless lookup by checking that the expected entry is in place,
under lock.  It is not very satisfactory to be copying this much from
add_to_page_cache_locked(), but I think easier to sell than insisting
that every caller of add_to_page_cache*() go through the extras.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7a5d0fbb
...@@ -261,6 +261,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, ...@@ -261,6 +261,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
return 0; return 0;
} }
/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 *
 * @page:     locked, swap-backed page to insert
 * @mapping:  address_space to insert it into
 * @index:    offset within the mapping
 * @gfp:      allocation flags (masked by GFP_RECLAIM_MASK for charging/preload)
 * @expected: if non-NULL, the radix-tree entry (a swap entry) that must still
 *            occupy the slot; the insert then replaces it under tree_lock,
 *            failing if a racer removed it first.
 *
 * Returns 0 on success, or a negative errno (the page is then uncharged).
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, gfp_t gfp, void *expected)
{
	int error;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageSwapBacked(page));

	/* Charge to the memcg first, before taking tree_lock. */
	error = mem_cgroup_cache_charge(page, current->mm,
					gfp & GFP_RECLAIM_MASK);
	if (error)
		goto out;
	/*
	 * No radix_tree_preload when replacing: the swap entry already
	 * keeps a slot allocated for the page in the tree.
	 */
	if (!expected)
		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
	if (!error) {
		page_cache_get(page);
		page->mapping = mapping;
		page->index = index;

		spin_lock_irq(&mapping->tree_lock);
		if (!expected)
			error = radix_tree_insert(&mapping->page_tree,
						  index, page);
		else
			/* Fails if the expected swap entry has gone. */
			error = shmem_radix_tree_replace(mapping, index,
							 expected, page);
		if (!error) {
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			__inc_zone_page_state(page, NR_SHMEM);
			spin_unlock_irq(&mapping->tree_lock);
		} else {
			/* Undo the speculative setup before dropping the ref. */
			page->mapping = NULL;
			spin_unlock_irq(&mapping->tree_lock);
			page_cache_release(page);
		}
		if (!expected)
			radix_tree_preload_end();
	}
	/* On any failure, give back the memcg charge taken above. */
	if (error)
		mem_cgroup_uncharge_cache_page(page);
out:
	return error;
}
/* /*
* Like find_get_pages, but collecting swap entries as well as pages. * Like find_get_pages, but collecting swap entries as well as pages.
*/ */
...@@ -308,6 +357,42 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, ...@@ -308,6 +357,42 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
return ret; return ret;
} }
/*
 * Lockless lookup of swap entry in radix tree, avoiding refcount on pages.
 *
 * Scans @mapping's radix tree in PAGEVEC_SIZE batches under rcu_read_lock,
 * looking for a slot whose entry equals @radswap (a swp_to_radix_entry value).
 *
 * Returns the index of the matching slot, or (pgoff_t)-1 if not found.
 */
static pgoff_t shmem_find_swap(struct address_space *mapping, void *radswap)
{
	void **slots[PAGEVEC_SIZE];
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int nr_found;

restart:
	/* Prime the loop: indices[0] + 1 == 0 starts the scan at index 0. */
	nr_found = 1;
	indices[0] = -1;
	while (nr_found) {
		/* Resume just past the last index seen in the previous batch. */
		pgoff_t index = indices[nr_found - 1] + 1;
		unsigned int i;

		rcu_read_lock();
		nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
					slots, indices, index, PAGEVEC_SIZE);
		for (i = 0; i < nr_found; i++) {
			void *item = radix_tree_deref_slot(slots[i]);
			/*
			 * A retry entry means the slot moved under us
			 * (concurrent tree modification): start over.
			 */
			if (radix_tree_deref_retry(item)) {
				rcu_read_unlock();
				goto restart;
			}
			if (item == radswap) {
				rcu_read_unlock();
				return indices[i];
			}
		}
		rcu_read_unlock();
		/* Long trees: give the scheduler a chance between batches. */
		cond_resched();
	}
	return -1;
}
/* /*
* Remove swap entry from radix tree, free the swap and its page cache. * Remove swap entry from radix tree, free the swap and its page cache.
*/ */
...@@ -515,23 +600,21 @@ static void shmem_evict_inode(struct inode *inode) ...@@ -515,23 +600,21 @@ static void shmem_evict_inode(struct inode *inode)
end_writeback(inode); end_writeback(inode);
} }
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
static int shmem_unuse_inode(struct shmem_inode_info *info, static int shmem_unuse_inode(struct shmem_inode_info *info,
swp_entry_t swap, struct page *page) swp_entry_t swap, struct page *page)
{ {
struct address_space *mapping = info->vfs_inode.i_mapping; struct address_space *mapping = info->vfs_inode.i_mapping;
void *radswap;
pgoff_t index; pgoff_t index;
int error; int error;
for (index = 0; index < SHMEM_NR_DIRECT; index++) radswap = swp_to_radix_entry(swap);
if (shmem_get_swap(info, index).val == swap.val) index = shmem_find_swap(mapping, radswap);
goto found; if (index == -1)
return 0;
found:
spin_lock(&info->lock);
if (shmem_get_swap(info, index).val != swap.val) {
spin_unlock(&info->lock);
return 0; return 0;
}
/* /*
* Move _head_ to start search for next from here. * Move _head_ to start search for next from here.
...@@ -547,23 +630,30 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, ...@@ -547,23 +630,30 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
* but also to hold up shmem_evict_inode(): so inode cannot be freed * but also to hold up shmem_evict_inode(): so inode cannot be freed
* beneath us (pagelock doesn't help until the page is in pagecache). * beneath us (pagelock doesn't help until the page is in pagecache).
*/ */
error = add_to_page_cache_locked(page, mapping, index, GFP_NOWAIT); error = shmem_add_to_page_cache(page, mapping, index,
GFP_NOWAIT, radswap);
/* which does mem_cgroup_uncharge_cache_page on error */ /* which does mem_cgroup_uncharge_cache_page on error */
if (error != -ENOMEM) { if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
* only does trylock page: if we raced, best clean up here.
*/
delete_from_swap_cache(page); delete_from_swap_cache(page);
set_page_dirty(page); set_page_dirty(page);
shmem_put_swap(info, index, (swp_entry_t){0}); if (!error) {
info->swapped--; spin_lock(&info->lock);
swap_free(swap); info->swapped--;
spin_unlock(&info->lock);
swap_free(swap);
}
error = 1; /* not an error, but entry was found */ error = 1; /* not an error, but entry was found */
} }
spin_unlock(&info->lock);
return error; return error;
} }
/* /*
* shmem_unuse() search for an eventually swapped out shmem page. * Search through swapped inodes to find and replace swap by page.
*/ */
int shmem_unuse(swp_entry_t swap, struct page *page) int shmem_unuse(swp_entry_t swap, struct page *page)
{ {
...@@ -576,20 +666,12 @@ int shmem_unuse(swp_entry_t swap, struct page *page) ...@@ -576,20 +666,12 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* Charge page using GFP_KERNEL while we can wait, before taking * Charge page using GFP_KERNEL while we can wait, before taking
* the shmem_swaplist_mutex which might hold up shmem_writepage(). * the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used. * Charged back to the user (not to caller) when swap account is used.
* add_to_page_cache() will be called with GFP_NOWAIT. * shmem_add_to_page_cache() will be called with GFP_NOWAIT.
*/ */
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error) if (error)
goto out; goto out;
/* /* No radix_tree_preload: swap entry keeps a place for page in tree */
* Try to preload while we can wait, to not make a habit of
* draining atomic reserves; but don't latch on to this cpu,
* it's okay if sometimes we get rescheduled after this.
*/
error = radix_tree_preload(GFP_KERNEL);
if (error)
goto uncharge;
radix_tree_preload_end();
mutex_lock(&shmem_swaplist_mutex); mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(this, next, &shmem_swaplist) { list_for_each_safe(this, next, &shmem_swaplist) {
...@@ -608,7 +690,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) ...@@ -608,7 +690,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
} }
mutex_unlock(&shmem_swaplist_mutex); mutex_unlock(&shmem_swaplist_mutex);
uncharge:
if (!found) if (!found)
mem_cgroup_uncharge_cache_page(page); mem_cgroup_uncharge_cache_page(page);
if (found < 0) if (found < 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment