Commit 67f96aa2 authored by Rik van Riel's avatar Rik van Riel Committed by Linus Torvalds

mm: make swapin readahead skip over holes

Ever since abandoning the virtual scan of processes, for scalability
reasons, swap space has been a little more fragmented than before.  This
can lead to the situation where a large memory user is killed, swap space
ends up full of "holes" and swapin readahead is totally ineffective.

On my home system, after killing a leaky firefox it took over an hour to
page just under 2GB of memory back in, slowing the virtual machines down
to a crawl.

This patch makes swapin readahead simply skip over holes, instead of
stopping at them.  This allows the system to swap things back in at rates
of several MB/second, instead of a few hundred kB/second.

The checks done in valid_swaphandles are already done in
read_swap_cache_async as well, allowing us to remove a fair amount of
code.

[akpm@linux-foundation.org: fix it for page_cluster >= 32]
Signed-off-by: default avatarRik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Acked-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Acked-by: default avatarMel Gorman <mgorman@suse.de>
Cc: Adrian Drzewiecki <z@drze.net>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent c38446cc
...@@ -329,7 +329,6 @@ extern long total_swap_pages; ...@@ -329,7 +329,6 @@ extern long total_swap_pages;
extern void si_swapinfo(struct sysinfo *); extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int); extern swp_entry_t get_swap_page_of_type(int);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t); extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t); extern int swap_duplicate(swp_entry_t);
......
...@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, ...@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr) struct vm_area_struct *vma, unsigned long addr)
{ {
int nr_pages;
struct page *page; struct page *page;
unsigned long offset; unsigned long offset = swp_offset(entry);
unsigned long end_offset; unsigned long start_offset, end_offset;
unsigned long mask = (1UL << page_cluster) - 1;
/* /* Read a page_cluster sized and aligned cluster around offset. */
* Get starting offset for readaround, and number of pages to read. start_offset = offset & ~mask;
* Adjust starting address by readbehind (for NUMA interleave case)? end_offset = offset | mask;
* No, it's very unlikely that swap layout would follow vma layout, if (!start_offset) /* First page is swap header. */
* more likely that neighbouring swap pages came from the same node: start_offset++;
* so use the same "addr" to choose the same node for each swap read.
*/ for (offset = start_offset; offset <= end_offset ; offset++) {
nr_pages = valid_swaphandles(entry, &offset);
for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
/* Ok, do the async read-ahead now */ /* Ok, do the async read-ahead now */
page = read_swap_cache_async(swp_entry(swp_type(entry), offset), page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
gfp_mask, vma, addr); gfp_mask, vma, addr);
if (!page) if (!page)
break; continue;
page_cache_release(page); page_cache_release(page);
} }
lru_add_drain(); /* Push any new pages onto the LRU now */ lru_add_drain(); /* Push any new pages onto the LRU now */
......
...@@ -2287,58 +2287,6 @@ int swapcache_prepare(swp_entry_t entry) ...@@ -2287,58 +2287,6 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE); return __swap_duplicate(entry, SWAP_HAS_CACHE);
} }
/*
* swap_lock prevents swap_map being freed. Don't grab an extra
* reference on the swaphandle, it doesn't matter if it becomes unused.
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
struct swap_info_struct *si;
int our_page_cluster = page_cluster;
pgoff_t target, toff;
pgoff_t base, end;
int nr_pages = 0;
if (!our_page_cluster) /* no readahead */
return 0;
si = swap_info[swp_type(entry)];
target = swp_offset(entry);
base = (target >> our_page_cluster) << our_page_cluster;
end = base + (1 << our_page_cluster);
if (!base) /* first page is swap header */
base++;
spin_lock(&swap_lock);
if (end > si->max) /* don't go beyond end of map */
end = si->max;
/* Count contiguous allocated slots above our target */
for (toff = target; ++toff < end; nr_pages++) {
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
/* Count contiguous allocated slots below our target */
for (toff = target; --toff >= base; nr_pages++) {
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
spin_unlock(&swap_lock);
/*
* Indicate starting offset, and return number of pages to get:
* if only 1, say 0, since there's then no readahead to be done.
*/
*offset = ++toff;
return nr_pages? ++nr_pages: 0;
}
/* /*
* add_swap_count_continuation - called when a swap count is duplicated * add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment