Commit ec8acf20 authored by Shaohua Li, committed by Linus Torvalds

swap: add per-partition lock for swapfile

swap_lock is heavily contended when I test swap to 3 fast SSDs (it is
even slightly slower than swapping to 2 such SSDs).  The main
contention comes from swap_info_get().  This patch tries to close the
gap by adding a new per-partition lock.

Global data like nr_swapfiles, total_swap_pages, least_priority and
swap_list are still protected by swap_lock.

nr_swap_pages is now atomic, so it can be changed without holding
swap_lock.  In theory, get_swap_page() could find no free swap pages
while some are actually available, but that does not sound like a big
problem.
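
As a minimal sketch of the pattern (not the literal patch hunks), the
counter is read and updated without any lock:

	/* allocation side: check and claim one page, no swap_lock held */
	if (atomic_long_read(&nr_swap_pages) <= 0)
		goto noswap;	/* may be stale: the benign race above */
	atomic_long_dec(&nr_swap_pages);

	/* free side: give the page back, again without swap_lock */
	atomic_long_inc(&nr_swap_pages);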

Partition-specific data (the fields used by scan_swap_map() and so on)
is protected only by swap_info_struct.lock.
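
An illustrative sketch of the allocation path under the new lock (the
SWAP_HAS_CACHE usage argument to scan_swap_map() is assumed here, as
in this era's get_swap_page()):

	struct swap_info_struct *si = swap_info[type];
	unsigned long offset;

	/* the map scan needs only the per-partition lock */
	spin_lock(&si->lock);
	offset = scan_swap_map(si, SWAP_HAS_CACHE);
	spin_unlock(&si->lock);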

Changing swap_info_struct.flags requires holding both swap_lock and
swap_info_struct.lock, because scan_swap_map() checks the flags.
Reading the flags is fine with either lock held.

If both swap_lock and swap_info_struct.lock must be held, we always
take the former first to avoid deadlock.
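
For example, a flags update at swapoff time follows that ordering
(illustrative sketch, not the exact patch hunk):

	spin_lock(&swap_lock);		/* global lock first ... */
	spin_lock(&si->lock);		/* ... then the partition lock */
	si->flags &= ~SWP_WRITEOK;
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);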

swap_entry_free() can change swap_list.  To delete that code, we add a
new highest_priority_index.  Whenever get_swap_page() is called, we
check it; if it is valid, we use it.
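
Roughly, get_swap_page() consumes the hint like this (a sketch: the
atomic_t type of highest_priority_index and the exact validity check
are simplifications):

	int hp_index = atomic_xchg(&highest_priority_index, -1);

	if (hp_index != -1 && hp_index != swap_list.next &&
	    swap_info[hp_index]->prio == swap_info[swap_list.next]->prio)
		swap_list.next = hp_index;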

It's a pity that get_swap_page() still takes swap_lock.  But in
practice, swap_lock isn't heavily contended in my test with this patch
(or I can say there are other, much heavier bottlenecks, like TLB
flush).  And by the way, it looks like get_swap_page() doesn't really
need the lock: we never free swap_info[] and we check the SWAP_WRITEOK
flag.  The only risk without the lock is that we could swap out to a
low-priority partition, but we would quickly recover after several
rounds of swapping, so that doesn't sound like a big deal to me.  But
I'd prefer to fix this if it turns out to be a real problem.

"swap: make each swap partition have one address_space" improved the
swapout speed from 1.7G/s to 2G/s.  This patch further improves the
speed to 2.3G/s, so around 15% improvement.  It's a multi-process test,
so TLB flush isn't the biggest bottleneck before the patches.

[arnd@arndb.de: fix it for nommu]
[hughd@google.com: add missing unlock]
[minchan@kernel.org: get rid of lockdep whinge on sys_swapon]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 33806f06
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
 	printk("Mem-info:\n");
 	show_free_areas(filter);
 	printk("Free swap:       %6ldkB\n",
-	       nr_swap_pages << (PAGE_SHIFT-10));
+	       get_nr_swap_pages() << (PAGE_SHIFT-10));
 	printk("%ld pages of RAM\n", totalram_pages);
 	printk("%ld free pages\n", nr_free_pages());
 }
...
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
 	       global_page_state(NR_PAGETABLE),
 	       global_page_state(NR_BOUNCE),
 	       global_page_state(NR_FILE_PAGES),
-	       nr_swap_pages);
+	       get_nr_swap_pages());

 	for_each_zone(zone) {
 		unsigned long flags, order, total = 0, largest_order = -1;
...
@@ -202,6 +202,18 @@ struct swap_info_struct {
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
 #endif
+	spinlock_t lock;		/*
+					 * protect map scan related fields like
+					 * swap_map, lowest_bit, highest_bit,
+					 * inuse_pages, cluster_next,
+					 * cluster_nr, lowest_alloc and
+					 * highest_alloc. other fields are only
+					 * changed at swapon/swapoff, so are
+					 * protected by swap_lock. changing
+					 * flags need hold this lock and
+					 * swap_lock. If both locks need hold,
+					 * hold swap_lock first.
+					 */
 };

 struct swap_list_t {
@@ -209,9 +221,6 @@ struct swap_list_t {
 	int next;	/* swapfile to be used next */
 };

-/* Swap 50% full? Release swapcache more aggressively.. */
-#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
@@ -347,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);

 /* linux/mm/swapfile.c */
-extern long nr_swap_pages;
+extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;

+/* Swap 50% full? Release swapcache more aggressively.. */
+static inline bool vm_swap_full(void)
+{
+	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
+}
+
+static inline long get_nr_swap_pages(void)
+{
+	return atomic_long_read(&nr_swap_pages);
+}
+
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -381,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 #else /* CONFIG_SWAP */

-#define nr_swap_pages				0L
+#define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
 #define total_swapcache_pages()			0UL
+#define vm_swap_full()				0

 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
...
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 */
 		free -= global_page_state(NR_SHMEM);

-		free += nr_swap_pages;
+		free += get_nr_swap_pages();

 		/*
 		 * Any slabs which are created with the
...
@@ -1907,7 +1907,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 */
 		free -= global_page_state(NR_SHMEM);

-		free += nr_swap_pages;
+		free += get_nr_swap_pages();

 		/*
 		 * Any slabs which are created with the
...
@@ -69,7 +69,8 @@ void show_swap_cache_info(void)
 	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total);
-	printk("Free swap  = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Free swap  = %ldkB\n",
+		get_nr_swap_pages() << (PAGE_SHIFT - 10));
 	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
...
(One file's diff is collapsed in this view and not shown here.)
@@ -1684,7 +1684,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		force_scan = true;

 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -1933,7 +1933,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	 */
 	pages_for_compaction = (2UL << sc->order);
 	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
 		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
@@ -3085,7 +3085,7 @@ unsigned long global_reclaimable_pages(void)
 	nr = global_page_state(NR_ACTIVE_FILE) +
 	     global_page_state(NR_INACTIVE_FILE);

-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
 		nr += global_page_state(NR_ACTIVE_ANON) +
 		      global_page_state(NR_INACTIVE_ANON);
@@ -3099,7 +3099,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 	     zone_page_state(zone, NR_INACTIVE_FILE);

-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
 		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
 		      zone_page_state(zone, NR_INACTIVE_ANON);
...