Commit 0538a82c authored by Johannes Weiner, committed by Andrew Morton

mm: vmscan: make rotations a secondary factor in balancing anon vs file

We noticed a 2% webserver throughput regression after upgrading from 5.6. 
This could be tracked down to a shift in the anon/file reclaim balance
(confirmed with swappiness) that resulted in worse reclaim efficiency and
thus more kswapd activity for the same outcome.

The change that exposed the problem is aae466b0 ("mm/swap: implement
workingset detection for anonymous LRU").  By qualifying swapins based on
their refault distance, it lowered the cost of anon reclaim in this
workload, in turn causing (much) more anon scanning than before.  Scanning
the anon list is more expensive due to the higher ratio of mmapped pages
that may rotate during reclaim, and so the result was an increase in %sys
time.

Right now, rotations aren't considered a cost when balancing scan pressure
between LRUs.  We can end up with very few file refaults putting all the
scan pressure on hot anon pages that are rotated en masse, don't get
reclaimed, and never push back on the file LRU again.  We still only
reclaim file cache in that case, but we burn a lot of CPU rotating anon
pages.  It's "fair" from an LRU age POV, but doesn't reflect the real cost
it imposes on the system.
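
For background, reclaim splits scan pressure between the two LRUs in
inverse proportion to their recorded cost. A toy model of that split
(illustrative only; scan_share() is made up here and much simpler than
the kernel's get_scan_count() arithmetic):

    #include <stdio.h>

    /* A list's share of scan pressure shrinks as its own recorded
     * reclaim cost grows relative to the other list's. */
    static double scan_share(unsigned long own_cost,
                             unsigned long other_cost)
    {
        return (double)(other_cost + 1) / (own_cost + other_cost + 2);
    }

    int main(void)
    {
        /* File has recorded refault cost, anon has none:
         * anon takes nearly all of the scan pressure. */
        printf("anon share: %.2f\n", scan_share(0, 100));
        printf("file share: %.2f\n", scan_share(100, 0));
        return 0;
    }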

Consider rotations as a secondary factor in balancing the LRUs.  This
doesn't attempt to make a precise comparison between IO cost and CPU cost,
it just says: if reloads are about comparable between the lists, or
rotations are overwhelmingly different, adjust for CPU work.
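
Concretely, the patch folds both signals into one number, weighting an
IO event like SWAP_CLUSTER_MAX (32) rotations. A minimal userspace
sketch of that weighting (the scenarios in main() are invented for
illustration; the formula matches the one added to lru_note_cost()
below):

    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32	/* kernel reclaim batch size */

    /* One IO reload counts as much as 32 page rotations. */
    static unsigned long lru_cost(unsigned int nr_io,
                                  unsigned int nr_rotated)
    {
        return (unsigned long)nr_io * SWAP_CLUSTER_MAX + nr_rotated;
    }

    int main(void)
    {
        /* Comparable reloads on both lists: rotations tip the balance */
        printf("anon=%lu file=%lu\n", lru_cost(10, 800), lru_cost(12, 10));
        /* No reloads at all: rotations alone decide */
        printf("anon=%lu file=%lu\n", lru_cost(0, 500), lru_cost(0, 0));
        return 0;
    }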

This fixed the regression on our webservers.  It has since been deployed
to the entire Meta fleet and hasn't caused any problems.

Link: https://lkml.kernel.org/r/20221013193113.726425-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 57a196a5
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages;
 
 
 /* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
-void lru_note_cost_folio(struct folio *);
+void lru_note_cost(struct lruvec *lruvec, bool file,
+		   unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_refault(struct folio *);
 void folio_add_lru(struct folio *);
 void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
 void lru_cache_add(struct page *);
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -295,8 +295,20 @@ void folio_rotate_reclaimable(struct folio *folio)
 	}
 }
 
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+		   unsigned int nr_io, unsigned int nr_rotated)
 {
+	unsigned long cost;
+
+	/*
+	 * Reflect the relative cost of incurring IO and spending CPU
+	 * time on rotations. This doesn't attempt to make a precise
+	 * comparison, it just says: if reloads are about comparable
+	 * between the LRU lists, or rotations are overwhelmingly
+	 * different between them, adjust scan balance for CPU work.
+	 */
+	cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
 	do {
 		unsigned long lrusize;
 
@@ -310,9 +322,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 		spin_lock_irq(&lruvec->lru_lock);
 		/* Record cost event */
 		if (file)
-			lruvec->file_cost += nr_pages;
+			lruvec->file_cost += cost;
 		else
-			lruvec->anon_cost += nr_pages;
+			lruvec->anon_cost += cost;
 
 		/*
 		 * Decay previous events
@@ -335,10 +347,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
-void lru_note_cost_folio(struct folio *folio)
+void lru_note_cost_refault(struct folio *folio)
 {
 	lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
-		      folio_nr_pages(folio));
+		      folio_nr_pages(folio), 0);
 }
 
 static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
 	spin_unlock_irq(&lruvec->lru_lock);
 
-	lru_note_cost(lruvec, file, stat.nr_pageout);
+	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
 	mem_cgroup_uncharge_list(&folio_list);
 	free_unref_page_list(&folio_list);
 
@@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&lruvec->lru_lock);
 
+	if (nr_rotated)
+		lru_note_cost(lruvec, file, 0, nr_rotated);
 	mem_cgroup_uncharge_list(&l_active);
 	free_unref_page_list(&l_active);
 
 	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow)
 	if (workingset) {
 		folio_set_workingset(folio);
 		/* XXX: Move to lru_cache_add() when it supports new vs putback */
-		lru_note_cost_folio(folio);
+		lru_note_cost_refault(folio);
 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
 	}
 out: