Commit 7cf111bc authored by Johannes Weiner, committed by Linus Torvalds

mm: vmscan: determine anon/file pressure balance at the reclaim root

We split the LRU lists into anon and file, and we rebalance the scan
pressure between them when one of them begins thrashing: if the file cache
experiences workingset refaults, we increase the pressure on anonymous
pages; if the workload is stalled on swapins, we increase the pressure on
the file cache instead.

With cgroups and their nested LRU lists, we currently don't do this
correctly.  While recursive cgroup reclaim establishes a relative LRU
order among the pages of all involved cgroups, LRU pressure balancing is
done on an individual cgroup LRU level.  As a result, when one cgroup is
thrashing on the filesystem cache while a sibling may have cold anonymous
pages, pressure doesn't get equalized between them.

This patch moves the LRU balancing decision to the root of reclaim - the
same level where the LRU order is established.

It does this by tracking LRU cost recursively, so that every level of the
cgroup tree knows the aggregate LRU cost of all memory within its domain.
When the page scanner calculates the scan balance for any given individual
cgroup's LRU list, it uses the values from the ancestor cgroup that
initiated the reclaim cycle.

If one sibling is then thrashing on the cache, it will tip the pressure
balance inside its ancestors, and the next hierarchical reclaim iteration
will go more after the anon pages in the tree.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: http://lkml.kernel.org/r/20200520232525.798933-13-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 314b57fb
...@@ -1242,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page, ...@@ -1242,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page,
mod_lruvec_page_state(page, idx, -1); mod_lruvec_page_state(page, idx, -1);
} }
/*
 * parent_lruvec - look up the lruvec one level up the memcg hierarchy
 * @lruvec: the lruvec to start from
 *
 * Resolves the memcg that @lruvec belongs to, takes that memcg's parent,
 * and returns the parent's lruvec for the same pgdat as @lruvec.
 *
 * Returns NULL when lruvec_memcg() yields no memcg for @lruvec, or when
 * parent_mem_cgroup() yields no parent for that memcg.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
	struct mem_cgroup *owner = lruvec_memcg(lruvec);
	struct mem_cgroup *parent;
	struct lruvec *up = NULL;

	if (owner) {
		parent = parent_mem_cgroup(owner);
		/* Stay on the node that @lruvec itself belongs to. */
		if (parent)
			up = mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec));
	}

	return up;
}
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
......
...@@ -282,11 +282,33 @@ void lru_note_cost(struct page *page) ...@@ -282,11 +282,33 @@ void lru_note_cost(struct page *page)
{ {
struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
/* Record new data point */ do {
unsigned long lrusize;
/* Record cost event */
if (page_is_file_lru(page)) if (page_is_file_lru(page))
lruvec->file_cost++; lruvec->file_cost++;
else else
lruvec->anon_cost++; lruvec->anon_cost++;
/*
* Decay previous events
*
* Because workloads change over time (and to avoid
* overflow) we keep these statistics as a floating
* average, which ends up weighing recent refaults
* more than old ones.
*/
lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
lruvec_page_state(lruvec, NR_ACTIVE_FILE);
if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
} while ((lruvec = parent_lruvec(lruvec)));
} }
static void __activate_page(struct page *page, struct lruvec *lruvec, static void __activate_page(struct page *page, struct lruvec *lruvec,
......
...@@ -79,6 +79,12 @@ struct scan_control { ...@@ -79,6 +79,12 @@ struct scan_control {
*/ */
struct mem_cgroup *target_mem_cgroup; struct mem_cgroup *target_mem_cgroup;
/*
* Scan pressure balancing between anon and file LRUs
*/
unsigned long anon_cost;
unsigned long file_cost;
/* Can active pages be deactivated as part of reclaim? */ /* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1 #define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2 #define DEACTIVATE_FILE 2
...@@ -2231,10 +2237,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, ...@@ -2231,10 +2237,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
int swappiness = mem_cgroup_swappiness(memcg); int swappiness = mem_cgroup_swappiness(memcg);
u64 fraction[2]; u64 fraction[2];
u64 denominator = 0; /* gcc */ u64 denominator = 0; /* gcc */
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
unsigned long anon_prio, file_prio; unsigned long anon_prio, file_prio;
enum scan_balance scan_balance; enum scan_balance scan_balance;
unsigned long anon, file;
unsigned long totalcost; unsigned long totalcost;
unsigned long ap, fp; unsigned long ap, fp;
enum lru_list lru; enum lru_list lru;
...@@ -2285,7 +2289,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, ...@@ -2285,7 +2289,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
} }
scan_balance = SCAN_FRACT; scan_balance = SCAN_FRACT;
/* /*
* Calculate the pressure balance between anon and file pages. * Calculate the pressure balance between anon and file pages.
* *
...@@ -2300,30 +2303,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, ...@@ -2300,30 +2303,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
anon_prio = swappiness; anon_prio = swappiness;
file_prio = 200 - anon_prio; file_prio = 200 - anon_prio;
/* totalcost = sc->anon_cost + sc->file_cost;
* Because workloads change over time (and to avoid overflow)
* we keep these statistics as a floating average, which ends
* up weighing recent refaults more than old ones.
*/
anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
spin_lock_irq(&pgdat->lru_lock);
totalcost = lruvec->anon_cost + lruvec->file_cost;
if (unlikely(totalcost > (anon + file) / 4)) {
lruvec->anon_cost /= 2;
lruvec->file_cost /= 2;
totalcost /= 2;
}
ap = anon_prio * (totalcost + 1); ap = anon_prio * (totalcost + 1);
ap /= lruvec->anon_cost + 1; ap /= sc->anon_cost + 1;
fp = file_prio * (totalcost + 1); fp = file_prio * (totalcost + 1);
fp /= lruvec->file_cost + 1; fp /= sc->file_cost + 1;
spin_unlock_irq(&pgdat->lru_lock);
fraction[0] = ap; fraction[0] = ap;
fraction[1] = fp; fraction[1] = fp;
...@@ -2687,6 +2672,14 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) ...@@ -2687,6 +2672,14 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
nr_reclaimed = sc->nr_reclaimed; nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned; nr_scanned = sc->nr_scanned;
/*
* Determine the scan balance between anon and file LRUs.
*/
spin_lock_irq(&pgdat->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
spin_unlock_irq(&pgdat->lru_lock);
/* /*
* Target desirable inactive:active list ratios for the anon * Target desirable inactive:active list ratios for the anon
* and file LRU lists. * and file LRU lists.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment