Commit 1b4e3f26 authored by Mel Gorman, committed by Linus Torvalds

mm: vmscan: Reduce throttling due to a failure to make progress

Mike Galbraith, Alexey Avramov and Darrick Wong all reported similar
problems due to reclaim throttling for excessive lengths of time.  In
Alexey's case, a memory hog that should go OOM quickly instead stalls
for several minutes before being killed.  In Mike and Darrick's cases,
a small memcg environment stalled excessively even though the system
had enough memory overall.

Commit 69392a40 ("mm/vmscan: throttle reclaim when no progress is
being made") introduced the problem although commit a19594ca
("mm/vmscan: increase the timeout if page reclaim is not making
progress") made it worse.  Systems at or near an OOM state that cannot
be recovered must reach OOM quickly and memcg should kill tasks if a
memcg is near OOM.

To address this, only stall for the first zone in the zonelist, reduce
the timeout to 1 tick for VMSCAN_THROTTLE_NOPROGRESS and only stall if
the scan control nr_reclaimed is 0, kswapd is still active and there
were excessive pages pending for writeback.  If kswapd has stopped
reclaiming due to excessive failures, do not stall at all so that OOM
triggers relatively quickly.  Similarly, if an LRU is simply congested,
only lightly throttle similar to NOPROGRESS.
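
Condensed, and as a paraphrase of the reclaim_throttle() hunk below
rather than the verbatim kernel code, the reworked decision is:

	case VMSCAN_THROTTLE_CONGESTED:
		fallthrough;
	case VMSCAN_THROTTLE_NOPROGRESS:
		/*
		 * Skip sleeping entirely if kswapd has already given up
		 * (the system is likely near OOM) or if under half of the
		 * node's reclaimable pages are pending writeback.
		 */
		if (skip_throttle_noprogress(pgdat)) {
			cond_resched();
			return;
		}

		timeout = 1;	/* one tick, down from HZ/2 */
		break;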

Alexey's original case was the most straightforward:

	for i in {1..3}; do tail /dev/zero; done

On vanilla 5.16-rc1 this test stalled heavily; after the patch, the
test completes in a few seconds, similar to 5.15.

Alexey's second test case added watching a YouTube video while tail
runs 10 times.  On 5.15, playback only jitters slightly; 5.16-rc1
stalls heavily, with lots of frames missing and numerous audio
glitches.  With this patch applied, the video plays similarly to 5.15.
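
In script form, the tail portion of that scenario is the original
loop run ten times (the video playback itself is not scripted here):

	for i in {1..10}; do tail /dev/zero; done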

[lkp@intel.com: Fix W=1 build warning]

Link: https://lore.kernel.org/r/99e779783d6c7fce96448a3402061b9dc1b3b602.camel@gmx.de
Link: https://lore.kernel.org/r/20211124011954.7cab9bb4@mail.inbox.lv
Link: https://lore.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net
Link: https://lore.kernel.org/r/20211202150614.22440-1-mgorman@techsingularity.net
Link: https://linux-regtracking.leemhuis.info/regzbot/regression/20211124011954.7cab9bb4@mail.inbox.lv/
Reported-and-tested-by: Alexey Avramov <hakavlad@inbox.lv>
Reported-and-tested-by: Mike Galbraith <efault@gmx.de>
Reported-and-tested-by: Darrick J. Wong <djwong@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Tracked-by: Thorsten Leemhuis <regressions@leemhuis.info>
Fixes: 69392a40 ("mm/vmscan: throttle reclaim when no progress is being made")
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f87bcc88
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -277,6 +277,7 @@ enum vmscan_throttle_state {
 	VMSCAN_THROTTLE_WRITEBACK,
 	VMSCAN_THROTTLE_ISOLATED,
 	VMSCAN_THROTTLE_NOPROGRESS,
+	VMSCAN_THROTTLE_CONGESTED,
 	NR_VMSCAN_THROTTLE,
 };
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -30,12 +30,14 @@
 #define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
 #define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
 #define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
+#define _VMSCAN_THROTTLE_CONGESTED	(1 << VMSCAN_THROTTLE_CONGESTED)
 
 #define show_throttle_flags(flags) \
 	(flags) ? __print_flags(flags, "|", \
 		{_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \
 		{_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \
-		{_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \
+		{_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \
+		{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
 		) : "VMSCAN_THROTTLE_NONE"
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1021,6 +1021,39 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+static bool skip_throttle_noprogress(pg_data_t *pgdat)
+{
+	int reclaimable = 0, write_pending = 0;
+	int i;
+
+	/*
+	 * If kswapd is disabled, reschedule if necessary but do not
+	 * throttle as the system is likely near OOM.
+	 */
+	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+		return true;
+
+	/*
+	 * If there are a lot of dirty/writeback pages then do not
+	 * throttle as throttling will occur when the pages cycle
+	 * towards the end of the LRU if still under writeback.
+	 */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		reclaimable += zone_reclaimable_pages(zone);
+		write_pending += zone_page_state_snapshot(zone,
+						  NR_ZONE_WRITE_PENDING);
+	}
+	if (2 * write_pending <= reclaimable)
+		return true;
+
+	return false;
+}
+
 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 {
 	wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
@@ -1056,8 +1089,16 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 		}
 
 		break;
+	case VMSCAN_THROTTLE_CONGESTED:
+		fallthrough;
 	case VMSCAN_THROTTLE_NOPROGRESS:
-		timeout = HZ/2;
+		if (skip_throttle_noprogress(pgdat)) {
+			cond_resched();
+			return;
+		}
+
+		timeout = 1;
+
 		break;
 	case VMSCAN_THROTTLE_ISOLATED:
 		timeout = HZ/50;
@@ -3321,7 +3362,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	if (!current_is_kswapd() && current_may_throttle() &&
 	    !sc->hibernation_mode &&
 	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
-		reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+		reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
 
 	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 				    sc))
@@ -3386,16 +3427,16 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
 	}
 
 	/*
-	 * Do not throttle kswapd on NOPROGRESS as it will throttle on
-	 * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
-	 * writeback and marked for immediate reclaim at the tail of
-	 * the LRU.
+	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
+	 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
+	 * under writeback and marked for immediate reclaim at the tail of the
+	 * LRU.
 	 */
-	if (current_is_kswapd())
+	if (current_is_kswapd() || cgroup_reclaim(sc))
 		return;
 
 	/* Throttle if making no progress at high priorities. */
-	if (sc->priority < DEF_PRIORITY - 2)
+	if (sc->priority == 1 && !sc->nr_reclaimed)
 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
 }
 
@@ -3415,6 +3456,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	pg_data_t *last_pgdat = NULL;
+	pg_data_t *first_pgdat = NULL;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -3478,14 +3520,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
+		if (!first_pgdat)
+			first_pgdat = zone->zone_pgdat;
+
 		/* See comment about same check for global reclaim above */
 		if (zone->zone_pgdat == last_pgdat)
 			continue;
 		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
-		consider_reclaim_throttle(zone->zone_pgdat, sc);
 	}
 
+	consider_reclaim_throttle(first_pgdat, sc);
+
 	/*
 	 * Restore to original mask to avoid the impact on the caller if we
 	 * promoted it to __GFP_HIGHMEM.
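
As a verification aid, throttling events, including the new
VMSCAN_THROTTLE_CONGESTED reason decoded by show_throttle_flags()
above, can be observed via the mm_vmscan_throttled tracepoint,
assuming tracefs is mounted at /sys/kernel/tracing:

	echo 1 > /sys/kernel/tracing/events/vmscan/mm_vmscan_throttled/enable
	cat /sys/kernel/tracing/trace_pipe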