Commit 47fb3887 authored by Andrea Arcangeli's avatar Andrea Arcangeli

oom: fix ext4 __GFP_NOFAIL livelock

The previous commit fixed an ext4 livelock by not making !__GFP_FS
allocations behave similarly to __GFP_NOFAIL, and I mentioned how
__GFP_NOFAIL is livelock prone.

After letting the trinity load run for a while I actually hit the
very __GFP_NOFAIL livelock too:

 #0  get_page_from_freelist (gfp_mask=0x20858, nodemask=0x0 <irq_stack_union>, order=0x0, zonelist=0xffff88007fffc100, hi
gh_zoneidx=0x2, alloc_flags=0xc0, preferred_zone=0xffff88007fffa840, classzone_idx=classzone_idx@entry=0x1, migratetype=
migratetype@entry=0x2) at mm/page_alloc.c:1953
 #1  0xffffffff81178e88 in __alloc_pages_slowpath (migratetype=0x2, classzone_idx=0x1, preferred_zone=0xffff88007fffa840,
 nodemask=0x0 <irq_stack_union>, high_zoneidx=ZONE_NORMAL, zonelist=0xffff88007fffc100, order=0x0, gfp_mask=0x20858) at
mm/page_alloc.c:2597
 #2  __alloc_pages_nodemask (gfp_mask=<optimized out>, order=0x0, zonelist=0xffff87fffffffffa, nodemask=0x0 <irq_stack_un
ion>) at mm/page_alloc.c:2832
 #3  0xffffffff811becab in alloc_pages_current (gfp=0x20858, order=0x0) at mm/mempolicy.c:2100
 #4  0xffffffff8116e450 in alloc_pages (order=0x0, gfp_mask=0x20858) at include/linux/gfp.h:336
 #5  __page_cache_alloc (gfp=0x20858) at mm/filemap.c:663
 #6  0xffffffff8116f03c in pagecache_get_page (mapping=0xffff88007cc03908, offset=0xc920f, fgp_flags=0x7, cache_gfp_mask=
0x20858, radix_gfp_mask=0x850) at mm/filemap.c:1096
 #7  0xffffffff812160f4 in find_or_create_page (mapping=<optimized out>, gfp_mask=<optimized out>, offset=0xc920f) at inc
lude/linux/pagemap.h:336
 #8  grow_dev_page (sizebits=0x0, size=0x1000, index=0xc920f, block=0xc920f, bdev=0xffff88007cc03580) at fs/buffer.c:1022
 #9  grow_buffers (size=<optimized out>, block=<optimized out>, bdev=<optimized out>) at fs/buffer.c:1095
 #10 __getblk_slow (size=0x1000, block=0xc920f, bdev=0xffff88007cc03580) at fs/buffer.c:1121
 #11 __getblk (bdev=0xffff88007cc03580, block=0xc920f, size=0x1000) at fs/buffer.c:1395
 #12 0xffffffff8125c8ed in sb_getblk (block=0xc920f, sb=<optimized out>) at include/linux/buffer_head.h:310
 #13 ext4_read_block_bitmap_nowait (sb=0xffff88007c579000, block_group=0x2f) at fs/ext4/balloc.c:407
 #14 0xffffffff8125ced4 in ext4_read_block_bitmap (sb=0xffff88007c579000, block_group=0x2f) at fs/ext4/balloc.c:489
 #15 0xffffffff8167963b in ext4_mb_discard_group_preallocations (sb=0xffff88007c579000, group=0x2f, needed=0x38) at fs/ex
t4/mballoc.c:3798
 #16 0xffffffff8129ddbd in ext4_mb_discard_preallocations (needed=0x38, sb=0xffff88007c579000) at fs/ext4/mballoc.c:4346
 #17 ext4_mb_new_blocks (handle=0xffff88003305ee98, ar=0xffff88001f50b890, errp=0xffff88001f50b880) at fs/ext4/mballoc.c:4479
 #18 0xffffffff81290fd3 in ext4_ext_map_blocks (handle=0xffff88003305ee98, inode=0xffff88007b85b178, map=0xffff88001f50ba50, flags=0x25) at fs/ext4/extents.c:4453
 #19 0xffffffff81265688 in ext4_map_blocks (handle=0xffff88003305ee98, inode=0xffff88007b85b178, map=0xffff88001f50ba50, flags=0x25) at fs/ext4/inode.c:648
 #20 0xffffffff8126af77 in mpage_map_one_extent (mpd=0xffff88001f50ba28, handle=0xffff88003305ee98) at fs/ext4/inode.c:2164
 #21 mpage_map_and_submit_extent (give_up_on_write=<synthetic pointer>, mpd=0xffff88001f50ba28, handle=0xffff88003305ee98) at fs/ext4/inode.c:2219
 #22 ext4_writepages (mapping=0xffff88007b85b350, wbc=0xffff88001f50bb60) at fs/ext4/inode.c:2557
 #23 0xffffffff8117ce81 in do_writepages (mapping=0xffff88007b85b350, wbc=0xffff88001f50bb60) at mm/page-writeback.c:2046
 #24 0xffffffff812096c0 in __writeback_single_inode (inode=0xffff88007b85b178, wbc=0xffff88001f50bb60) at fs/fs-writeback.c:460
 #25 0xffffffff8120b311 in writeback_sb_inodes (sb=0xffff88007c579000, wb=0xffff88007bceb060, work=0xffff8800130f9d80) at fs/fs-writeback.c:687
 #26 0xffffffff8120b68f in __writeback_inodes_wb (wb=0xffff88007bceb060, work=0xffff8800130f9d80) at fs/fs-writeback.c:732
 #27 0xffffffff8120b94b in wb_writeback (wb=0xffff88007bceb060, work=0xffff8800130f9d80) at fs/fs-writeback.c:863
 #28 0xffffffff8120befc in wb_do_writeback (wb=0xffff88007bceb060) at fs/fs-writeback.c:998
 #29 bdi_writeback_workfn (work=0xffff88007bceb078) at fs/fs-writeback.c:1043
 #30 0xffffffff81092cf5 in process_one_work (worker=0xffff88002c555e80, work=0xffff88007bceb078) at kernel/workqueue.c:2081
 #31 0xffffffff8109376b in worker_thread (__worker=0xffff88002c555e80) at kernel/workqueue.c:2212
 #32 0xffffffff8109ba54 in kthread (_create=0xffff88007bf2e2c0) at kernel/kthread.c:207
 #33 <signal handler called>
 #34 0x0000000000000000 in irq_stack_union ()
 #35 0x0000000000000000 in ?? ()

To solve this I set manually with gdb ALLOC_NO_WATERMARKS in
alloc_flags, and the livelock resolved itself.

The fix simply allows a __GFP_NOFAIL allocation to get access to the
emergency reserves in the buddy allocator if __GFP_NOFAIL triggers a
reclaim failure signaling an out-of-memory condition. In the worst case
it will deadlock because we run out of emergency reserves, but denying
it access to the emergency reserves after __GFP_NOFAIL hits an
out-of-memory condition may actually result in a livelock even though
there are still ~50 MiB free! So this is safer. After applying this OOM
livelock fix I cannot reproduce the livelock anymore with __GFP_NOFAIL.
parent 7636db0a
......@@ -2375,8 +2375,27 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
return 0;
}
/* Forward declaration: gfp_to_alloc_flags() is defined later in this file. */
static inline int gfp_to_alloc_flags(gfp_t gfp_mask);

/*
 * gfp_nofail_emergency - let a failing __GFP_NOFAIL allocation tap the
 * emergency reserves.
 *
 * @gfp_mask:    in/out gfp flags of the allocation; may be modified here
 * @alloc_flags: in/out internal allocator flags, recomputed from the new mask
 * @order:       allocation order; only order-0 requests are escalated
 *
 * Called from the OOM path when reclaim has failed.  Without this, a
 * __GFP_NOFAIL allocator can loop forever (livelock) even though the
 * emergency pools still hold free pages.
 */
static void gfp_nofail_emergency(gfp_t *gfp_mask, int *alloc_flags,
unsigned int order)
{
/*
 * If we reached an out of memory condition in the context of
 * a __GFP_NOFAIL (in turn livelock prone) allocation try to
 * give access to the emergency pools, otherwise we could
 * livelock.
 */
if ((*gfp_mask & __GFP_NOFAIL) && !order) {
/*
 * Setting __GFP_MEMALLOC (and clearing __GFP_NOMEMALLOC,
 * which would override it) marks the allocation as allowed
 * to ignore watermarks.
 */
*gfp_mask |= __GFP_MEMALLOC;
*gfp_mask &= ~__GFP_NOMEMALLOC;
/* Recompute alloc_flags so the new mask takes effect. */
*alloc_flags = gfp_to_alloc_flags(*gfp_mask);
/* The mask above must translate to ALLOC_NO_WATERMARKS. */
VM_BUG_ON(!(*alloc_flags & ALLOC_NO_WATERMARKS));
}
}
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
__alloc_pages_may_oom(gfp_t *gfp_mask, unsigned int order, int *alloc_flags,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
struct page *page;
......@@ -2387,7 +2406,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* Acquire the per-zone oom lock for each zone. If that
* fails, somebody else is making progress for us.
*/
if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
if (!oom_zonelist_trylock(ac->zonelist, *gfp_mask)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
......@@ -2398,12 +2417,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
page = get_page_from_freelist(*gfp_mask | __GFP_HARDWALL, order,
ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
if (!(gfp_mask & __GFP_NOFAIL)) {
if (!(*gfp_mask & __GFP_NOFAIL)) {
/* Coredumps can quickly deplete all memory reserves */
if (current->flags & PF_DUMPCORE)
goto out;
......@@ -2414,7 +2433,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
/* The OOM killer does not compensate for light reclaim */
if (!(gfp_mask & __GFP_FS)) {
if (!(*gfp_mask & __GFP_FS)) {
/*
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
......@@ -2424,15 +2443,16 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
if (*gfp_mask & __GFP_THISNODE)
goto out;
}
} else
gfp_nofail_emergency(gfp_mask, alloc_flags, order);
/* Exhausted what can be done so it's blamo time */
if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
if (out_of_memory(ac->zonelist, *gfp_mask, order, ac->nodemask, false)
|| WARN_ON_ONCE(*gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
oom_zonelist_unlock(ac->zonelist, gfp_mask);
oom_zonelist_unlock(ac->zonelist, *gfp_mask);
return page;
}
......@@ -2815,8 +2835,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* start OOM killing tasks.
*/
if (!did_some_progress) {
page = __alloc_pages_may_oom(gfp_mask, order, ac,
&did_some_progress);
page = __alloc_pages_may_oom(&gfp_mask, order,
&alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
if (!did_some_progress)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment