page allocator: break up the allocator entry point into fast and slow paths

The core of the page allocator is one giant function which allocates memory on the stack and makes calculations that may not be needed for every allocation. This patch breaks up the allocator path into fast and slow paths for clarity. Note the slow paths are still inlined but the entry is marked unlikely. If they were not inlined, it actally increases text size to generate the as there is only one call site. Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

page allocator: break up the allocator entry point into fast and slow paths
The core of the page allocator is one giant function which allocates memory on the stack and makes calculations that may not be needed for every allocation. This patch breaks up the allocator path into fast and slow paths for clarity. Note the slow paths are still inlined but the entry is marked unlikely. If they were not inlined, it actally increases text size to generate the as there is only one call site. Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
11e33f6a · Mel Gorman · Linus Torvalds · 7f82af97 · 11e33f6a
Commit 11e33f6a authored Jun 16, 2009 by Mel Gorman Committed by Linus Torvalds Jun 16, 2009
Hide whitespace changes
Inline Side-by-side

Showing with 228 additions and 125 deletions

mm/page_alloc.c mm/page_alloc.c +228 -125

No files found.
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	return page;
 }
-/*
+static inline int
- * This is the 'heart' of the zoned buddy allocator.
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
- */
+				unsigned long pages_reclaimed)
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
 {
-	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	/* Do not loop if specifically requested */
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	if (gfp_mask & __GFP_NORETRY)
-	struct zoneref *z;
+		return 0;
-	struct zone *zone;
-	struct page *page;
-	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
-	int do_retry;
-	int alloc_flags;
-	unsigned long did_some_progress;
-	unsigned long pages_reclaimed = 0;
-	lockdep_trace_alloc(gfp_mask);
+	/*
+	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+	 * means __GFP_NOFAIL, but that may not be true in other
+	 * implementations.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return 1;
-	might_sleep_if(wait);
+	/*
+	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+	 * specified, then we retry until we no longer reclaim any pages
+	 * (above), or we've reclaimed an order of pages at least as
+	 * large as the allocation's order. In both cases, if the
+	 * allocation still fails, we stop retrying.
+	 */
+	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+		return 1;
-	if (should_fail_alloc_page(gfp_mask, order))
+	/*
-		return NULL;
+	 * Don't let big-order allocations loop unless the caller
+	 * explicitly requests that.
+	 */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
-	/* the list of zones suitable for gfp_mask */
+	return 0;
-	z = zonelist->_zonerefs;
+}
-	if (unlikely(!z->zone)) {
-		/*
+static inline struct page *
-		 * Happens if we have an empty zonelist as a result of
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
-		 * GFP_THISNODE being used on a memoryless node
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
-		 */
+	nodemask_t *nodemask)
+{
+	struct page *page;
+	/* Acquire the OOM killer lock for the zones in zonelist */
+	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
-restart:
+	/*
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+	 * Go through the zonelist yet one more time, keep very high watermark
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure.
+	 */
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+		order, zonelist, high_zoneidx,
+		ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 	if (page)
-		goto got_pg;
+		goto out;
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+	/* Exhausted what can be done so it's blamo time */
+	out_of_memory(zonelist, gfp_mask, order);
+out:
+	clear_zonelist_oom(zonelist, gfp_mask);
+	return page;
+}
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	cond_resched();
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	p->flags |= PF_MEMALLOC;
+	lockdep_set_current_reclaim_state(gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+	p->flags &= ~PF_MEMALLOC;
+	cond_resched();
+	if (order != 0)
+		drain_all_pages();
+	if (likely(*did_some_progress))
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
+	return page;
+}
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt())
+		return 1;
+	return 0;
+}
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+	do {
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+		if (!page && gfp_mask & __GFP_NOFAIL)
+			congestion_wait(WRITE, HZ/50);
+	} while (!page && (gfp_mask & __GFP_NOFAIL));
+	return page;
+}
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+						enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct page *page = NULL;
+	int alloc_flags;
+	unsigned long pages_reclaimed = 0;
+	unsigned long did_some_progress;
+	struct task_struct *p = current;
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+	wake_all_kswapd(order, zonelist, high_zoneidx);
-		wakeup_kswapd(zone, order);
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;
+restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto got_pg;
-	/* This allocation should allow future memory freeing. */
 rebalance:
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+	/* Allocate without watermarks if the context allows */
-			&& !in_interrupt()) {
+	if (is_allocation_high_priority(p, gfp_mask)) {
+		/* Do not dip into emergency reserves if specified */
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
+			page = __alloc_pages_high_priority(gfp_mask, order,
-			/* go through the zonelist yet again, ignoring mins */
+				zonelist, high_zoneidx, nodemask);
-			page = get_page_from_freelist(gfp_mask, nodemask, order,
-				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
-			if (gfp_mask & __GFP_NOFAIL) {
-				congestion_wait(WRITE, HZ/50);
-				goto nofail_alloc;
-			}
 		}
+		/* Ensure no recursion into the allocator */
 		goto nopage;
 	}
@@ -1568,93 +1687,42 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (!wait)
 		goto nopage;
-	cond_resched();
+	/* Try direct reclaim and then allocating */
+	page = __alloc_pages_direct_reclaim(gfp_mask, order,
-	/* We now go into synchronous reclaim */
+					zonelist, high_zoneidx,
-	cpuset_memory_pressure_bump();
+					nodemask,
+					alloc_flags, &did_some_progress);
-	p->flags |= PF_MEMALLOC;
+	if (page)
+		goto got_pg;
-	lockdep_set_current_reclaim_state(gfp_mask);
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
-	did_some_progress = try_to_free_pages(zonelist, order,
-						gfp_mask, nodemask);
-	p->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
-	cond_resched();
+	/*
+	 * If we failed to make any progress reclaiming, then we are
+	 * running out of options and have to consider going OOM
+	 */
+	if (!did_some_progress) {
+		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+			page = __alloc_pages_may_oom(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask);
+			if (page)
+				goto got_pg;
-	if (order != 0)
+			/*
-		drain_all_pages();
+			 * The OOM killer does not trigger for high-order allocations
+			 * but if no progress is being made, there are no other
+			 * options and retrying is unlikely to help
+			 */
+			if (order > PAGE_ALLOC_COSTLY_ORDER)
+				goto nopage;
-	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx, alloc_flags);
-		if (page)
-			goto got_pg;
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist, gfp_mask)) {
-			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
-		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
-		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-			order, zonelist, high_zoneidx,
-			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto got_pg;
-		}
-		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto nopage;
-		}
-		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist, gfp_mask);
-		goto restart;
 	}
-	/*
+	/* Check if we should retry the allocation */
-	 * Don't let big-order allocations loop unless the caller explicitly
-	 * requests that.  Wait for some write requests to complete then retry.
-	 *
-	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-	 * means __GFP_NOFAIL, but that may not be true in other
-	 * implementations.
-	 *
-	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-	 * specified, then we retry until we no longer reclaim any pages
-	 * (above), or we've reclaimed an order of pages at least as
-	 * large as the allocation's order. In both cases, if the
-	 * allocation still fails, we stop retrying.
-	 */
 	pages_reclaimed += did_some_progress;
-	do_retry = 0;
+	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
-	if (!(gfp_mask & __GFP_NORETRY)) {
+		/* Wait for some write requests to complete then retry */
-		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-			do_retry = 1;
-		} else {
-			if (gfp_mask & __GFP_REPEAT &&
-				pages_reclaimed < (1 << order))
-					do_retry = 1;
-		}
-		if (gfp_mask & __GFP_NOFAIL)
-			do_retry = 1;
-	}
-	if (do_retry) {
 		congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
@@ -1669,6 +1737,41 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	}
 got_pg:
 	return page;
+}
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct page *page;
+	lockdep_trace_alloc(gfp_mask);
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (unlikely(!page))
+		page = __alloc_pages_slowpath(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
+	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);