Commit 7858d7bc, authored by Feng Tang, committed by Linus Torvalds

mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy

The MPOL_LOCAL policy has been set up as a real policy, but it is still
handled like a fake MPOL_PREFERRED policy with one internal MPOL_F_LOCAL
flag bit set, and there are many places that have to distinguish the real
'prefer' policy from the 'local' policy, which is quite confusing.

In the current code, there are 4 cases in which MPOL_LOCAL is used:

1. user specifies 'local' policy

2. user specifies 'prefer' policy, but with empty nodemask

3. system 'default' policy is used

4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES
   flag set, and when it is rebound to a nodemask which doesn't contain
   the 'preferred' node, it will behave as the 'local' policy

So make 'local' a real policy instead of a fake 'prefer' one, and kill the
MPOL_F_LOCAL bit, which greatly reduces confusion when reading the code.

For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal
Hocko pointed out:

: I do believe that rebinding preferred policy is just bogus and it should
: be dropped altogether on the ground that a preference is a mere hint from
: userspace where to start the allocation.  Unless I am missing something
: cpusets will be always authoritative for the final placement.  The
: preferred node just acts as a starting point and it should be really
: preserved when cpusets changes.  Otherwise we have a very subtle behavior
: corner cases.

So drop all the tricky transformations between 'prefer' and 'local', and
just record the new nodemask on rebinding.

[feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko]
  Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com
[feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal]
  Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com

Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent b26e517a
...@@ -60,7 +60,6 @@ enum { ...@@ -60,7 +60,6 @@ enum {
* are never OR'ed into the mode in mempolicy API arguments. * are never OR'ed into the mode in mempolicy API arguments.
*/ */
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
......
...@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0; ...@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0;
*/ */
static struct mempolicy default_policy = { static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */ .refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_PREFERRED, .mode = MPOL_LOCAL,
.flags = MPOL_F_LOCAL,
}; };
static struct mempolicy preferred_node_policy[MAX_NUMNODES]; static struct mempolicy preferred_node_policy[MAX_NUMNODES];
...@@ -200,11 +199,8 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) ...@@ -200,11 +199,8 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{ {
if (!nodes) if (nodes_empty(*nodes))
pol->flags |= MPOL_F_LOCAL; /* local allocation */ return -EINVAL;
else if (nodes_empty(*nodes))
return -EINVAL; /* no allowed nodes */
else
pol->v.preferred_node = first_node(*nodes); pol->v.preferred_node = first_node(*nodes);
return 0; return 0;
} }
...@@ -220,8 +216,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) ...@@ -220,8 +216,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
/* /*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes * any, for the new policy. mpol_new() has already validated the nodes
* parameter with respect to the policy mode and flags. But, we need to * parameter with respect to the policy mode and flags.
* handle an empty nodemask with MPOL_PREFERRED here.
* *
* Must be called holding task's alloc_lock to protect task's mems_allowed * Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write. * and mempolicy. May also be called holding the mmap_lock for write.
...@@ -231,17 +226,20 @@ static int mpol_set_nodemask(struct mempolicy *pol, ...@@ -231,17 +226,20 @@ static int mpol_set_nodemask(struct mempolicy *pol,
{ {
int ret; int ret;
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ /*
if (pol == NULL) * Default (pol==NULL) resp. local memory policies are not a
* subject of any remapping. They also do not need any special
* constructor.
*/
if (!pol || pol->mode == MPOL_LOCAL)
return 0; return 0;
/* Check N_MEMORY */ /* Check N_MEMORY */
nodes_and(nsc->mask1, nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]); cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes); VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
nodes = NULL; /* explicit local allocation */
else {
if (pol->flags & MPOL_F_RELATIVE_NODES) if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else else
...@@ -250,14 +248,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, ...@@ -250,14 +248,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
if (mpol_store_user_nodemask(pol)) if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes; pol->w.user_nodemask = *nodes;
else else
pol->w.cpuset_mems_allowed = pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
cpuset_current_mems_allowed;
}
if (nodes)
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
else
ret = mpol_ops[pol->mode].create(pol, NULL);
return ret; return ret;
} }
...@@ -290,13 +283,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, ...@@ -290,13 +283,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) || if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))) (flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
mode = MPOL_LOCAL;
} }
} else if (mode == MPOL_LOCAL) { } else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) || if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)) (flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes)) } else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
...@@ -344,25 +338,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) ...@@ -344,25 +338,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
static void mpol_rebind_preferred(struct mempolicy *pol, static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes) const nodemask_t *nodes)
{ {
nodemask_t tmp;
if (pol->flags & MPOL_F_STATIC_NODES) {
int node = first_node(pol->w.user_nodemask);
if (node_isset(node, *nodes)) {
pol->v.preferred_node = node;
pol->flags &= ~MPOL_F_LOCAL;
} else
pol->flags |= MPOL_F_LOCAL;
} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
pol->v.preferred_node = first_node(tmp);
} else if (!(pol->flags & MPOL_F_LOCAL)) {
pol->v.preferred_node = node_remap(pol->v.preferred_node,
pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes; pol->w.cpuset_mems_allowed = *nodes;
}
} }
/* /*
...@@ -376,7 +352,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) ...@@ -376,7 +352,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{ {
if (!pol) if (!pol)
return; return;
if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return; return;
...@@ -427,6 +403,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { ...@@ -427,6 +403,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_bind, .create = mpol_new_bind,
.rebind = mpol_rebind_nodemask, .rebind = mpol_rebind_nodemask,
}, },
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
}; };
static int migrate_page_add(struct page *page, struct list_head *pagelist, static int migrate_page_add(struct page *page, struct list_head *pagelist,
...@@ -919,10 +898,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) ...@@ -919,10 +898,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_INTERLEAVE: case MPOL_INTERLEAVE:
*nodes = p->v.nodes; *nodes = p->v.nodes;
break; break;
case MPOL_LOCAL:
/* return empty node mask for local allocation */
break;
case MPOL_PREFERRED: case MPOL_PREFERRED:
if (!(p->flags & MPOL_F_LOCAL))
node_set(p->v.preferred_node, *nodes); node_set(p->v.preferred_node, *nodes);
/* else return empty node mask for local allocation */
break; break;
default: default:
BUG(); BUG();
...@@ -1894,9 +1875,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) ...@@ -1894,9 +1875,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Return the node id preferred by the given mempolicy, or the given id */ /* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{ {
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) if (policy->mode == MPOL_PREFERRED) {
nd = policy->v.preferred_node; nd = policy->v.preferred_node;
else { } else {
/* /*
* __GFP_THISNODE shouldn't even be used with the bind policy * __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the * because we might easily break the expectation to stay on the
...@@ -1933,14 +1914,11 @@ unsigned int mempolicy_slab_node(void) ...@@ -1933,14 +1914,11 @@ unsigned int mempolicy_slab_node(void)
return node; return node;
policy = current->mempolicy; policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL) if (!policy)
return node; return node;
switch (policy->mode) { switch (policy->mode) {
case MPOL_PREFERRED: case MPOL_PREFERRED:
/*
* handled MPOL_F_LOCAL above
*/
return policy->v.preferred_node; return policy->v.preferred_node;
case MPOL_INTERLEAVE: case MPOL_INTERLEAVE:
...@@ -1960,6 +1938,8 @@ unsigned int mempolicy_slab_node(void) ...@@ -1960,6 +1938,8 @@ unsigned int mempolicy_slab_node(void)
&policy->v.nodes); &policy->v.nodes);
return z->zone ? zone_to_nid(z->zone) : node; return z->zone ? zone_to_nid(z->zone) : node;
} }
case MPOL_LOCAL:
return node;
default: default:
BUG(); BUG();
...@@ -2072,9 +2052,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) ...@@ -2072,9 +2052,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy; mempolicy = current->mempolicy;
switch (mempolicy->mode) { switch (mempolicy->mode) {
case MPOL_PREFERRED: case MPOL_PREFERRED:
if (mempolicy->flags & MPOL_F_LOCAL)
nid = numa_node_id();
else
nid = mempolicy->v.preferred_node; nid = mempolicy->v.preferred_node;
init_nodemask_of_node(mask, nid); init_nodemask_of_node(mask, nid);
break; break;
...@@ -2084,6 +2061,11 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) ...@@ -2084,6 +2061,11 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
*mask = mempolicy->v.nodes; *mask = mempolicy->v.nodes;
break; break;
case MPOL_LOCAL:
nid = numa_node_id();
init_nodemask_of_node(mask, nid);
break;
default: default:
BUG(); BUG();
} }
...@@ -2188,7 +2170,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, ...@@ -2188,7 +2170,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* If the policy is interleave, or does not allow the current * If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way. * node in its nodemask, we allocate the standard way.
*/ */
if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) if (pol->mode == MPOL_PREFERRED)
hpage_node = pol->v.preferred_node; hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol); nmask = policy_nodemask(gfp, pol);
...@@ -2324,10 +2306,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) ...@@ -2324,10 +2306,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE: case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes); return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED: case MPOL_PREFERRED:
/* a's ->flags is the same as b's */
if (a->flags & MPOL_F_LOCAL)
return true;
return a->v.preferred_node == b->v.preferred_node; return a->v.preferred_node == b->v.preferred_node;
case MPOL_LOCAL:
return true;
default: default:
BUG(); BUG();
return false; return false;
...@@ -2465,12 +2446,13 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long ...@@ -2465,12 +2446,13 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break; break;
case MPOL_PREFERRED: case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
polnid = numa_node_id();
else
polnid = pol->v.preferred_node; polnid = pol->v.preferred_node;
break; break;
case MPOL_LOCAL:
polnid = numa_node_id();
break;
case MPOL_BIND: case MPOL_BIND:
/* Optimize placement among multiple nodes via NUMA balancing */ /* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) { if (pol->flags & MPOL_F_MORON) {
...@@ -2835,9 +2817,6 @@ void numa_default_policy(void) ...@@ -2835,9 +2817,6 @@ void numa_default_policy(void)
* Parse and format mempolicy from/to strings * Parse and format mempolicy from/to strings
*/ */
/*
* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
static const char * const policy_modes[] = static const char * const policy_modes[] =
{ {
[MPOL_DEFAULT] = "default", [MPOL_DEFAULT] = "default",
...@@ -2915,7 +2894,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) ...@@ -2915,7 +2894,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*/ */
if (nodelist) if (nodelist)
goto out; goto out;
mode = MPOL_PREFERRED;
break; break;
case MPOL_DEFAULT: case MPOL_DEFAULT:
/* /*
...@@ -2959,7 +2937,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) ...@@ -2959,7 +2937,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
else if (nodelist) else if (nodelist)
new->v.preferred_node = first_node(nodes); new->v.preferred_node = first_node(nodes);
else else
new->flags |= MPOL_F_LOCAL; new->mode = MPOL_LOCAL;
/* /*
* Save nodes for contextualization: this will be used to "clone" * Save nodes for contextualization: this will be used to "clone"
...@@ -3005,11 +2983,9 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) ...@@ -3005,11 +2983,9 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
switch (mode) { switch (mode) {
case MPOL_DEFAULT: case MPOL_DEFAULT:
case MPOL_LOCAL:
break; break;
case MPOL_PREFERRED: case MPOL_PREFERRED:
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes); node_set(pol->v.preferred_node, nodes);
break; break;
case MPOL_BIND: case MPOL_BIND:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment