Commit e2b149cc authored by Ilya Dryomov, committed by Sage Weil

crush: add chooseleaf_vary_r tunable

The current crush_choose_firstn code will re-use the same 'r' value for
the recursive call.  That means that if we are hitting a collision or
rejection for some reason (say, an OSD that is marked out) and need to
retry, we will keep making the same (bad) choice in that recursive
selection.

Introduce a tunable that fixes that behavior by incorporating the parent
'r' value into the recursive starting point, so that a different path
will be taken in subsequent placement attempts.

Note that this was done from the get-go for the new crush_choose_indep
algorithm.

This was exposed by a user who was seeing PGs stuck in active+remapped
after reweight-by-utilization because the up set mapped to a single OSD.

Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
parent 6ed1002f
...@@ -173,6 +173,12 @@ struct crush_map { ...@@ -173,6 +173,12 @@ struct crush_map {
* apply to a collision: in that case we will retry as we used * apply to a collision: in that case we will retry as we used
* to. */ * to. */
__u32 chooseleaf_descend_once; __u32 chooseleaf_descend_once;
/* if non-zero, feed r into chooseleaf, bit-shifted right by
 * (chooseleaf_vary_r - 1) bits. a value of 1 is best for new
 * clusters. for legacy clusters that want to limit reshuffling,
 * a value of 3 or 4 will make the mappings line up a bit better
 * with previous mappings. */
__u8 chooseleaf_vary_r;
}; };
......
...@@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map, ...@@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries * @local_retries: localized retries
* @local_fallback_retries: localized fallback retries * @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
* @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf) * @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent
*/ */
static int crush_choose_firstn(const struct crush_map *map, static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket, struct crush_bucket *bucket,
...@@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_retries, unsigned int local_retries,
unsigned int local_fallback_retries, unsigned int local_fallback_retries,
int recurse_to_leaf, int recurse_to_leaf,
int *out2) unsigned int vary_r,
int *out2,
int parent_r)
{ {
int rep; int rep;
unsigned int ftotal, flocal; unsigned int ftotal, flocal;
...@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
int itemtype; int itemtype;
int collide, reject; int collide, reject;
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
bucket->id, x, outpos, numrep); recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep,
tries, recurse_tries, local_retries, local_fallback_retries,
parent_r);
for (rep = outpos; rep < numrep; rep++) { for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */ /* keep trying until we get a non-out, non-colliding item */
...@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
do { do {
collide = 0; collide = 0;
retry_bucket = 0; retry_bucket = 0;
r = rep; r = rep + parent_r;
/* r' = r + f_total */ /* r' = r + f_total */
r += ftotal; r += ftotal;
...@@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 0; reject = 0;
if (!collide && recurse_to_leaf) { if (!collide && recurse_to_leaf) {
if (item < 0) { if (item < 0) {
int sub_r;
if (vary_r)
sub_r = r >> (vary_r-1);
else
sub_r = 0;
if (crush_choose_firstn(map, if (crush_choose_firstn(map,
map->buckets[-1-item], map->buckets[-1-item],
weight, weight_max, weight, weight_max,
...@@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map,
local_retries, local_retries,
local_fallback_retries, local_fallback_retries,
0, 0,
NULL) <= outpos) vary_r,
NULL,
sub_r) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
reject = 1; reject = 1;
} else { } else {
...@@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map, ...@@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_retries = map->choose_local_tries; int choose_local_retries = map->choose_local_tries;
int choose_local_fallback_retries = map->choose_local_fallback_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries;
int vary_r = map->chooseleaf_vary_r;
if ((__u32)ruleno >= map->max_rules) { if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno); dprintk(" bad ruleno %d\n", ruleno);
return 0; return 0;
...@@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map, ...@@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map,
choose_local_retries, choose_local_retries,
choose_local_fallback_retries, choose_local_fallback_retries,
recurse_to_leaf, recurse_to_leaf,
c+osize); vary_r,
c+osize,
0);
} else { } else {
crush_choose_indep( crush_choose_indep(
map, map,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment