Commit 5d6a6a75 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph fixes from Sage Weil:
 "We have a few wire protocol compatibility fixes, ports of a few recent
  CRUSH mapping changes, and a couple error path fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: MOSDOpReply v7 encoding
  libceph: advertise support for TUNABLES5
  crush: decode and initialize chooseleaf_stable
  crush: add chooseleaf_stable tunable
  crush: ensure take bucket value is valid
  crush: ensure bucket id is valid before indexing buckets array
  ceph: fix snap context leak in error path
  ceph: checking for IS_ERR instead of NULL
parents 9b108828 b0b31a8f
...@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
false, GFP_NOFS); false, GFP_NOFS);
if (IS_ERR(req)) { if (!req) {
ret = PTR_ERR(req); ret = -ENOMEM;
req = orig_req; req = orig_req;
goto out; goto out;
} }
...@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_osdc_build_request(req, req->r_ops[0].extent.offset, ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
snapc, CEPH_NOSNAP, &aio_req->mtime); snapc, CEPH_NOSNAP, &aio_req->mtime);
ceph_put_snap_context(snapc);
ceph_osdc_put_request(orig_req); ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req; req->r_callback = ceph_aio_complete_req;
...@@ -731,6 +730,7 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -731,6 +730,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_aio_complete_req(req, NULL); ceph_aio_complete_req(req, NULL);
} }
ceph_put_snap_context(snapc);
kfree(aio_work); kfree(aio_work);
} }
......
...@@ -63,6 +63,18 @@ ...@@ -63,6 +63,18 @@
#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
/* /*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
...@@ -108,7 +120,9 @@ static inline u64 ceph_sanitize_features(u64 features) ...@@ -108,7 +120,9 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
CEPH_FEATURE_CRUSH_V4) CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
#define CEPH_FEATURES_REQUIRED_DEFAULT \ #define CEPH_FEATURES_REQUIRED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \ (CEPH_FEATURE_NOSRCADDR | \
......
...@@ -59,7 +59,8 @@ enum { ...@@ -59,7 +59,8 @@ enum {
CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
}; };
/* /*
...@@ -205,6 +206,11 @@ struct crush_map { ...@@ -205,6 +206,11 @@ struct crush_map {
* mappings line up a bit better with previous mappings. */ * mappings line up a bit better with previous mappings. */
__u8 chooseleaf_vary_r; __u8 chooseleaf_vary_r;
/* if true, it makes chooseleaf firstn to return stable results (if
* no local retry) so that data migrations would be optimal when some
* device fails. */
__u8 chooseleaf_stable;
#ifndef __KERNEL__ #ifndef __KERNEL__
/* /*
* version 0 (original) of straw_calc has various flaws. version 1 * version 0 (original) of straw_calc has various flaws. version 1
......
...@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, ...@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries * @local_retries: localized retries
* @local_fallback_retries: localized fallback retries * @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
* @stable: stable mode starts rep=0 in the recursive call for all replicas
* @vary_r: pass r to recursive calls * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf) * @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent * @parent_r: r value passed from the parent
...@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_fallback_retries, unsigned int local_fallback_retries,
int recurse_to_leaf, int recurse_to_leaf,
unsigned int vary_r, unsigned int vary_r,
unsigned int stable,
int *out2, int *out2,
int parent_r) int parent_r)
{ {
...@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
int collide, reject; int collide, reject;
int count = out_size; int count = out_size;
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
recurse_to_leaf ? "_LEAF" : "", recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep, bucket->id, x, outpos, numrep,
tries, recurse_tries, local_retries, local_fallback_retries, tries, recurse_tries, local_retries, local_fallback_retries,
parent_r); parent_r, stable);
for (rep = outpos; rep < numrep && count > 0 ; rep++) { for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
/* keep trying until we get a non-out, non-colliding item */ /* keep trying until we get a non-out, non-colliding item */
ftotal = 0; ftotal = 0;
skip_rep = 0; skip_rep = 0;
...@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
if (crush_choose_firstn(map, if (crush_choose_firstn(map,
map->buckets[-1-item], map->buckets[-1-item],
weight, weight_max, weight, weight_max,
x, outpos+1, 0, x, stable ? 1 : outpos+1, 0,
out2, outpos, count, out2, outpos, count,
recurse_tries, 0, recurse_tries, 0,
local_retries, local_retries,
local_fallback_retries, local_fallback_retries,
0, 0,
vary_r, vary_r,
stable,
NULL, NULL,
sub_r) <= outpos) sub_r) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
...@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_fallback_retries = map->choose_local_fallback_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries;
int vary_r = map->chooseleaf_vary_r; int vary_r = map->chooseleaf_vary_r;
int stable = map->chooseleaf_stable;
if ((__u32)ruleno >= map->max_rules) { if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno); dprintk(" bad ruleno %d\n", ruleno);
...@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, ...@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
case CRUSH_RULE_TAKE: case CRUSH_RULE_TAKE:
if ((curstep->arg1 >= 0 && if ((curstep->arg1 >= 0 &&
curstep->arg1 < map->max_devices) || curstep->arg1 < map->max_devices) ||
(-1-curstep->arg1 < map->max_buckets && (-1-curstep->arg1 >= 0 &&
-1-curstep->arg1 < map->max_buckets &&
map->buckets[-1-curstep->arg1])) { map->buckets[-1-curstep->arg1])) {
w[0] = curstep->arg1; w[0] = curstep->arg1;
wsize = 1; wsize = 1;
...@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, ...@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
vary_r = curstep->arg1; vary_r = curstep->arg1;
break; break;
case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
if (curstep->arg1 >= 0)
stable = curstep->arg1;
break;
case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1; firstn = 1;
...@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
osize = 0; osize = 0;
for (i = 0; i < wsize; i++) { for (i = 0; i < wsize; i++) {
int bno;
/* /*
* see CRUSH_N, CRUSH_N_MINUS macros. * see CRUSH_N, CRUSH_N_MINUS macros.
* basically, numrep <= 0 means relative to * basically, numrep <= 0 means relative to
...@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, ...@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
continue; continue;
} }
j = 0; j = 0;
/* make sure bucket id is valid */
bno = -1 - w[i];
if (bno < 0 || bno >= map->max_buckets) {
/* w[i] is probably CRUSH_ITEM_NONE */
dprintk(" bad w[i] %d\n", w[i]);
continue;
}
if (firstn) { if (firstn) {
int recurse_tries; int recurse_tries;
if (choose_leaf_tries) if (choose_leaf_tries)
...@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
recurse_tries = choose_tries; recurse_tries = choose_tries;
osize += crush_choose_firstn( osize += crush_choose_firstn(
map, map,
map->buckets[-1-w[i]], map->buckets[bno],
weight, weight_max, weight, weight_max,
x, numrep, x, numrep,
curstep->arg2, curstep->arg2,
...@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
choose_local_fallback_retries, choose_local_fallback_retries,
recurse_to_leaf, recurse_to_leaf,
vary_r, vary_r,
stable,
c+osize, c+osize,
0); 0);
} else { } else {
...@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map,
numrep : (result_max-osize)); numrep : (result_max-osize));
crush_choose_indep( crush_choose_indep(
map, map,
map->buckets[-1-w[i]], map->buckets[bno],
weight, weight_max, weight, weight_max,
x, out_size, numrep, x, out_size, numrep,
curstep->arg2, curstep->arg2,
......
...@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
u32 osdmap_epoch; u32 osdmap_epoch;
int already_completed; int already_completed;
u32 bytes; u32 bytes;
u8 decode_redir;
unsigned int i; unsigned int i;
tid = le64_to_cpu(msg->hdr.tid); tid = le64_to_cpu(msg->hdr.tid);
...@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
p += 8 + 4; /* skip replay_version */ p += 8 + 4; /* skip replay_version */
p += 8; /* skip user_version */ p += 8; /* skip user_version */
if (le16_to_cpu(msg->hdr.version) >= 7)
ceph_decode_8_safe(&p, end, decode_redir, bad_put);
else
decode_redir = 1;
} else {
decode_redir = 0;
}
if (decode_redir) {
err = ceph_redirect_decode(&p, end, &redir); err = ceph_redirect_decode(&p, end, &redir);
if (err) if (err)
goto bad_put; goto bad_put;
......
...@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
c->choose_local_tries = ceph_decode_32(p); c->choose_local_tries = ceph_decode_32(p);
c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p);
c->choose_total_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p);
dout("crush decode tunable choose_local_tries = %d", dout("crush decode tunable choose_local_tries = %d\n",
c->choose_local_tries); c->choose_local_tries);
dout("crush decode tunable choose_local_fallback_tries = %d", dout("crush decode tunable choose_local_fallback_tries = %d\n",
c->choose_local_fallback_tries); c->choose_local_fallback_tries);
dout("crush decode tunable choose_total_tries = %d", dout("crush decode tunable choose_total_tries = %d\n",
c->choose_total_tries); c->choose_total_tries);
ceph_decode_need(p, end, sizeof(u32), done); ceph_decode_need(p, end, sizeof(u32), done);
c->chooseleaf_descend_once = ceph_decode_32(p); c->chooseleaf_descend_once = ceph_decode_32(p);
dout("crush decode tunable chooseleaf_descend_once = %d", dout("crush decode tunable chooseleaf_descend_once = %d\n",
c->chooseleaf_descend_once); c->chooseleaf_descend_once);
ceph_decode_need(p, end, sizeof(u8), done); ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_vary_r = ceph_decode_8(p); c->chooseleaf_vary_r = ceph_decode_8(p);
dout("crush decode tunable chooseleaf_vary_r = %d", dout("crush decode tunable chooseleaf_vary_r = %d\n",
c->chooseleaf_vary_r); c->chooseleaf_vary_r);
/* skip straw_calc_version, allowed_bucket_algs */
ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
*p += sizeof(u8) + sizeof(u32);
ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_stable = ceph_decode_8(p);
dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable);
done: done:
dout("crush_decode success\n"); dout("crush_decode success\n");
return c; return c;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment