Commit 66a0e2d5 authored by Ilya Dryomov

crush: remove mutable part of CRUSH map

Then add it to the working state. It would be very nice if we didn't
have to take a lock to calculate a crush placement. By moving the
permutation array into the working data, we can treat the CRUSH map as
immutable.

Reflects ceph.git commit cbcd039651c0569551cb90d26ce27e1432671f2a.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent 1b6a78b5
......@@ -175,6 +175,7 @@ struct ceph_osdmap {
struct mutex crush_scratch_mutex;
int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
void *crush_workspace;
};
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
......
......@@ -135,13 +135,6 @@ struct crush_bucket {
__u32 size; /* num items */
__s32 *items;
/*
* cached random permutation: used for uniform bucket and for
* the linear search fallback for the other bucket types.
*/
__u32 perm_x; /* @x for which *perm is defined */
__u32 perm_n; /* num elements of *perm that are permuted/defined */
__u32 *perm;
};
struct crush_bucket_uniform {
......@@ -211,6 +204,21 @@ struct crush_map {
* device fails. */
__u8 chooseleaf_stable;
/*
* This value is calculated after decode or construction by
* the builder. It is exposed here (rather than having a
* 'build CRUSH working space' function) so that callers can
* reserve a static buffer, allocate space on the stack, or
* otherwise avoid calling into the heap allocator if they
* want to. The size of the working space depends on the map,
* while the size of the scratch vector passed to the mapper
* depends on the size of the desired result set.
*
* Nothing stops the caller from allocating both in one swell
* foop and passing in two pointers, though.
*/
size_t working_size;
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
......@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
return ((i+1) << 1)-1;
}
/*
* These data structures are private to the CRUSH implementation. They
* are exposed in this header file because builder needs their
* definitions to calculate the total working size.
*
* Moving this out of the crush map allows us to treat the CRUSH map as
* immutable within the mapper and removes the requirement for a CRUSH
* map lock.
*/
/*
 * Per-bucket working state.  Carries the cached permutation fields
 * (perm_x, perm_n, perm) so that they live in the caller's workspace
 * instead of in the CRUSH map itself.
 */
struct crush_work_bucket {
__u32 perm_x; /* @x for which *perm is defined */
__u32 perm_n; /* num elements of *perm that are permuted/defined */
__u32 *perm; /* Permutation of the bucket's items */
};
/*
 * Top-level CRUSH working state: an array of pointers to the
 * per-bucket working stores (struct crush_work_bucket).
 */
struct crush_work {
struct crush_work_bucket **work; /* Per-bucket working store */
};
#endif
......@@ -15,6 +15,8 @@ extern int crush_do_rule(const struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
const __u32 *weights, int weight_max,
int *scratch);
void *cwin, int *scratch);
void crush_init_workspace(const struct crush_map *map, void *v);
#endif
......@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
/*
 * Release a uniform bucket: free the arrays reachable through b->h
 * (presumably the common struct crush_bucket part) and then the bucket
 * structure itself.
 */
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
/*
 * NOTE(review): this listing is a diff rendered without +/- markers;
 * the perm free below looks like the line this commit removes, since
 * perm moves out of struct crush_bucket -- confirm against the tree.
 */
kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
......@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
kfree(b->item_weights);
kfree(b->sum_weights);
kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
kfree(b->h.perm);
kfree(b->h.items);
kfree(b->node_weights);
kfree(b);
......@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
kfree(b->straws);
kfree(b->item_weights);
kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
......@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
/*
 * Release a straw2 bucket: free its item_weights array, the arrays
 * reachable through b->h (presumably the common struct crush_bucket
 * part) and then the bucket structure itself.
 */
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{
kfree(b->item_weights);
/*
 * NOTE(review): this listing is a diff rendered without +/- markers;
 * the perm free below looks like the line this commit removes, since
 * perm moves out of struct crush_bucket -- confirm against the tree.
 */
kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
......
This diff is collapsed.
......@@ -153,6 +153,32 @@ static int skip_name_map(void **p, void *end)
return -EINVAL;
}
static void crush_finalize(struct crush_map *c)
{
__s32 b;
/* Space for the array of pointers to per-bucket workspace */
c->working_size = sizeof(struct crush_work) +
c->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < c->max_buckets; b++) {
if (!c->buckets[b])
continue;
switch (c->buckets[b]->alg) {
default:
/*
* The base case, permutation variables and
* the pointer to the permutation array.
*/
c->working_size += sizeof(struct crush_work_bucket);
break;
}
/* Every bucket has a permutation array. */
c->working_size += c->buckets[b]->size * sizeof(__u32);
}
}
static struct crush_map *crush_decode(void *pbyval, void *end)
{
struct crush_map *c;
......@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
if (b->items == NULL)
goto badmem;
b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
if (b->perm == NULL)
goto badmem;
b->perm_n = 0;
ceph_decode_need(p, end, b->size*sizeof(u32), bad);
for (j = 0; j < b->size; j++)
......@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable);
crush_finalize(c);
done:
dout("crush_decode success\n");
return c;
......@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kfree(map->osd_weight);
kfree(map->osd_addr);
kfree(map->osd_primary_affinity);
kfree(map->crush_workspace);
kfree(map);
}
......@@ -810,12 +835,23 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
/*
 * Make @crush the CRUSH map of @map and pair it with a newly allocated,
 * initialised workspace of crush->working_size bytes.
 *
 * Returns 0 on success, after releasing the previous CRUSH map (if any)
 * and the previous workspace.  If @crush is an error pointer, its error
 * code is returned and @map is not touched.  If the workspace cannot be
 * allocated, @crush is destroyed and -ENOMEM is returned.
 */
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
	void *ws;

	if (IS_ERR(crush))
		return PTR_ERR(crush);

	ws = kmalloc(crush->working_size, GFP_NOIO);
	if (!ws) {
		/* We own @crush at this point, so drop it on failure. */
		crush_destroy(crush);
		return -ENOMEM;
	}
	crush_init_workspace(crush, ws);

	/* Tear down the old pair before installing the new one. */
	kfree(map->crush_workspace);
	if (map->crush)
		crush_destroy(map->crush);

	map->crush_workspace = ws;
	map->crush = crush;
	return 0;
}
......@@ -1940,7 +1976,8 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
mutex_lock(&map->crush_scratch_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
weight, weight_max, map->crush_scratch_ary);
weight, weight_max, map->crush_workspace,
map->crush_scratch_ary);
mutex_unlock(&map->crush_scratch_mutex);
return r;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment