Commit 82a1bd8c authored by Yoni Fogel's avatar Yoni Fogel

addresses #523

implemented lock escalation MINUS flagging of messy transactions
implemented tests for lock escalation

git-svn-id: file:///svn/tokudb@2902 c7de825b-a66e-492c-adef-691d508d4ae1
parent 64ba924c
...@@ -122,10 +122,11 @@ int toku__lt_point_cmp(const toku_point* x, const toku_point* y) { ...@@ -122,10 +122,11 @@ int toku__lt_point_cmp(const toku_point* x, const toku_point* y) {
toku__recreate_DBT(&point_2, y->data_payload, y->data_len)); toku__recreate_DBT(&point_2, y->data_payload, y->data_len));
} }
static inline BOOL toku__lt_fraction_ranges_free(toku_lock_tree* tree, u_int32_t denominator) { static inline BOOL toku__lt_percent_ranges_free(toku_lock_tree* tree,
assert(tree && tree->num_ranges && denominator); u_int32_t percent) {
return *tree->num_ranges <= assert(tree && tree->num_ranges && (percent <= 100));
tree->max_ranges - (tree->max_ranges / denominator); u_int64_t max_ranges64= tree->max_ranges;
return *tree->num_ranges <= max_ranges64 * (100 - percent) / 100;
} }
/* Functions to update the range count and compare it with the /* Functions to update the range count and compare it with the
...@@ -283,6 +284,12 @@ static inline int toku__lt_selfwrite(toku_lock_tree* tree, DB_TXN* txn, ...@@ -283,6 +284,12 @@ static inline int toku__lt_selfwrite(toku_lock_tree* tree, DB_TXN* txn,
} }
static inline BOOL toku__dominated(toku_range* query, toku_range* by) {
assert(query && by);
return (toku__lt_point_cmp(query->left, by->left) >= 0 &&
toku__lt_point_cmp(query->right, by->right) <= 0);
}
/* /*
This function only supports non-overlapping trees. This function only supports non-overlapping trees.
Uses the standard definition of dominated from the design document. Uses the standard definition of dominated from the design document.
...@@ -316,8 +323,7 @@ static inline int toku__lt_rt_dominates(toku_lock_tree* tree, toku_range* query, ...@@ -316,8 +323,7 @@ static inline int toku__lt_rt_dominates(toku_lock_tree* tree, toku_range* query,
return 0; return 0;
} }
assert(numfound == 1); assert(numfound == 1);
*dominated = (toku__lt_point_cmp(query->left, buf[0].left) >= 0 && *dominated = toku__dominated(query, &buf[0]);
toku__lt_point_cmp(query->right, buf[0].right) <= 0);
return 0; return 0;
} }
...@@ -392,7 +398,8 @@ static inline int toku__lt_meets(toku_lock_tree* tree, toku_range* query, ...@@ -392,7 +398,8 @@ static inline int toku__lt_meets(toku_lock_tree* tree, toku_range* query,
/* /*
Determines whether 'query' meets 'rt' at txn2 not equal to txn. Determines whether 'query' meets 'rt' at txn2 not equal to txn.
This function supports all range trees, but queries must be a single point. This function supports all range trees, but queries must either be a single point,
or the range tree is homogenous.
Uses the standard definition of 'query' meets 'tree' at 'data' from the Uses the standard definition of 'query' meets 'tree' at 'data' from the
design document. design document.
*/ */
...@@ -400,7 +407,7 @@ static inline int toku__lt_meets_peer(toku_lock_tree* tree, toku_range* query, ...@@ -400,7 +407,7 @@ static inline int toku__lt_meets_peer(toku_lock_tree* tree, toku_range* query,
toku_range_tree* rt, BOOL is_homogenous, toku_range_tree* rt, BOOL is_homogenous,
DB_TXN* self, BOOL* met) { DB_TXN* self, BOOL* met) {
assert(tree && query && rt && self && met); assert(tree && query && rt && self && met);
assert(query->left == query->right); assert(query->left == query->right || is_homogenous);
const u_int32_t query_size = is_homogenous ? 1 : 2; const u_int32_t query_size = is_homogenous ? 1 : 2;
toku_range buffer[2]; toku_range buffer[2];
...@@ -1071,7 +1078,7 @@ static inline int toku__lt_write_range_conflicts_reads(toku_lock_tree* tree, ...@@ -1071,7 +1078,7 @@ static inline int toku__lt_write_range_conflicts_reads(toku_lock_tree* tree,
while ((forest = toku_rth_next(tree->rth)) != NULL) { while ((forest = toku_rth_next(tree->rth)) != NULL) {
if (forest->self_read != NULL && forest->hash_key != txn) { if (forest->self_read != NULL && forest->hash_key != txn) {
r = toku__lt_meets_peer(tree, query, forest->self_read, TRUE, txn, r = toku__lt_meets_peer(tree, query, forest->self_read, TRUE, txn,///
&met); &met);
if (r!=0) { goto cleanup; } if (r!=0) { goto cleanup; }
if (met) { r = DB_LOCK_NOTGRANTED; goto cleanup; } if (met) { r = DB_LOCK_NOTGRANTED; goto cleanup; }
...@@ -1082,7 +1089,13 @@ static inline int toku__lt_write_range_conflicts_reads(toku_lock_tree* tree, ...@@ -1082,7 +1089,13 @@ static inline int toku__lt_write_range_conflicts_reads(toku_lock_tree* tree,
return r; return r;
} }
static inline int toku__border_escalation_trivial(toku_lock_tree* tree, toku_range* border_range, BOOL* trivial) { /*
Tests whether a range from BorderWrite is trivially escalatable.
i.e. No read locks from other transactions overlap the range.
*/
static inline int toku__border_escalation_trivial(toku_lock_tree* tree,
toku_range* border_range,
BOOL* trivial) {
assert(tree && border_range && trivial); assert(tree && border_range && trivial);
int r = ENOSYS; int r = ENOSYS;
...@@ -1099,23 +1112,111 @@ static inline int toku__border_escalation_trivial(toku_lock_tree* tree, toku_ran ...@@ -1099,23 +1112,111 @@ static inline int toku__border_escalation_trivial(toku_lock_tree* tree, toku_ran
return r; return r;
} }
static inline int toku__escalate_reads_from_border_range(toku_lock_tree* tree, toku_range* border_range) { /* */
assert(tree && border_range); static inline int toku__escalate_writes_from_border_range(toku_lock_tree* tree,
return 0; toku_range* border_range) {
int r = ENOSYS;
if (!tree || !border_range) { r = EINVAL; goto cleanup; }
DB_TXN* txn = border_range->data;
toku_range_tree* self_write = toku__lt_ifexist_selfwrite(tree, txn);
assert(self_write);
toku_range query = *border_range;
u_int32_t numfound = 0;
query.data = NULL;
/*
* Delete all overlapping ranges
*/
r = toku_rt_find(self_write, &query, 0, &tree->buf, &tree->buflen, &numfound);
if (r != 0) { goto cleanup; }
u_int32_t i;
for (i = 0; i < numfound; i++) {
r = toku_rt_delete(self_write, &tree->buf[i]);
if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
/*
* Clean up memory that is not referenced by border_range.
*/
if (tree->buf[i].left != tree->buf[i].right &&
toku__lt_p_independent(tree->buf[i].left, border_range)) {
/* Do not double free if left and right are same point. */
toku__p_free(tree, tree->buf[i].left);
}
if (toku__lt_p_independent(tree->buf[i].right, border_range)) {
toku__p_free(tree, tree->buf[i].right);
}
}
/*
* Insert border_range into self_write table
*/
r = toku_rt_insert(self_write, border_range);
if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
toku__lt_range_incr(tree, numfound);
r = 0;
cleanup:
return r;
} }
static inline int toku__escalate_writes_from_border_range(toku_lock_tree* tree, toku_range* border_range) {
assert(tree && border_range); static inline int toku__escalate_reads_from_border_range(toku_lock_tree* tree,
return 0; toku_range* border_range) {
int r = ENOSYS;
if (!tree || !border_range) { r = EINVAL; goto cleanup; }
DB_TXN* txn = border_range->data;
toku_range_tree* self_read = toku__lt_ifexist_selfread(tree, txn);
if (self_read == NULL) { r = 0; goto cleanup; }
toku_range query = *border_range;
u_int32_t numfound = 0;
query.data = NULL;
/*
* Delete all overlapping ranges
*/
r = toku_rt_find(self_read, &query, 0, &tree->buf, &tree->buflen, &numfound);
if (r != 0) { goto cleanup; }
u_int32_t i;
u_int32_t removed = 0;
for (i = 0; i < numfound; i++) {
if (!toku__dominated(&tree->buf[0], border_range)) { continue; }
r = toku_rt_delete(self_read, &tree->buf[i]);
if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
#if !defined(TOKU_RT_NOOVERLAPS)
r = toku_rt_delete(tree->mainread, &tree->buf[i]);
if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
#endif /* TOKU_RT_NOOVERLAPS */
removed++;
/*
* Clean up memory that is not referenced by border_range.
*/
if (tree->buf[i].left != tree->buf[i].right &&
toku__lt_p_independent(tree->buf[i].left, border_range)) {
/* Do not double free if left and right are same point. */
toku__p_free(tree, tree->buf[i].left);
}
if (toku__lt_p_independent(tree->buf[i].right, border_range)) {
toku__p_free(tree, tree->buf[i].right);
}
}
toku__lt_range_decr(tree, removed);
r = 0;
cleanup:
return r;
} }
/* /*
* TODO: implement function * For each range in BorderWrite:
* Check to see if range conflicts any read lock held by other transactions
* Replaces all writes that overlap with range
* Deletes all reads dominated by range
*/ */
static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) { static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) {
int r = ENOSYS; int r = ENOSYS;
if (!tree || !locks_available) { r = EINVAL; goto cleanup; } if (!tree || !locks_available) { r = EINVAL; goto cleanup; }
if (!tree->lock_escalation_allowed) { r = EDOM; goto cleanup; } if (!tree->lock_escalation_allowed) { r = EDOM; goto cleanup; }
toku_range_tree* border = tree->borderwrite; toku_range_tree* border = tree->borderwrite;
assert(border);
toku_range border_range; toku_range border_range;
BOOL found = FALSE; BOOL found = FALSE;
BOOL trivial = FALSE; BOOL trivial = FALSE;
...@@ -1125,6 +1226,10 @@ static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) { ...@@ -1125,6 +1226,10 @@ static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) {
r = toku__border_escalation_trivial(tree, &border_range, &trivial); r = toku__border_escalation_trivial(tree, &border_range, &trivial);
if (r!=0) { goto cleanup; } if (r!=0) { goto cleanup; }
if (!trivial) { continue; } if (!trivial) { continue; }
/*
* At this point, we determine that escalation is simple,
* Attempt escalation
*/
r = toku__escalate_writes_from_border_range(tree, &border_range); r = toku__escalate_writes_from_border_range(tree, &border_range);
if (r!=0) { r = toku__lt_panic(tree, r); goto cleanup; } if (r!=0) { r = toku__lt_panic(tree, r); goto cleanup; }
r = toku__escalate_reads_from_border_range(tree, &border_range); r = toku__escalate_reads_from_border_range(tree, &border_range);
...@@ -1133,7 +1238,8 @@ static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) { ...@@ -1133,7 +1238,8 @@ static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) {
r = 0; r = 0;
*locks_available = toku__lt_range_test_incr(tree, 0); *locks_available = toku__lt_range_test_incr(tree, 0);
/* Escalation is allowed if 1/10th of the locks (or more) are free. */ /* Escalation is allowed if 1/10th of the locks (or more) are free. */
tree->lock_escalation_allowed = toku__lt_fraction_ranges_free(tree, 10); tree->lock_escalation_allowed = toku__lt_percent_ranges_free(tree,
TOKU_DISABLE_ESCALATION_THRESHOLD);
cleanup: cleanup:
if (r!=0) { if (r!=0) {
if (tree && locks_available) { if (tree && locks_available) {
...@@ -1470,6 +1576,10 @@ int toku_lt_unlock(toku_lock_tree* tree, DB_TXN* txn) { ...@@ -1470,6 +1576,10 @@ int toku_lt_unlock(toku_lock_tree* tree, DB_TXN* txn) {
toku__lt_range_decr(tree, ranges); toku__lt_range_decr(tree, ranges);
if (toku__lt_percent_ranges_free(tree, TOKU_ENABLE_ESCALATION_THRESHOLD)) {
tree->lock_escalation_allowed = TRUE;
}
return 0; return 0;
} }
......
...@@ -30,6 +30,9 @@ typedef enum { ...@@ -30,6 +30,9 @@ typedef enum {
state */ state */
} TOKU_LT_ERROR; } TOKU_LT_ERROR;
#define TOKU_DISABLE_ESCALATION_THRESHOLD 10
#define TOKU_ENABLE_ESCALATION_THRESHOLD 20
/** Convert error codes into a human-readable error message */ /** Convert error codes into a human-readable error message */
char* toku_lt_strerror(TOKU_LT_ERROR r /**< Error code */) char* toku_lt_strerror(TOKU_LT_ERROR r /**< Error code */)
__attribute__((const,pure)); __attribute__((const,pure));
......
...@@ -34,6 +34,19 @@ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2le ...@@ -34,6 +34,19 @@ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2le
return key1len-key2len; return key1len-key2len;
} }
int intcmp(DB *db __attribute__((__unused__)), const DBT* a, const DBT* b) {
int x = *(int*)a->data;
int y = *(int*)b->data;
return x - y;
}
int charcmp(DB *db __attribute__((__unused__)), const DBT* a, const DBT* b) {
int x = *(char*)a->data;
int y = *(char*)b->data;
return x - y;
}
int dbcmp (DB *db __attribute__((__unused__)), const DBT *a, const DBT*b) { int dbcmp (DB *db __attribute__((__unused__)), const DBT *a, const DBT*b) {
return toku_keycompare(a->data, a->size, b->data, b->size); return toku_keycompare(a->data, a->size, b->data, b->size);
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment