/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007-2009 Tokutek Inc.  All rights reserved."

#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

/**
  \file elocks.c
  \brief Ephemeral locks

   The ydb big lock serializes access to the tokudb library.
   Every call (including methods) into the tokudb library acquires the lock.
   No internal function should invoke a method through an object. */

#include <toku_portability.h>
#include "ydb-internal.h"
#include <string.h>
#include <assert.h>
#include <toku_pthread.h>
#include <sys/types.h>

// Choose how the ydb big lock is implemented:
//   YDB_LOCK_MISS_TIME - trylock/lock on the mutex plus a per-thread delay derived from cachetable miss times
//   YDB_LOCK_FIFO      - a plain blocking lock/unlock of the mutex
// One of the two must be enabled: toku_ydb_lock() and toku_ydb_unlock() only touch the
// mutex inside these conditional sections, so with both disabled nothing is serialized.
#if defined(__linux__) && __linux__
#define YDB_LOCK_MISS_TIME 1
#else
#define YDB_LOCK_FIFO 1
#endif
26

27 28
// State of the ydb big lock. A single file-scope instance is defined below.
struct ydb_big_lock {
    toku_pthread_mutex_t lock;                 // the mutex taken by toku_ydb_lock and released by toku_ydb_unlock
#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
    int32_t waiters;                           // number of threads currently blocked in toku_ydb_lock waiting for the mutex
    toku_pthread_key_t time_key;               // thread-specific key under which each thread's struct ydbtime is stored
    uint64_t start_misscount, start_misstime;  // cachetable miss count/time sampled right after the lock was acquired
#endif
};
static struct ydb_big_lock ydb_big_lock;

// status is intended for display to humans to help understand system behavior.
// It does not need to be perfectly thread-safe.
// init_status() resets it and toku_ydb_lock_get_status() copies it out to callers.
static SCHEDULE_STATUS_S status;

41
// Return the larger of two unsigned 64 bit values (b when they are equal).
static inline u_int64_t u64max(u_int64_t a, u_int64_t b) {
    if (a > b) {
        return a;
    }
    return b;
}
42 43 44

// Upper bound on the delay toku_ydb_lock imposes on a thread before it tries to take the lock.
// The unit is microseconds: the capped value is handed directly to usleep().
#define MAX_SLEEP 1000000  // 1 second covers the case of a 5 level tree with 30 millisecond read delays and a few waiting threads

45
#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

#include "toku_atomic.h"

// Sanity limit, in microseconds, on a computed lock hold time; toku_ydb_unlock falls back to
// the thread's previous hold time when the new measurement is not below this limit.
#define MAXTHELD 250000   // if lock was apparently held longer than 250 msec, then theld is probably invalid (do we still need this?)

// Per-thread scheduling state for the miss-time lock.
// toku_ydb_lock allocates one zero-filled instance per thread on first use and stores it
// under ydb_big_lock.time_key; toku_ydb_unlock updates it each time the lock is released.
struct ydbtime {          // one per thread
    uint64_t tacquire;    // valid only when lock is not held: the earliest time (microseconds, same clock as get_tnow) at which the thread may take the lock again (0 if no latency required)
    uint64_t theld_prev;  // how long the lock was held the previous time this thread held it; used when a new measurement is out of range
};

// get a timestamp in units of microseconds
// Current wall-clock time from gettimeofday, expressed as a count of microseconds.
// A failing gettimeofday trips the assert.
static uint64_t
get_tnow(void) {
    struct timeval now;
    int rc = gettimeofday(&now, NULL);
    assert(rc == 0);
    uint64_t usec = now.tv_sec * 1000000ULL;   // whole seconds, scaled to microseconds
    usec += now.tv_usec;                       // plus the sub-second part
    return usec;
}
#endif

// Reset the schedule status fields to their initial values.
// When the miss-time lock is compiled in, the processor frequency is obtained from
// toku_os_get_processor_frequency (a failure trips the assert) and recorded in MHz;
// otherwise the frequency is recorded as 0.
static void 
init_status(void) {
    uint64_t cpuhz = 0;
#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
    int r = toku_os_get_processor_frequency(&cpuhz); assert(r == 0);
#endif
    status.ydb_lock_ctr = 0;
    status.max_possible_sleep = MAX_SLEEP;
    status.processor_freq_mhz = cpuhz / 1000000ULL;   // Hz -> MHz
    status.max_requested_sleep = 0;
    status.times_max_sleep_used = 0;
    status.total_sleepers = 0;
    status.total_sleep_time = 0;
    status.max_waiters = 0;
    status.total_waiters = 0;
    status.total_clients = 0;
    status.time_ydb_lock_held_unavailable = 0;
    status.max_time_ydb_lock_held = 0;
    status.total_time_ydb_lock_held = 0;
}

// Copy the current schedule status into *statp.
// statp must point to writable storage; the copy is a plain struct assignment.
void 
toku_ydb_lock_get_status(SCHEDULE_STATUS statp) {
    *statp = status;
}
Yoni Fogel's avatar
Yoni Fogel committed
91

92
int 
Yoni Fogel's avatar
Yoni Fogel committed
93
toku_ydb_lock_init(void) {
94 95
    int r;
    r = toku_pthread_mutex_init(&ydb_big_lock.lock, NULL); assert(r == 0);
96
#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
97 98 99 100
    ydb_big_lock.waiters = 0;
    r = toku_pthread_key_create(&ydb_big_lock.time_key, toku_free); assert(r == 0);
#endif
    init_status();
Yoni Fogel's avatar
Yoni Fogel committed
101
    return r;
102 103
}

104
int 
Yoni Fogel's avatar
Yoni Fogel committed
105
toku_ydb_lock_destroy(void) {
106 107
    int r;
    r = toku_pthread_mutex_destroy(&ydb_big_lock.lock); assert(r == 0);
108
#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
109 110
    r = toku_pthread_key_delete(ydb_big_lock.time_key); assert(r == 0);
#endif
Yoni Fogel's avatar
Yoni Fogel committed
111
    return r;
112 113
}

114 115 116
// Acquire the ydb big lock.
//
// FIFO build: simply block on the mutex.
//
// Miss-time build:
//   1. Fetch (or allocate on first use) the calling thread's struct ydbtime.
//   2. If the thread's previous unlock set an earliest re-acquire time that has not been
//      reached yet, sleep for the remaining time, capped at MAX_SLEEP.
//   3. Try the mutex; if it is busy, count this thread as a waiter and block on it.
//   4. Sample the cachetable miss counters so that toku_ydb_unlock can tell how much of
//      the hold time was spent on cache misses.
//
// In every build the status lock counter is incremented on the way out and must then be odd.
void 
toku_ydb_lock(void) {
#if YDB_LOCK_FIFO
    // the mutex is the 'lock' member, not the enclosing struct
    int r = toku_pthread_mutex_lock(&ydb_big_lock.lock);   assert(r == 0);
#endif

#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
    int r;
    u_int64_t requested_sleep = 0;
    struct ydbtime *ydbtime = toku_pthread_getspecific(ydb_big_lock.time_key);
    if (!ydbtime) {          // allocate the per thread timestamp if not yet allocated
        ydbtime = toku_malloc(sizeof *ydbtime);
        assert(ydbtime);
        memset(ydbtime, 0, sizeof *ydbtime);
        r = toku_pthread_setspecific(ydb_big_lock.time_key, ydbtime);
        assert(r == 0);
        (void) toku_sync_fetch_and_add_uint64(&status.total_clients, 1);
    }
    if (ydbtime->tacquire) { // delay the thread while the current time is still earlier than its permitted acquire time
        uint64_t t = get_tnow();
        if (t < ydbtime->tacquire) {
            t = ydbtime->tacquire - t;
            requested_sleep = t;
            // put an upper bound on the sleep time since the timestamps may be crazy due to thread movement between cpu's or cpu frequency changes
            if (t > MAX_SLEEP) {
                t = MAX_SLEEP;
                (void) toku_sync_fetch_and_add_uint64(&status.times_max_sleep_used, 1);
            }
            (void) toku_sync_fetch_and_add_uint64(&status.total_sleep_time, t);
            (void) toku_sync_fetch_and_add_uint64(&status.total_sleepers, 1);
            usleep(t);
        }
    }
    r = toku_pthread_mutex_trylock(&ydb_big_lock.lock);
    if (r != 0) {           // if we can not get the lock, bump the count of the lock waits, and block on the lock
        assert(r == EBUSY);
        (void) toku_sync_fetch_and_add_int32(&ydb_big_lock.waiters, 1);
        (void) toku_sync_fetch_and_add_uint64(&status.total_waiters, 1);
        r = toku_pthread_mutex_lock(&ydb_big_lock.lock);
        assert(r == 0);
        (void) toku_sync_fetch_and_add_int32(&ydb_big_lock.waiters, -1);
    }
    // from here on the lock is held
    status.max_requested_sleep = u64max(status.max_requested_sleep, requested_sleep);
    toku_cachetable_get_miss_times(NULL, &ydb_big_lock.start_misscount, &ydb_big_lock.start_misstime);
#endif

    // the counter is bumped once per lock and once per unlock, so it is odd exactly while the lock is held
    status.ydb_lock_ctr++;
    assert((status.ydb_lock_ctr & 0x01) == 1);
}

165 166 167 168 169 170
// Release the ydb big lock.
//
// The status lock counter is incremented first (while the lock is still held) and must
// then be even.
//
// FIFO build: simply unlock the mutex.
//
// Miss-time build: before unlocking, estimate how long the lock was held on account of
// cache misses (theld, in microseconds). After unlocking, if other threads were waiting
// and theld exceeds 100 microseconds, record in the thread's struct ydbtime the earliest
// time at which it may take the lock again (now + theld * waiters); toku_ydb_lock
// enforces that delay. Otherwise the thread may re-acquire immediately.
void 
toku_ydb_unlock(void) {
    status.ydb_lock_ctr++;
    assert((status.ydb_lock_ctr & 0x01) == 0);

#if YDB_LOCK_FIFO
    // the mutex is the 'lock' member, not the enclosing struct
    int r = toku_pthread_mutex_unlock(&ydb_big_lock.lock); assert(r == 0);
#endif

#if defined(YDB_LOCK_MISS_TIME) && YDB_LOCK_MISS_TIME
    struct ydbtime *ydbtime = toku_pthread_getspecific(ydb_big_lock.time_key);
    assert(ydbtime);      // set up by this thread's toku_ydb_lock call

    int r;
    uint64_t theld;
    int waiters = ydb_big_lock.waiters;             // get the number of lock waiters (used to compute the lock acquisition time)
    if (waiters == 0) {
        theld = 0;
    } else {
        uint64_t misscount, misstime;
        toku_cachetable_get_miss_times(NULL, &misscount, &misstime);
        misscount -= ydb_big_lock.start_misscount;  // how many cache misses for this operation
        misstime -= ydb_big_lock.start_misstime;    // how many usec spent waiting for disk read this operation
        if (misscount == 0) {
            theld = 0;
        } else {
            theld = misstime ? misstime : misscount * 20000ULL; // if we decide not to compile in misstime, then backoff to 20 milliseconds per cache miss

            if (theld < MAXTHELD) {
                status.max_time_ydb_lock_held = u64max(status.max_time_ydb_lock_held, theld);
                ydbtime->theld_prev = theld;
            } else {                                      // thread appears to have migrated (theld out of range)
                theld = ydbtime->theld_prev;              // if time measurement unavailable, assume same as previous use of ydb lock by this thread
                status.time_ydb_lock_held_unavailable++;
            }
            status.max_waiters = u64max(status.max_waiters, waiters);
            status.total_time_ydb_lock_held += theld;
        }
    }

    r = toku_pthread_mutex_unlock(&ydb_big_lock.lock); assert(r == 0);

    // we use a lower bound of 100 microseconds on the sleep time to avoid system call overhead for short sleeps
    if (waiters == 0 || theld <= 100ULL)
        ydbtime->tacquire = 0;                            // there is no delay on acquiring the lock the next time since there was no lock contention or the lock was not held very long
    else
        ydbtime->tacquire = get_tnow() + theld * waiters; // set the min time from now that the lock can not be reacquired
#endif

}

218 219 220 221
// Report the current value of the status lock counter.
// toku_ydb_lock and toku_ydb_unlock each increment the counter once, and their asserts
// require it to be odd after a lock and even after an unlock.
int 
toku_ydb_lock_ctr(void) {
    int ctr = status.ydb_lock_ctr;
    return ctr;
}