Commit b6e2307d authored by Bradley C. Kuszmaul, committed by Yoni Fogel

Fixes #3200. Use rdtsc instead of gettimeofday() to instrument elocks. close[t:3200]

git-svn-id: file:///svn/toku/tokudb@27464 c7de825b-a66e-492c-adef-691d508d4ae1
parent 8d1e879e
@@ -462,9 +462,9 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf(" u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */\n");
printf(" u_int32_t max_waiters; /* The maximum of num_waiters_now. */\n");
printf(" u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */\n");
printf(" u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */\n");
printf(" u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */\n");
printf(" u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */\n");
printf(" u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held (tokutime_t). */\n");
printf(" u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock (really tokutime_t, convert to seconds with tokutime_to_seconds()) */\n");
printf(" u_int64_t total_time_since_start; /* Total time since the lock was created (tokutime_t). Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */\n");
printf(" u_int32_t checkpoint_period; /* delay between automatic checkpoints */ \n");
printf(" u_int32_t checkpoint_footprint; /* state of checkpoint procedure */ \n");
printf(" char checkpoint_time_begin[26]; /* time of last checkpoint begin */ \n");
@@ -339,6 +339,23 @@ toku_dup2(int fd, int fd2) {
return r;
}
// Time
static double seconds_per_clock = -1;
double tokutime_to_seconds(tokutime_t t) {
// Convert tokutime to seconds.
if (seconds_per_clock<0) {
uint64_t hz;
int r = toku_os_get_processor_frequency(&hz);
assert(r==0);
// There's a race condition here, but it doesn't really matter: if two threads call tokutime_to_seconds
// for the first time at the same time, then both will fetch the processor frequency and store the same value.
seconds_per_clock = 1.0/hz;
}
return t*seconds_per_clock;
}
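// Illustrative note (not part of this change): on a hypothetical 2.5 GHz processor,
// seconds_per_clock works out to 1/2.5e9 = 4.0e-10, so a tokutime_t difference of
// 1,000,000 ticks converts to 1e6 * 4.0e-10 = 0.0004 seconds.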
#if __GNUC__ && __i386__
// workaround for a gcc 4.1.2 bug on 32 bit platforms.
@@ -349,5 +366,3 @@ uint64_t toku_sync_fetch_and_add_uint64(volatile uint64_t *a, uint64_t b) {
}
#endif
@@ -20,8 +20,8 @@
struct ydb_big_lock {
toku_pthread_mutex_t lock;
u_int64_t starttime; // what time (in microseconds according to gettimeofday()) was the lock initialized?
u_int64_t acquired_time;
tokutime_t starttime; // what time was the lock initialized?
tokutime_t acquired_time; // what time was the lock acquired
};
static struct ydb_big_lock ydb_big_lock;
@@ -47,18 +47,11 @@ toku_ydb_lock_get_status(SCHEDULE_STATUS statp) {
*statp = status;
}
static u_int64_t get_current_time_in_microseconds (void) {
// On recent linux, gettimeofday() runs extremely fast (in userspace only), taking only a few nanoseconds.
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_usec + 1000000 * tv.tv_sec;
}
int
toku_ydb_lock_init(void) {
int r;
r = toku_pthread_mutex_init(&ydb_big_lock.lock, NULL); resource_assert_zero(r);
ydb_big_lock.starttime = get_current_time_in_microseconds();
ydb_big_lock.starttime = get_tokutime();
ydb_big_lock.acquired_time = 0;
init_status();
return r;
@@ -77,7 +70,7 @@ toku_ydb_lock(void) {
int r = toku_pthread_mutex_lock(&ydb_big_lock.lock); resource_assert_zero(r);
u_int64_t now = get_current_time_in_microseconds();
u_int64_t now = get_tokutime();
// Update the lock
ydb_big_lock.acquired_time = now;
@@ -95,8 +88,8 @@ ydb_unlock_internal(unsigned long useconds) {
status.ydb_lock_ctr++;
invariant((status.ydb_lock_ctr & 0x01) == 0);
u_int64_t now = get_current_time_in_microseconds();
u_int64_t time_held = now - ydb_big_lock.acquired_time;
tokutime_t now = get_tokutime();
tokutime_t time_held = now - ydb_big_lock.acquired_time;
status.total_time_ydb_lock_held += time_held;
if (time_held > status.max_time_ydb_lock_held) status.max_time_ydb_lock_held = time_held;
status.total_time_since_start = now - ydb_big_lock.starttime;
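// Illustrative sketch (not part of this change): the instrumentation pattern above, reduced to a
// self-contained form.  The names example_lock, example_lock_acquire, and example_lock_release are
// hypothetical; only tokutime_t and get_tokutime() come from this commit, and the toku_pthread
// wrappers and resource_assert_zero are assumed to behave as in the surrounding code.
struct example_lock {
    toku_pthread_mutex_t lock;
    tokutime_t           acquired_time;   // time the lock was last acquired (rdtsc ticks)
};

static void example_lock_acquire(struct example_lock *l) {
    int r = toku_pthread_mutex_lock(&l->lock); resource_assert_zero(r);
    l->acquired_time = get_tokutime();    // stamp the acquisition with the cheap rdtsc-based time
}

static void example_lock_release(struct example_lock *l, SCHEDULE_STATUS_S *s) {
    tokutime_t held = get_tokutime() - l->acquired_time;   // subtract in tokutime_t, not seconds
    s->total_time_ydb_lock_held += held;
    if (held > s->max_time_ydb_lock_held) s->max_time_ydb_lock_held = held;
    int r = toku_pthread_mutex_unlock(&l->lock); resource_assert_zero(r);
}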
@@ -72,6 +72,8 @@
toku_os_get_max_process_data_size;
toku_os_get_phys_memory_size;
tokutime_to_seconds;
toku_do_assert;
toku_do_assert_fail;
toku_do_assert_zero_fail;
@@ -111,9 +111,9 @@ typedef struct {
volatile u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder). This is precise since it is updated with a fetch-and-add. */
volatile u_int32_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock. This is precise (updated only when the lock is held) but may be running a little behind (while waiting for the lock it hasn't been updated). */
volatile u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling (useconds). This adds up over many clients. This is precise since it is updated with an atomic fetch-and-add. */
volatile u_int64_t max_time_ydb_lock_held; /* max time the ydb lock was held (in microseconds). This is precise since it is updated only when the lock is held. */
volatile u_int64_t total_time_ydb_lock_held;/* total time the ydb lock has been held (in microseconds). */
volatile u_int64_t total_time_since_start; /* total time since the ydb lock was initialized (in microseconds) This is only updated when the lock is accessed (so if you don't acquire the lock this doesn't increase), and it is updated precisely (even though it isn't updated continuously). */
volatile tokutime_t max_time_ydb_lock_held; /* max time the ydb lock was held (tokutime_t). This is precise since it is updated only when the lock is held. */
volatile tokutime_t total_time_ydb_lock_held;/* total time the ydb lock has been held. */
volatile tokutime_t total_time_since_start; /* total time since the ydb lock was initialized. This is only updated when the lock is accessed (so if you don't acquire the lock this doesn't increase), and it is updated precisely (even though it isn't updated continuously). */
} SCHEDULE_STATUS_S, *SCHEDULE_STATUS;
@@ -2019,8 +2019,8 @@ env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
n += snprintf(buff + n, bufsiz - n, "num_waiters_now %"PRIu32"\n", engstat.num_waiters_now);
n += snprintf(buff + n, bufsiz - n, "max_waiters %"PRIu32"\n", engstat.max_waiters);
n += snprintf(buff + n, bufsiz - n, "total_sleep_time %"PRIu64"\n", engstat.total_sleep_time);
n += snprintf(buff + n, bufsiz - n, "max_time_ydb_lock_held %"PRIu64"\n", engstat.max_time_ydb_lock_held);
n += snprintf(buff + n, bufsiz - n, "total_time_ydb_lock_held %"PRIu64"\n", engstat.total_time_ydb_lock_held);
n += snprintf(buff + n, bufsiz - n, "max_time_ydb_lock_held %.6f\n", tokutime_to_seconds(engstat.max_time_ydb_lock_held));
n += snprintf(buff + n, bufsiz - n, "total_time_ydb_lock_held %.6f\n", tokutime_to_seconds(engstat.total_time_ydb_lock_held));
n += snprintf(buff + n, bufsiz - n, "total_time_since_start %"PRIu64"\n", engstat.total_time_since_start);
n += snprintf(buff + n, bufsiz - n, "le_max_committed_xr %"PRIu64"\n", engstat.le_max_committed_xr);
n += snprintf(buff + n, bufsiz - n, "le_max_provisional_xr %"PRIu64"\n", engstat.le_max_provisional_xr);
@@ -207,6 +207,51 @@ int toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t));
int toku_portability_init (void);
int toku_portability_destroy (void);
// *************** Performance timers ************************
// What do you really want from a performance timer:
// (1) Can determine actual time of day from the performance time.
// (2) Time goes forward, never backward.
// (3) Same time on different processors (or even different machines).
// (4) Time goes forward at a constant rate (doesn't get faster and slower).
// (5) Portable.
// (6) Getting the time is cheap.
// Unfortunately it seems tough to get Properties 1-5. So we go for Property 6, but we abstract it.
// We offer a type tokutime_t which can hold the time.
// This type can be subtracted to get a time difference.
// We can get the present time cheaply.
// We can convert this type to seconds (but that can be expensive).
// The implementation is to use RDTSC (hence we lose property 5: it's not portable).
// Recent machines have constant_tsc in which case we get property (4).
// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock skew, so we get property (3).
// We get property 2 with RDTSC (as long as there's not any skew).
// We don't even try to get property 1, since we don't need it.
// The decision here is that these times are really accurate only on modern machines with modern OSs.
typedef u_int64_t tokutime_t; // Time type used by the tokutek timers.
// The value of tokutime_t is not specified here.
// It might be microseconds since 1/1/1970 (if gettimeofday() is
// used), or clock cycles since boot (if rdtsc is used). Or something
// else.
// Two tokutime_t values can be subtracted to get a time difference.
// Use tokutime_to_seconds() to convert that difference to seconds.
// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds().
//
// For accurate time calculations do the subtraction in the right order:
// Right: tokutime_to_seconds(t1-t2);
// Wrong: tokutime_to_seconds(t1)-tokutime_to_seconds(t2);
// Doing it the wrong way is likely to result in loss of precision.
// A double can hold integers exactly only up to about 53 bits. RDTSC uses up about 33 bits every second, so that leaves
// about 2^20 seconds from boot (about 2 weeks) before an RDTSC value can no longer be represented exactly as a double.
//
double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds.
// Get tokutime. We want this to be fast, so we expose the implementation as RDTSC.
static inline tokutime_t get_tokutime (void) {
u_int32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
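// Usage sketch (illustrative only; do_some_work() is a hypothetical placeholder):
//   tokutime_t t0 = get_tokutime();
//   do_some_work();
//   tokutime_t t1 = get_tokutime();
//   double elapsed = tokutime_to_seconds(t1 - t0);   // subtract first, then convert once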
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif