ydb.cc 102 KB
Newer Older
1 2
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
/*
COPYING CONDITIONS NOTICE:

  This program is free software; you can redistribute it and/or modify
  it under the terms of version 2 of the GNU General Public License as
  published by the Free Software Foundation, and provided that the
  following conditions are met:

      * Redistributions of source code must retain this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below).

      * Redistributions in binary form must reproduce this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below) in the documentation and/or other materials
        provided with the distribution.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  02110-1301, USA.

COPYRIGHT NOTICE:

  TokuDB, Tokutek Fractal Tree Indexing Library.
  Copyright (C) 2007-2013 Tokutek, Inc.

DISCLAIMER:

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

UNIVERSITY PATENT NOTICE:

  The technology is licensed by the Massachusetts Institute of
  Technology, Rutgers State University of New Jersey, and the Research
  Foundation of State University of New York at Stony Brook under
  United States of America Serial No. 11/760379 and to the patents
  and/or patent applications resulting from it.

PATENT MARKING NOTICE:

  This software is covered by US Patent No. 8,185,551.

PATENT RIGHTS GRANT:

55
  "THIS IMPLEMENTATION" means the copyrightable works distributed by
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
  Tokutek as part of the Fractal Tree project.

  "PATENT CLAIMS" means the claims of patents that are owned or
  licensable by Tokutek, both currently or in the future; and that in
  the absence of this license would be infringed by THIS
  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.

  "PATENT CHALLENGE" shall mean a challenge to the validity,
  patentability, enforceability and/or non-infringement of any of the
  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.

  Tokutek hereby grants to you, for the term and geographical scope of
  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
  irrevocable (except as stated in this section) patent license to
  make, have made, use, offer to sell, sell, import, transfer, and
  otherwise run, modify, and propagate the contents of THIS
  IMPLEMENTATION, where such license applies only to the PATENT
  CLAIMS.  This grant does not include claims that would be infringed
  only as a consequence of further modifications of THIS
  IMPLEMENTATION.  If you or your agent or licensee institute or order
  or agree to the institution of patent litigation against any entity
  (including a cross-claim or counterclaim in a lawsuit) alleging that
  THIS IMPLEMENTATION constitutes direct or contributory patent
  infringement, or inducement of patent infringement, then any rights
  granted to you under this License shall terminate as of the date
  such litigation is filed.  If you or your agent or exclusive
  licensee institute or order or agree to the institution of a PATENT
  CHALLENGE, then Tokutek may terminate any rights granted to you
  under this License.
*/

87
#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
88
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
89
#ident "$Id$"
90

91
extern const char *toku_patent_string;
92
const char *toku_copyright_string = "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved.";
93

94
#include <toku_portability.h>
95
#include <toku_pthread.h>
96 97 98
#include <toku_assert.h>

#include <db.h>
99
#include <ctype.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
100 101
#include <errno.h>
#include <limits.h>
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
102 103 104
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
105
#include <fcntl.h>
106 107 108
#include <unistd.h>
#include <memory.h>

Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
109
#include <sys/stat.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
110
#include <sys/types.h>
111

112
#include <util/status.h>
113

114 115 116 117 118 119 120 121
#include <ft/ft-flusher.h>
#include <ft/cachetable.h>
#include <ft/log.h>
#include <ft/checkpoint.h>
#include <ft/key.h>
#include <ft/ftloader.h>
#include <ft/log_header.h>
#include <ft/ft.h>
122 123 124 125
#include <ft/txn_manager.h>

#include "ydb.h"
#include "ydb-internal.h"
126 127 128 129 130
#include "ydb_cursor.h"
#include "ydb_row_lock.h"
#include "ydb_env_func.h"
#include "ydb_db.h"
#include "ydb_write.h"
131
#include "ydb_txn.h"
132 133
#include "loader.h"
#include "indexer.h"
134

135
// Include ydb_lib.cc here so that its constructor/destructor gets put into
136 137
// ydb.o, to make sure they don't get erased at link time (when linking to
// a static libtokudb.a that was compiled with gcc).  See #5094.
138
#include "ydb_lib.cc"
139

140 141 142 143 144 145
// Choose the symbol names under which the environment/database constructors
// are defined.  With TOKUTRACE they carry a "_toku10" suffix; without it the
// plain names are used and the trace-file entry points below are no-op stubs.
#ifdef TOKUTRACE
 #define DB_ENV_CREATE_FUN db_env_create_toku10
 #define DB_CREATE_FUN db_create_toku10
#else
 #define DB_ENV_CREATE_FUN db_env_create
 #define DB_CREATE_FUN db_create
 // Tracing is compiled out: ignore the file name and report success.
 int toku_set_trace_file (const char *fname __attribute__((__unused__))) { return 0; }
 // Tracing is compiled out: nothing to close, report success.
 int toku_close_trace_file (void) { return 0; }
#endif

150 151 152
// Set when env is panicked, never cleared.
static int env_is_panicked = 0;

153
void
154
env_panic(DB_ENV * env, int cause, const char * msg) {
155
    if (cause == 0)
156
        cause = -1;  // if unknown cause, at least guarantee panic
157
    if (msg == NULL)
158
        msg = "Unknown cause in env_panic\n";
159 160 161 162 163
    env_is_panicked = cause;
    env->i->is_panicked = cause;
    env->i->panic_string = toku_strdup(msg);
}

164
static int env_get_engine_status_num_rows (DB_ENV * UU(env), uint64_t * num_rowsp);
165

166 167 168 169 170 171 172 173 174 175 176 177 178
/********************************************************************************
 * Status is intended for display to humans to help understand system behavior.
 * It does not need to be perfectly thread-safe.
 */

// Row indexes into the ydb layer's engine-status array.  The final
// enumerator is not a row; it is the number of rows in the array.
typedef enum {
    YDB_LAYER_TIME_CREATION = 0,            /* timestamp of environment creation, read from persistent environment */
    YDB_LAYER_TIME_STARTUP,                 /* timestamp of system startup */
    YDB_LAYER_TIME_NOW,                     /* timestamp of engine status query */
    YDB_LAYER_NUM_DB_OPEN,
    YDB_LAYER_NUM_DB_CLOSE,
    YDB_LAYER_NUM_OPEN_DBS,
    YDB_LAYER_MAX_OPEN_DBS,
    YDB_LAYER_FSYNC_LOG_PERIOD,
#if 0
    YDB_LAYER_ORIGINAL_ENV_VERSION,         /* version of original environment, read from persistent environment */
    YDB_LAYER_STARTUP_ENV_VERSION,          /* version of environment at this startup, read from persistent environment (curr_env_ver_key) */
    YDB_LAYER_LAST_LSN_OF_V13,              /* read from persistent environment */
    YDB_LAYER_UPGRADE_V14_TIME,             /* timestamp of upgrade to version 14, read from persistent environment */
    YDB_LAYER_UPGRADE_V14_FOOTPRINT,        /* footprint of upgrade to version 14, read from persistent environment */
#endif
    YDB_LAYER_STATUS_NUM_ROWS              /* number of rows in this status array */
} ydb_layer_status_entry;

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
191
    bool initialized;
192 193 194 195 196 197
    TOKU_ENGINE_STATUS_ROW_S status[YDB_LAYER_STATUS_NUM_ROWS];
} YDB_LAYER_STATUS_S, *YDB_LAYER_STATUS;

static YDB_LAYER_STATUS_S ydb_layer_status;
#define STATUS_VALUE(x) ydb_layer_status.status[x].value.num

198
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(ydb_layer_status, k, c, t, l, inc)
199

200
static void
201
ydb_layer_status_init (void) {
202 203 204
    // Note, this function initializes the keyname, type, and legend fields.
    // Value fields are initialized to zero by compiler.

205 206 207
    STATUS_INIT(YDB_LAYER_TIME_CREATION,              nullptr, UNIXTIME, "time of environment creation", TOKU_ENGINE_STATUS);
    STATUS_INIT(YDB_LAYER_TIME_STARTUP,               nullptr, UNIXTIME, "time of engine startup", TOKU_ENGINE_STATUS);
    STATUS_INIT(YDB_LAYER_TIME_NOW,                   nullptr, UNIXTIME, "time now", TOKU_ENGINE_STATUS);
208 209 210 211
    STATUS_INIT(YDB_LAYER_NUM_DB_OPEN,                DB_OPENS, UINT64,   "db opens", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
    STATUS_INIT(YDB_LAYER_NUM_DB_CLOSE,               DB_CLOSES, UINT64,   "db closes", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
    STATUS_INIT(YDB_LAYER_NUM_OPEN_DBS,               DB_OPEN_CURRENT, UINT64,   "num open dbs now", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
    STATUS_INIT(YDB_LAYER_MAX_OPEN_DBS,               DB_OPEN_MAX, UINT64,   "max open dbs", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
212
    STATUS_INIT(YDB_LAYER_FSYNC_LOG_PERIOD,           nullptr, UINT64,   "period, in ms, that recovery log is automatically fsynced", TOKU_ENGINE_STATUS);
213 214 215 216 217

    STATUS_VALUE(YDB_LAYER_TIME_STARTUP) = time(NULL);
    ydb_layer_status.initialized = true;
}
#undef STATUS_INIT
218

219
static void
220
ydb_layer_get_status(DB_ENV* env, YDB_LAYER_STATUS statp) {
221
    STATUS_VALUE(YDB_LAYER_TIME_NOW) = time(NULL);
222
    STATUS_VALUE(YDB_LAYER_FSYNC_LOG_PERIOD) = toku_minicron_get_period_in_ms_unlocked(&env->i->fsync_log_cron);
223
    *statp = ydb_layer_status;
224
}
225

226 227 228 229 230 231
/********************************************************************************
 * End of ydb_layer local status section.
 */

static DB_ENV * volatile most_recent_env;   // most recently opened env, used for engine status on crash.  Note there are likely to be races on this if you have multiple threads creating and closing environments in parallel.  We'll declare it volatile since at least that helps make sure the compiler doesn't optimize away certain code (e.g., if while debugging you write code that spins on most_recent_env, you'd like the compiler not to optimize your code away).

232
static int env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt);
233
static int toku_maybe_get_engine_status_text (char* buff, int buffsize);  // for use by toku_assert
234
static void toku_maybe_set_env_panic(int code, const char * msg);               // for use by toku_assert
235

236 237
int 
toku_ydb_init(void) {
Yoni Fogel's avatar
Yoni Fogel committed
238 239
    int r = 0;
    //Lower level must be initialized first.
240
    r = toku_ft_layer_init();
Yoni Fogel's avatar
Yoni Fogel committed
241
    return r;
242 243
}

244
// Do not clean up resources if env is panicked, just exit ugly
245
void 
246
toku_ydb_destroy(void) {
247
    if (env_is_panicked == 0) {
248
        toku_ft_layer_destroy();
249
    }
250 251
}

252 253 254 255 256
// Callback that ignores its key, value and extra arguments and always
// reports success (0).
static int
ydb_getf_do_nothing(DBT const* UU(key), DBT const* UU(val), void* UU(extra)) {
    const int success = 0;
    return success;
}

257 258
/* env methods */

259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
// Write a timestamped "space is low" warning to stderr and flush it.
// The env argument is unused.
static void
env_fs_report_in_yellow(DB_ENV *UU(env)) {
    time_t now = time(NULL);
    char stamp[26];
    const char *when = ctime_r(&now, stamp);
    // "%.24s" limits the timestamp to 24 characters.
    fprintf(stderr, "%.24s Tokudb file system space is low\n", when);
    fflush(stderr);
}

// Write a timestamped "space is really low, access restricted" warning to
// stderr and flush it.  The env argument is unused.
static void
env_fs_report_in_red(DB_ENV *UU(env)) {
    time_t now = time(NULL);
    char stamp[26];
    const char *when = ctime_r(&now, stamp);
    // "%.24s" limits the timestamp to 24 characters.
    fprintf(stderr, "%.24s Tokudb file system space is really low and access is restricted\n", when);
    fflush(stderr);
}

// Return the red-zone threshold for a file system of `total` size:
// env's redzone setting, taken as a percentage of total.
// The multiplication is done before the division by 100.
static inline uint64_t
env_fs_redzone(DB_ENV *env, uint64_t total) {
    return (total * env->i->redzone) / 100;
}

#define ZONEREPORTLIMIT 12
// Check the available space in the file systems used by tokudb and erect barriers when available space gets low.
static int
env_fs_poller(void *arg) {
    DB_ENV *env = (DB_ENV *) arg;
    int r;

    int in_yellow; // set true to issue warning to user
286
    int in_red;    // set true to prevent certain operations (returning ENOSPC)
287 288 289 290 291 292 293

    // get the fs sizes for the home dir
    uint64_t avail_size, total_size;
    r = toku_get_filesystem_sizes(env->i->dir, &avail_size, NULL, &total_size);
    assert(r == 0);
    in_yellow = (avail_size < 2 * env_fs_redzone(env, total_size));
    in_red = (avail_size < env_fs_redzone(env, total_size));
294

295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
    // get the fs sizes for the data dir if different than the home dir
    if (strcmp(env->i->dir, env->i->real_data_dir) != 0) {
        r = toku_get_filesystem_sizes(env->i->real_data_dir, &avail_size, NULL, &total_size);
        assert(r == 0);
        in_yellow += (avail_size < 2 * env_fs_redzone(env, total_size));
        in_red += (avail_size < env_fs_redzone(env, total_size));
    }

    // get the fs sizes for the log dir if different than the home dir and data dir
    if (strcmp(env->i->dir, env->i->real_log_dir) != 0 && strcmp(env->i->real_data_dir, env->i->real_log_dir) != 0) {
        r = toku_get_filesystem_sizes(env->i->real_log_dir, &avail_size, NULL, &total_size);
        assert(r == 0);
        in_yellow += (avail_size < 2 * env_fs_redzone(env, total_size));
        in_red += (avail_size < env_fs_redzone(env, total_size));
    }

    env->i->fs_seq++;                    // how many times through this polling loop?
    uint64_t now = env->i->fs_seq;

    // Don't issue report if we have not been out of this fs_state for a while, unless we're at system startup
    switch (env->i->fs_state) {
    case FS_RED:
Barry Perlman's avatar
Barry Perlman committed
317
        if (!in_red) {
318 319 320 321 322 323
            if (in_yellow) {
                env->i->fs_state = FS_YELLOW;
            } else {
                env->i->fs_state = FS_GREEN;
            }
        }
324 325 326
        break;
    case FS_YELLOW:
        if (in_red) {
327 328
            if ((now - env->i->last_seq_entered_red > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
                env_fs_report_in_red(env);
329
            env->i->fs_state = FS_RED;
330
            env->i->last_seq_entered_red = now;
331 332 333 334 335 336
        } else if (!in_yellow) {
            env->i->fs_state = FS_GREEN;
        }
        break;
    case FS_GREEN:
        if (in_red) {
337 338
            if ((now - env->i->last_seq_entered_red > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
                env_fs_report_in_red(env);
339
            env->i->fs_state = FS_RED;
340
            env->i->last_seq_entered_red = now;
341
        } else if (in_yellow) {
342 343
            if ((now - env->i->last_seq_entered_yellow > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
                env_fs_report_in_yellow(env);
344
            env->i->fs_state = FS_YELLOW;
345
            env->i->last_seq_entered_yellow = now;
346 347
        }
        break;
348 349
    default:
        assert(0);
350 351 352 353 354 355 356 357 358 359
    }
    return 0;
}
#undef ZONEREPORTLIMIT

// Give the file-system-space fields of env their initial values.
// The poller itself is not started here; it is only marked as not set up.
static void
env_fs_init(DB_ENV *env) {
    env->i->fs_state = FS_GREEN;
    env->i->fs_poll_time = 5;  // seconds
    env->i->redzone = 5;       // percent of total space
    env->i->fs_poller_is_init = false;
}

// Initialize the minicron that polls file system space
// Set up the minicron that runs env_fs_poller for this environment.
// The period passed is fs_poll_time * 1000 (fs_poll_time is in seconds, so
// this is presumably milliseconds -- confirm against toku_minicron_setup).
// Setup failure is asserted against, so the returned value is 0.
static int
env_fs_init_minicron(DB_ENV *env) {
    int r = toku_minicron_setup(&env->i->fs_poller, env->i->fs_poll_time*1000, env_fs_poller, env);
    assert(r == 0);
    env->i->fs_poller_is_init = true;
    return r;
}

// Destroy the file system space minicron
// Shut down the file-system-space minicron if it was set up; otherwise do
// nothing.  A shutdown failure is asserted against.
static void
env_fs_destroy(DB_ENV *env) {
    if (!env->i->fs_poller_is_init) {
        return;
    }
    int r = toku_minicron_shutdown(&env->i->fs_poller);
    assert(r == 0);
    env->i->fs_poller_is_init = false;
}

382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
static int
env_fsync_log_on_minicron(void *arg) {
    DB_ENV *env = (DB_ENV *) arg;
    int r = env->log_flush(env, 0);
    assert(r == 0);
    return 0;
}

// Give the fsync-log fields of env their initial values: the cron is marked
// as not set up and the period is 0.
static void
env_fsync_log_init(DB_ENV *env) {
    env->i->fsync_log_cron_is_init = false;
    env->i->fsync_log_period_ms = 0;
}

// Record a new fsync-log period (in ms) on env and, if the fsync-log cron
// has already been set up, apply the new period to it as well.
static void UU()
env_change_fsync_log_period(DB_ENV* env, uint32_t period_ms) {
    env->i->fsync_log_period_ms = period_ms;
    if (!env->i->fsync_log_cron_is_init) {
        return;  // cron not set up; only the stored period changes
    }
    toku_minicron_change_period(&env->i->fsync_log_cron, period_ms);
}

// Set up the minicron that runs env_fsync_log_on_minicron for env, using the
// period currently stored in fsync_log_period_ms.
// Setup failure is asserted against.
static void
env_fsync_log_cron_init(DB_ENV *env) {
    const int setup_result = toku_minicron_setup(&env->i->fsync_log_cron,
                                                 env->i->fsync_log_period_ms,
                                                 env_fsync_log_on_minicron,
                                                 env);
    assert(setup_result == 0);
    env->i->fsync_log_cron_is_init = true;
}

// Shut down the fsync-log minicron if it was set up; otherwise do nothing.
// A shutdown failure is asserted against.
static void
env_fsync_log_cron_destroy(DB_ENV *env) {
    if (!env->i->fsync_log_cron_is_init) {
        return;
    }
    const int shutdown_result = toku_minicron_shutdown(&env->i->fsync_log_cron);
    assert(shutdown_result == 0);
    env->i->fsync_log_cron_is_init = false;
}

420
static void
421 422 423
env_setup_real_dir(DB_ENV *env, char **real_dir, const char *nominal_dir) {
    toku_free(*real_dir);
    *real_dir = NULL;
424 425

    assert(env->i->dir);
426
    if (nominal_dir) 
427
        *real_dir = toku_construct_full_name(2, env->i->dir, nominal_dir);
428
    else
429 430 431 432 433 434
        *real_dir = toku_strdup(env->i->dir);
}

// Recompute env->i->real_data_dir from the configured data_dir.
static void
env_setup_real_data_dir(DB_ENV *env) {
    env_setup_real_dir(env, &env->i->real_data_dir, env->i->data_dir);
}

// Recompute env->i->real_log_dir from the configured lg_dir.
static void
env_setup_real_log_dir(DB_ENV *env) {
    env_setup_real_dir(env, &env->i->real_log_dir, env->i->lg_dir);
}

442 443 444 445 446
// Recompute env->i->real_tmp_dir from the configured tmp_dir.
static void
env_setup_real_tmp_dir(DB_ENV *env) {
    char **resolved_slot = &env->i->real_tmp_dir;
    env_setup_real_dir(env, resolved_slot, env->i->tmp_dir);
}

447 448 449 450 451
// Callback handed to recovery: store the given cachetable in the environment.
static void keep_cachetable_callback (DB_ENV *env, CACHETABLE cachetable)
{
    CACHETABLE kept = cachetable;
    env->i->cachetable = kept;
}

452 453
// Run recovery for the environment by calling tokudb_recover() with the
// environment's logger, directories, callbacks and cachetable size.
// Requires env->i->real_log_dir to be set (asserted).
// Returns whatever tokudb_recover() returns.
static int
ydb_do_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    return tokudb_recover(env,
                          toku_keep_prepared_txn_callback,
                          keep_cachetable_callback,
                          env->i->logger,
                          env->i->dir, env->i->real_log_dir, env->i->bt_compare,
                          env->i->update_function,
                          env->i->generate_row_for_put, env->i->generate_row_for_del,
                          env->i->cachetable_size);
}

466 467
// Report whether the environment's log directory needs recovery.
// Requires env->i->real_log_dir to be set (asserted).
// Returns DB_RUNRECOVERY if tokudb_needs_recovery() reports nonzero, else 0.
static int
needs_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    int recovery_needed = tokudb_needs_recovery(env->i->real_log_dir, true);
    if (recovery_needed) {
        return DB_RUNRECOVERY;
    }
    return 0;
}

Yoni Fogel's avatar
Yoni Fogel committed
473
static int toku_env_txn_checkpoint(DB_ENV * env, uint32_t kbyte, uint32_t min, uint32_t flags);
474 475 476 477 478 479

// Instruct db to use the default (built-in) key comparison function
// by setting the flag bits in the db and brt structs
// Switch db to the built-in key comparison by setting TOKU_DB_KEYCMP_BUILTIN
// in its ft handle's flags, and remember that a key comparison was chosen.
// Fails with EINVAL (reported through toku_ydb_do_error) if the db is
// already open or a key comparison was already set.
// Returns 0 on success.
static int
db_use_builtin_key_cmp(DB *db) {
    HANDLE_PANICKED_DB(db);
    if (db_opened(db)) {
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    }
    if (db->i->key_compare_was_set) {
        return toku_ydb_do_error(db->dbenv, EINVAL, "Key comparison function already set.\n");
    }

    // Read-modify-write of the flag word so the other flags are kept.
    uint32_t tflags;
    toku_ft_get_flags(db->i->ft_handle, &tflags);
    tflags |= TOKU_DB_KEYCMP_BUILTIN;
    toku_ft_set_flags(db->i->ft_handle, tflags);
    db->i->key_compare_was_set = true;
    return 0;
}

496 497
// Keys used in persistent environment dictionary:
// Following keys added in version 12
static const char * orig_env_ver_key = "original_version";
static const char * curr_env_ver_key = "current_version";
// Following keys added in version 14, add more keys for future versions
static const char * creation_time_key         = "creation_time";
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531

// Build the key "upgrade_v<version>_time".
// The result lives in a function-local static buffer, so each call
// overwrites the string returned by the previous call.
static char * get_upgrade_time_key(int version) {
    static char key_buf[sizeof("upgrade_v_time") + 12];
    const int written = snprintf(key_buf, sizeof key_buf, "upgrade_v%d_time", version);
    // The formatted key must fit, terminator included.
    assert(written >= 0 && written < (int) sizeof key_buf);
    return key_buf;
}

// Build the key "upgrade_v<version>_footprint".
// The result lives in a function-local static buffer, so each call
// overwrites the string returned by the previous call.
static char * get_upgrade_footprint_key(int version) {
    static char key_buf[sizeof("upgrade_v_footprint") + 12];
    const int written = snprintf(key_buf, sizeof key_buf, "upgrade_v%d_footprint", version);
    // The formatted key must fit, terminator included.
    assert(written >= 0 && written < (int) sizeof key_buf);
    return key_buf;
}

// Build the key "upgrade_v<version>_last_lsn".
// The result lives in a function-local static buffer, so each call
// overwrites the string returned by the previous call.
static char * get_upgrade_last_lsn_key(int version) {
    static char key_buf[sizeof("upgrade_v_last_lsn") + 12];
    const int written = snprintf(key_buf, sizeof key_buf, "upgrade_v%d_last_lsn", version);
    // The formatted key must fit, terminator included.
    assert(written >= 0 && written < (int) sizeof key_buf);
    return key_buf;
}
532 533 534

// Values read from (or written into) persistent environment,
// kept here for read-only access from engine status.
535 536 537 538 539 540 541 542 543 544 545
// Note, persistent_upgrade_status info is separate in part to simplify its exclusion from engine status until relevant.
// Row indexes into the persistent_upgrade_status array.  The final
// enumerator is not a row; it is the number of rows in the array.
typedef enum {
    PERSISTENT_UPGRADE_ORIGINAL_ENV_VERSION = 0,         // version at time of environment creation
    PERSISTENT_UPGRADE_STORED_ENV_VERSION_AT_STARTUP,    // read from curr_env_ver_key, prev version as of this startup
    PERSISTENT_UPGRADE_LAST_LSN_OF_V13,                  // last LSN of version 13
    PERSISTENT_UPGRADE_V14_TIME,                         // time of upgrade to version 14
    PERSISTENT_UPGRADE_V14_FOOTPRINT,                    // footprint from version 13 to 14
    PERSISTENT_UPGRADE_STATUS_NUM_ROWS                   // number of rows, not a row itself
} persistent_upgrade_status_entry;

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
546
    bool initialized;
547 548 549 550 551
    TOKU_ENGINE_STATUS_ROW_S status[PERSISTENT_UPGRADE_STATUS_NUM_ROWS];
} PERSISTENT_UPGRADE_STATUS_S, *PERSISTENT_UPGRADE_STATUS;

static PERSISTENT_UPGRADE_STATUS_S persistent_upgrade_status;

552
#define PERSISTENT_UPGRADE_STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(persistent_upgrade_status, k, c, t, "upgrade: " l, inc)
553 554 555 556 557 558

static void
persistent_upgrade_status_init (void) {
    // Note, this function initializes the keyname, type, and legend fields.
    // Value fields are initialized to zero by compiler.

559 560 561 562 563
    PERSISTENT_UPGRADE_STATUS_INIT(PERSISTENT_UPGRADE_ORIGINAL_ENV_VERSION,           nullptr, UINT64,   "original version (at time of environment creation)", TOKU_ENGINE_STATUS);
    PERSISTENT_UPGRADE_STATUS_INIT(PERSISTENT_UPGRADE_STORED_ENV_VERSION_AT_STARTUP,  nullptr, UINT64,   "version at time of startup", TOKU_ENGINE_STATUS);
    PERSISTENT_UPGRADE_STATUS_INIT(PERSISTENT_UPGRADE_LAST_LSN_OF_V13,                nullptr, UINT64,   "last LSN of version 13", TOKU_ENGINE_STATUS);
    PERSISTENT_UPGRADE_STATUS_INIT(PERSISTENT_UPGRADE_V14_TIME,                       nullptr, UNIXTIME, "time of upgrade to version 14", TOKU_ENGINE_STATUS);
    PERSISTENT_UPGRADE_STATUS_INIT(PERSISTENT_UPGRADE_V14_FOOTPRINT,                  nullptr, UINT64,   "footprint from version 13 to 14", TOKU_ENGINE_STATUS);
564 565 566 567
    persistent_upgrade_status.initialized = true;
}

#define PERSISTENT_UPGRADE_STATUS_VALUE(x) persistent_upgrade_status.status[x].value.num
568 569 570 571

// Requires: persistent environment dictionary is already open.
// Input arg is lsn of clean shutdown of previous version,
// or ZERO_LSN if no upgrade or if crash between log upgrade and here.
572 573 574 575 576
// NOTE: To maintain compatibility with previous versions, do not change the 
//       format of any information stored in the persistent environment dictionary.
//       For example, some values are stored as 32 bits, even though they are immediately
//       converted to 64 bits when read.  Do not change them to be stored as 64 bits.
//
577
static int
578
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn, LSN last_lsn_of_clean_shutdown_read_from_log) {
579 580
    int r;
    DBT key, val;
581
    DB *persistent_environment = env->i->persistent_environment;
582

583 584 585
    if (!persistent_upgrade_status.initialized)
        persistent_upgrade_status_init();

586
    toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
587
    toku_init_dbt(&val);
588
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
589
    assert(r == 0);
590
    uint32_t stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
591
    PERSISTENT_UPGRADE_STATUS_VALUE(PERSISTENT_UPGRADE_STORED_ENV_VERSION_AT_STARTUP) = stored_env_version;
592
    if (stored_env_version > FT_LAYOUT_VERSION)
593
        r = TOKUDB_DICTIONARY_TOO_NEW;
594
    else if (stored_env_version < FT_LAYOUT_MIN_SUPPORTED_VERSION)
595
        r = TOKUDB_DICTIONARY_TOO_OLD;
596 597
    else if (stored_env_version < FT_LAYOUT_VERSION) {
        const uint32_t curr_env_ver_d = toku_htod32(FT_LAYOUT_VERSION);
598
        toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
599
        toku_fill_dbt(&val, &curr_env_ver_d, sizeof(curr_env_ver_d));
Yoni Fogel's avatar
Yoni Fogel committed
600
        r = toku_db_put(persistent_environment, txn, &key, &val, 0, false);
601
        assert_zero(r);
602

603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635
        time_t upgrade_time_d = toku_htod64(time(NULL));
        uint64_t upgrade_footprint_d = toku_htod64(toku_log_upgrade_get_footprint());
        uint64_t upgrade_last_lsn_d = toku_htod64(last_lsn_of_clean_shutdown_read_from_log.lsn);
        for (int version = stored_env_version+1; version <= FT_LAYOUT_VERSION; version++) {
            uint32_t put_flag = DB_NOOVERWRITE;
            if (version <= FT_LAYOUT_VERSION_19) {
                // See #5902.
                // To prevent a crash (and any higher complexity code) we'll simply
                // silently not overwrite anything if it exists.
                // The keys existing for version <= 19 is not necessarily an error.
                // If this happens for versions > 19 it IS an error and we'll use DB_NOOVERWRITE.
                put_flag = DB_NOOVERWRITE_NO_ERROR;
            }


            char* upgrade_time_key = get_upgrade_time_key(version);
            toku_fill_dbt(&key, upgrade_time_key, strlen(upgrade_time_key));
            toku_fill_dbt(&val, &upgrade_time_d, sizeof(upgrade_time_d));
            r = toku_db_put(persistent_environment, txn, &key, &val, put_flag, false);
            assert_zero(r);

            char* upgrade_footprint_key = get_upgrade_footprint_key(version);
            toku_fill_dbt(&key, upgrade_footprint_key, strlen(upgrade_footprint_key));
            toku_fill_dbt(&val, &upgrade_footprint_d, sizeof(upgrade_footprint_d));
            r = toku_db_put(persistent_environment, txn, &key, &val, put_flag, false);
            assert_zero(r);

            char* upgrade_last_lsn_key = get_upgrade_last_lsn_key(version);
            toku_fill_dbt(&key, upgrade_last_lsn_key, strlen(upgrade_last_lsn_key));
            toku_fill_dbt(&val, &upgrade_last_lsn_d, sizeof(upgrade_last_lsn_d));
            r = toku_db_put(persistent_environment, txn, &key, &val, put_flag, false);
            assert_zero(r);
        }
636

637
    }
638
    return r;
639 640
}

Barry Perlman's avatar
Barry Perlman committed
641
// Capture contents of persistent_environment dictionary so that it can be read by engine status
642
static void
Barry Perlman's avatar
Barry Perlman committed
643
capture_persistent_env_contents (DB_ENV * env, DB_TXN * txn) {
644 645 646 647 648 649 650
    int r;
    DBT key, val;
    DB *persistent_environment = env->i->persistent_environment;

    toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
    toku_init_dbt(&val);
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
651
    assert_zero(r);
652
    uint32_t curr_env_version = toku_dtoh32(*(uint32_t*)val.data);
653
    assert(curr_env_version == FT_LAYOUT_VERSION);
654 655 656 657

    toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
    toku_init_dbt(&val);
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
658
    assert_zero(r);
659 660
    uint64_t persistent_original_env_version = toku_dtoh32(*(uint32_t*)val.data);
    PERSISTENT_UPGRADE_STATUS_VALUE(PERSISTENT_UPGRADE_ORIGINAL_ENV_VERSION) = persistent_original_env_version;
661 662 663
    assert(persistent_original_env_version <= curr_env_version);

    // make no assertions about timestamps, clock may have been reset
664
    if (persistent_original_env_version >= FT_LAYOUT_VERSION_14) {
665 666 667
        toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
        toku_init_dbt(&val);
        r = toku_db_get(persistent_environment, txn, &key, &val, 0);
668
        assert_zero(r);
669
        STATUS_VALUE(YDB_LAYER_TIME_CREATION) = toku_dtoh64((*(time_t*)val.data));
670 671 672
    }

    if (persistent_original_env_version != curr_env_version) {
673
        // an upgrade was performed at some time, capture info about the upgrade
674 675 676

        char * last_lsn_key = get_upgrade_last_lsn_key(curr_env_version);
        toku_fill_dbt(&key, last_lsn_key, strlen(last_lsn_key));
677 678
        toku_init_dbt(&val);
        r = toku_db_get(persistent_environment, txn, &key, &val, 0);
679 680
        assert_zero(r);
        PERSISTENT_UPGRADE_STATUS_VALUE(PERSISTENT_UPGRADE_LAST_LSN_OF_V13) = toku_dtoh64(*(uint64_t*)val.data);
681

682 683
        char * time_key = get_upgrade_time_key(curr_env_version);
        toku_fill_dbt(&key, time_key, strlen(time_key));
684 685
        toku_init_dbt(&val);
        r = toku_db_get(persistent_environment, txn, &key, &val, 0);
686 687
        assert_zero(r);
        PERSISTENT_UPGRADE_STATUS_VALUE(PERSISTENT_UPGRADE_V14_TIME) = toku_dtoh64(*(time_t*)val.data);
688

689 690
        char * footprint_key = get_upgrade_footprint_key(curr_env_version);
        toku_fill_dbt(&key, footprint_key, strlen(footprint_key));
691 692
        toku_init_dbt(&val);
        r = toku_db_get(persistent_environment, txn, &key, &val, 0);
693 694
        assert_zero(r);
        PERSISTENT_UPGRADE_STATUS_VALUE(PERSISTENT_UPGRADE_V14_FOOTPRINT) = toku_dtoh64(*(uint64_t*)val.data);
695 696 697 698
    }

}

699 700 701
// Report whether a recovery log is present in the environment's log directory.
// The result of tokudb_recover_log_exists() is passed straight through:
// 0 if the log exists, ENOENT if it does not.
static int
ydb_recover_log_exists(DB_ENV *env) {
    return tokudb_recover_log_exists(env->i->real_log_dir);
}

// Validate that all required files are present, no side effects.
707 708
// Return 0 if all is well, ENOENT if some files are present but at least one is missing, 
// other non-zero value if some other error occurs.
709 710 711
// Set *valid_newenv if creating a new environment (all files missing).
// (Note, if special dictionaries exist, then they were created transactionally and log should exist.)
static int 
Yoni Fogel's avatar
Yoni Fogel committed
712
validate_env(DB_ENV * env, bool * valid_newenv, bool need_rollback_cachefile) {
713
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
714
    bool expect_newenv = false;        // set true if we expect to create a new env
715 716
    toku_struct_stat buf;
    char* path = NULL;
717

718
    // Test for persistent environment
719
    path = toku_construct_full_name(2, env->i->dir, toku_product_name_strings.environmentdictionary);
720 721
    assert(path);
    r = toku_stat(path, &buf);
722
    if (r == 0) {
Yoni Fogel's avatar
Yoni Fogel committed
723
        expect_newenv = false;  // persistent info exists
724
    }
725
    else {
726 727
        int stat_errno = get_error_errno();
        if (stat_errno == ENOENT) {
Yoni Fogel's avatar
Yoni Fogel committed
728
            expect_newenv = true;
729 730 731 732 733 734
            r = 0;
        }
        else {
            r = toku_ydb_do_error(env, stat_errno, "Unable to access persistent environment\n");
            assert(r);
        }
735
    }
736
    toku_free(path);
737

738
    // Test for existence of rollback cachefile if it is expected to exist
739
    if (r == 0 && need_rollback_cachefile) {
740
        path = toku_construct_full_name(2, env->i->dir, toku_product_name_strings.rollback_cachefile);
741 742 743 744 745 746 747
        assert(path);
        r = toku_stat(path, &buf);
        if (r == 0) {  
            if (expect_newenv)  // rollback cachefile exists, but persistent env is missing
                r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
        }
        else {
748 749 750 751 752 753 754 755 756 757 758
            int stat_errno = get_error_errno();
            if (stat_errno == ENOENT) {
                if (!expect_newenv)  // rollback cachefile is missing but persistent env exists
                    r = toku_ydb_do_error(env, ENOENT, "rollback cachefile directory is missing\n");
                else 
                    r = 0;           // both rollback cachefile and persistent env are missing
            }
            else {
                r = toku_ydb_do_error(env, stat_errno, "Unable to access rollback cachefile\n");
                assert(r);
            }
759
        }
760
        toku_free(path);
761 762
    }

763 764
    // Test for fileops directory
    if (r == 0) {
765
        path = toku_construct_full_name(2, env->i->dir, toku_product_name_strings.fileopsdirectory);
766 767 768 769 770 771 772
        assert(path);
        r = toku_stat(path, &buf);
        if (r == 0) {  
            if (expect_newenv)  // fileops directory exists, but persistent env is missing
                r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
        }
        else {
773 774 775 776 777 778 779 780 781 782 783
            int stat_errno = get_error_errno();
            if (stat_errno == ENOENT) {
                if (!expect_newenv)  // fileops directory is missing but persistent env exists
                    r = toku_ydb_do_error(env, ENOENT, "Fileops directory is missing\n");
                else 
                    r = 0;           // both fileops directory and persistent env are missing
            }
            else {
                r = toku_ydb_do_error(env, stat_errno, "Unable to access fileops directory\n");
                assert(r);
            }
784
        }
785
        toku_free(path);
786 787 788
    }

    // Test for recovery log
789
    if ((r == 0) && (env->i->open_flags & DB_INIT_LOG)) {
790 791 792 793 794 795 796 797
        // if using transactions, test for existence of log
        r = ydb_recover_log_exists(env);  // return 0 or ENOENT
        if (expect_newenv && (r != ENOENT))
            r = toku_ydb_do_error(env, ENOENT, "Persistent environment information is missing (but log exists)\n");
        else if (!expect_newenv && r == ENOENT)
            r = toku_ydb_do_error(env, ENOENT, "Recovery log is missing (persistent environment information is present)\n");
        else
            r = 0;
798 799 800
    }

    if (r == 0)
801
        *valid_newenv = expect_newenv;
802
    else 
Yoni Fogel's avatar
Yoni Fogel committed
803
        *valid_newenv = false;
804 805 806
    return r;
}

Barry Perlman's avatar
Barry Perlman committed
807 808 809 810
// The version of the environment (on disk) is the version of the recovery log.  
// If the recovery log is of the current version, then there is no upgrade to be done.  
// If the recovery log is of an old version, then replacing it with a new recovery log
// of the current version is how the upgrade is done.  
Barry Perlman's avatar
Barry Perlman committed
811
// Note, the upgrade procedure takes a checkpoint, so we must release the ydb lock.
812
static int
Yoni Fogel's avatar
Yoni Fogel committed
813
ydb_maybe_upgrade_env (DB_ENV *env, LSN * last_lsn_of_clean_shutdown_read_from_log, bool * upgrade_in_progress) {
814 815
    int r = 0;
    if (env->i->open_flags & DB_INIT_TXN && env->i->open_flags & DB_INIT_LOG) {
816
        r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir, last_lsn_of_clean_shutdown_read_from_log, upgrade_in_progress);
817 818 819 820
    }
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
821 822 823
// Release the four single-process directory locks held in the environment
// (environment, data, log and temp directories, in that order).
// Each release is checked with lazy_assert_zero().
static void
unlock_single_process(DB_ENV *env) {
    int rc;

    rc = toku_single_process_unlock(&env->i->envdir_lockfd);
    lazy_assert_zero(rc);

    rc = toku_single_process_unlock(&env->i->datadir_lockfd);
    lazy_assert_zero(rc);

    rc = toku_single_process_unlock(&env->i->logdir_lockfd);
    lazy_assert_zero(rc);

    rc = toku_single_process_unlock(&env->i->tmpdir_lockfd);
    lazy_assert_zero(rc);
}
833 834 835 836 837 838

// Open the environment.
// If this is a new environment, then create the necessary files.
// Return 0 on success, ENOENT if any of the expected necessary files are missing.
// (The set of necessary files is defined in the function validate_env() above.)
static int 
Yoni Fogel's avatar
Yoni Fogel committed
839
env_open(DB_ENV * env, const char *home, uint32_t flags, int mode) {
840
    HANDLE_PANICKED_ENV(env);
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
841
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
842 843
    bool newenv;  // true iff creating a new environment
    uint32_t unused_flags=flags;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
844
    CHECKPOINTER cp;
845
    DB_TXN *txn = NULL;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
846

847
    if (env_opened(env)) {
848
        r = toku_ydb_do_error(env, EINVAL, "The environment is already open\n");
Yoni Fogel's avatar
Yoni Fogel committed
849
        goto cleanup;
850
    }
Yoni Fogel's avatar
Yoni Fogel committed
851

852 853
    most_recent_env = NULL;

854 855
    assert(sizeof(time_t) == sizeof(uint64_t));

856 857 858
    HANDLE_EXTRA_FLAGS(env, flags, 
                       DB_CREATE|DB_PRIVATE|DB_INIT_LOG|DB_INIT_TXN|DB_RECOVER|DB_INIT_MPOOL|DB_INIT_LOCK|DB_THREAD);

859 860
    // DB_CREATE means create if env does not exist, and Tokudb requires it because
    // Tokudb requries DB_PRIVATE.
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
861
    if ((flags & DB_PRIVATE) && !(flags & DB_CREATE)) {
862
        r = toku_ydb_do_error(env, ENOENT, "DB_PRIVATE requires DB_CREATE (seems gratuitous to us, but that's BDB's behavior\n");
Yoni Fogel's avatar
Yoni Fogel committed
863
        goto cleanup;
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
864 865
    }

866
    if (!(flags & DB_PRIVATE)) {
867
        r = toku_ydb_do_error(env, ENOENT, "TokuDB requires DB_PRIVATE\n");
Yoni Fogel's avatar
Yoni Fogel committed
868
        goto cleanup;
Yoni Fogel's avatar
Yoni Fogel committed
869
    }
870

Yoni Fogel's avatar
Yoni Fogel committed
871
    if ((flags & DB_INIT_LOG) && !(flags & DB_INIT_TXN)) {
872
        r = toku_ydb_do_error(env, EINVAL, "TokuDB requires transactions for logging\n");
Yoni Fogel's avatar
Yoni Fogel committed
873 874
        goto cleanup;
    }
875

876
    if (!home) home = ".";
Yoni Fogel's avatar
Yoni Fogel committed
877

878
    // Verify that the home exists.
879 880 881 882 883 884
    toku_struct_stat buf;
    r = toku_stat(home, &buf);
    if (r != 0) {
        int e = get_error_errno();
        r = toku_ydb_do_error(env, e, "Error from toku_stat(\"%s\",...)\n", home);
        goto cleanup;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
885
    }
886
    unused_flags &= ~DB_PRIVATE;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
887

888
    if (env->i->dir) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
889
        toku_free(env->i->dir);
890
    }
Yoni Fogel's avatar
Yoni Fogel committed
891
    env->i->dir = toku_strdup(home);
892
    if (env->i->dir == 0) {
893
        r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
Yoni Fogel's avatar
Yoni Fogel committed
894
        goto cleanup;
895
    }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
896 897
    env->i->open_flags = flags;
    env->i->open_mode = mode;
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
898

899 900
    env_setup_real_data_dir(env);
    env_setup_real_log_dir(env);
901
    env_setup_real_tmp_dir(env);
902

903
    r = toku_single_process_lock(env->i->dir, "environment", &env->i->envdir_lockfd);
Yoni Fogel's avatar
Yoni Fogel committed
904
    if (r!=0) goto cleanup;
905
    r = toku_single_process_lock(env->i->real_data_dir, "data", &env->i->datadir_lockfd);
Yoni Fogel's avatar
Yoni Fogel committed
906
    if (r!=0) goto cleanup;
907
    r = toku_single_process_lock(env->i->real_log_dir, "logs", &env->i->logdir_lockfd);
Yoni Fogel's avatar
Yoni Fogel committed
908
    if (r!=0) goto cleanup;
909
    r = toku_single_process_lock(env->i->real_tmp_dir, "temp", &env->i->tmpdir_lockfd);
Yoni Fogel's avatar
Yoni Fogel committed
910 911
    if (r!=0) goto cleanup;

Yoni Fogel's avatar
Yoni Fogel committed
912 913
    bool need_rollback_cachefile;
    need_rollback_cachefile = false;
914
    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
Yoni Fogel's avatar
Yoni Fogel committed
915
        need_rollback_cachefile = true;
916 917
    }

918
    ydb_layer_status_init();  // do this before possibly upgrading, so upgrade work is counted in status counters
919

920 921
    LSN last_lsn_of_clean_shutdown_read_from_log;
    last_lsn_of_clean_shutdown_read_from_log = ZERO_LSN;
Yoni Fogel's avatar
Yoni Fogel committed
922 923
    bool upgrade_in_progress;
    upgrade_in_progress = false;
924
    r = ydb_maybe_upgrade_env(env, &last_lsn_of_clean_shutdown_read_from_log, &upgrade_in_progress);
Yoni Fogel's avatar
Yoni Fogel committed
925
    if (r!=0) goto cleanup;
926

927
    if (upgrade_in_progress) {
928 929
        // Delete old rollback file.  There was a clean shutdown, so it has nothing useful,
        // and there is no value in upgrading it.  It is simpler to just create a new one.
930
        char* rollback_filename = toku_construct_full_name(2, env->i->dir, toku_product_name_strings.rollback_cachefile);
931 932
        assert(rollback_filename);
        r = unlink(rollback_filename);
933 934 935
        if (r != 0) {
            assert(get_error_errno() == ENOENT);
        }
936
        toku_free(rollback_filename);
Yoni Fogel's avatar
Yoni Fogel committed
937
        need_rollback_cachefile = false;  // we're not expecting it to exist now
938 939
    }
    
940
    r = validate_env(env, &newenv, need_rollback_cachefile);  // make sure that environment is either new or complete
Yoni Fogel's avatar
Yoni Fogel committed
941
    if (r != 0) goto cleanup;
942

943
    unused_flags &= ~DB_INIT_TXN & ~DB_INIT_LOG;
944

945 946 947 948 949 950 951
    // do recovery only if there exists a log and recovery is requested
    // otherwise, a log is created when the logger is opened later
    if (!newenv) {
        if (flags & DB_INIT_LOG) {
            // the log does exist
            if (flags & DB_RECOVER) {
                r = ydb_do_recovery(env);
Yoni Fogel's avatar
Yoni Fogel committed
952
                if (r != 0) goto cleanup;
953 954 955
            } else {
                // the log is required to have clean shutdown if recovery is not requested
                r = needs_recovery(env);
Yoni Fogel's avatar
Yoni Fogel committed
956
                if (r != 0) goto cleanup;
957
            }
958 959
        }
    }
960 961
    
    toku_loader_cleanup_temp_files(env);
962

Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
963
    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
964
        assert(env->i->logger);
Yoni Fogel's avatar
Yoni Fogel committed
965
        toku_logger_write_log_files(env->i->logger, (bool)((flags & DB_INIT_LOG) != 0));
966 967 968 969 970 971
        if (!toku_logger_is_open(env->i->logger)) {
            r = toku_logger_open(env->i->real_log_dir, env->i->logger);
            if (r!=0) {
                toku_ydb_do_error(env, r, "Could not open logger\n");
            }
        }
972
    } else {
973 974
        r = toku_logger_close(&env->i->logger); // if no logging system, then kill the logger
        assert_zero(r);
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
975 976
    }

977 978 979 980 981 982 983
    unused_flags &= ~DB_INIT_MPOOL; // we always init an mpool.
    unused_flags &= ~DB_CREATE;     // we always do DB_CREATE
    unused_flags &= ~DB_INIT_LOCK;  // we check this later (e.g. in db->open)
    unused_flags &= ~DB_RECOVER;

// This is probably correct, but it will be pain...
//    if ((flags & DB_THREAD)==0) {
984 985
//        r = toku_ydb_do_error(env, EINVAL, "TokuDB requires DB_THREAD");
//        goto cleanup;
986 987 988 989
//    }
    unused_flags &= ~DB_THREAD;

    if (unused_flags!=0) {
990
        r = toku_ydb_do_error(env, EINVAL, "Extra flags not understood by tokudb: %u\n", unused_flags);
Yoni Fogel's avatar
Yoni Fogel committed
991
        goto cleanup;
992 993
    }

994 995
    if (env->i->cachetable==NULL) {
        // If we ran recovery then the cachetable should be set here.
996
        toku_cachetable_create(&env->i->cachetable, env->i->cachetable_size, ZERO_LSN, env->i->logger);
997
    }
998

Yoni Fogel's avatar
Yoni Fogel committed
999 1000
    toku_cachetable_set_env_dir(env->i->cachetable, env->i->dir);

1001 1002
    int using_txns;
    using_txns = env->i->open_flags & DB_INIT_TXN;
1003
    if (env->i->logger) {
1004 1005 1006 1007
        // if this is a newborn env or if this is an upgrade, then create a brand new rollback file
        assert (using_txns);
        toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
        if (!toku_logger_rollback_is_open(env->i->logger)) {
Yoni Fogel's avatar
Yoni Fogel committed
1008
            bool create_new_rollback_file = newenv | upgrade_in_progress;
1009
            r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, create_new_rollback_file);
1010 1011 1012 1013
            if (r != 0) {
                r = toku_ydb_do_error(env, r, "cant open rollback");
                goto cleanup;
            }
1014
        }
1015
    }
1016

1017
    if (using_txns) {
1018
        r = toku_txn_begin(env, 0, &txn, 0);
1019
        assert_zero(r);
1020
    }
1021

1022 1023
    {
        r = toku_db_create(&env->i->persistent_environment, env, 0);
1024
        assert_zero(r);
1025
        r = db_use_builtin_key_cmp(env->i->persistent_environment);
1026
        assert_zero(r);
1027
        r = toku_db_open_iname(env->i->persistent_environment, txn, toku_product_name_strings.environmentdictionary, DB_CREATE, mode);
1028 1029 1030 1031
        if (r != 0) {
            r = toku_ydb_do_error(env, r, "cant open persistent env");
            goto cleanup;
        }
1032 1033 1034 1035 1036 1037 1038 1039
        if (newenv) {
            // create new persistent_environment
            DBT key, val;
            uint32_t persistent_original_env_version = FT_LAYOUT_VERSION;
            const uint32_t environment_version = toku_htod32(persistent_original_env_version);

            toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
Yoni Fogel's avatar
Yoni Fogel committed
1040
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, false);
1041 1042 1043 1044
            assert_zero(r);

            toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
Yoni Fogel's avatar
Yoni Fogel committed
1045
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, false);
1046 1047 1048 1049 1050
            assert_zero(r);

            time_t creation_time_d = toku_htod64(time(NULL));
            toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
            toku_fill_dbt(&val, &creation_time_d, sizeof(creation_time_d));
Yoni Fogel's avatar
Yoni Fogel committed
1051
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, false);
1052 1053 1054 1055 1056 1057 1058
            assert_zero(r);
        }
        else {
            r = maybe_upgrade_persistent_environment_dictionary(env, txn, last_lsn_of_clean_shutdown_read_from_log);
            assert_zero(r);
        }
        capture_persistent_env_contents(env, txn);
1059 1060 1061
    }
    {
        r = toku_db_create(&env->i->directory, env, 0);
1062
        assert_zero(r);
1063
        r = db_use_builtin_key_cmp(env->i->directory);
1064
        assert_zero(r);
1065
        r = toku_db_open_iname(env->i->directory, txn, toku_product_name_strings.fileopsdirectory, DB_CREATE, mode);
1066 1067 1068 1069
        if (r != 0) {
            r = toku_ydb_do_error(env, r, "cant open %s", toku_product_name_strings.fileopsdirectory);
            goto cleanup;
        }
1070 1071
    }
    if (using_txns) {
1072
        r = locked_txn_commit(txn, 0);
1073
        assert_zero(r);
1074
    }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1075 1076
    cp = toku_cachetable_get_checkpointer(env->i->cachetable);
    r = toku_checkpoint(cp, env->i->logger, NULL, NULL, NULL, NULL, STARTUP_CHECKPOINT);
1077
    assert_zero(r);
1078
    env_fs_poller(env);          // get the file system state at startup
1079 1080
    env_fs_init_minicron(env);
    env_fsync_log_cron_init(env);
Yoni Fogel's avatar
Yoni Fogel committed
1081 1082
cleanup:
    if (r!=0) {
1083 1084 1085
        if (txn) {
            locked_txn_abort(txn);
        }
Yoni Fogel's avatar
Yoni Fogel committed
1086 1087 1088 1089
        if (env && env->i) {
            unlock_single_process(env);
        }
    }
1090
    if (r == 0) {
1091
        set_errno(0); // tabula rasa.   If there's a crash after env was successfully opened, no misleading errno will have been left around by this code.
1092
        most_recent_env = env;
1093 1094
        uint64_t num_rows;
        env_get_engine_status_num_rows(env, &num_rows);
1095
        toku_assert_set_fpointers(toku_maybe_get_engine_status_text, toku_maybe_set_env_panic, num_rows);
1096
    }
Yoni Fogel's avatar
Yoni Fogel committed
1097
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1098
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
1099

1100
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1101
env_close(DB_ENV * env, uint32_t flags) {
1102
    int r = 0;
1103
    const char * err_msg = NULL;
1104

1105 1106
    most_recent_env = NULL; // Set most_recent_env to NULL so that we don't have a dangling pointer (and if there's an error, the toku assert code would try to look at the env.)

1107
    // if panicked, or if any open transactions, or any open dbs, then do nothing.
1108

1109 1110 1111
    if (toku_env_is_panicked(env)) {
        goto panic_and_quit_early;
    }
1112
    if (env->i->logger && toku_logger_txns_exist(env->i->logger)) {
1113
        err_msg = "Cannot close environment due to open transactions\n";
1114
        r = toku_ydb_do_error(env, EINVAL, "%s", err_msg);
1115 1116
        goto panic_and_quit_early;
    }
1117
    if (env->i->open_dbs) { //Verify that there are no open dbs.
1118
        if (toku_omt_size(env->i->open_dbs) > 0) {
1119
            err_msg = "Cannot close environment due to open DBs\n";
1120
            r = toku_ydb_do_error(env, EINVAL, "%s", err_msg);
1121 1122
            goto panic_and_quit_early;
        }
1123
    }
1124 1125 1126 1127 1128 1129
    if (env->i->persistent_environment) {
        r = toku_db_close(env->i->persistent_environment);
        if (r) {
            err_msg = "Cannot close persistent environment dictionary (DB->close error)\n";
            toku_ydb_do_error(env, r, "%s", err_msg);
            goto panic_and_quit_early;
1130
        }
1131 1132 1133 1134 1135 1136 1137
    }
    if (env->i->directory) {
        r = toku_db_close(env->i->directory);
        if (r) {
            err_msg = "Cannot close Directory dictionary (DB->close error)\n";
            toku_ydb_do_error(env, r, "%s", err_msg);
            goto panic_and_quit_early;
1138 1139
        }
    }
1140
    if (env->i->cachetable) {
1141
        toku_cachetable_minicron_shutdown(env->i->cachetable);
1142
        if (env->i->logger) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1143 1144
            CHECKPOINTER cp = toku_cachetable_get_checkpointer(env->i->cachetable);
            r = toku_checkpoint(cp, env->i->logger, NULL, NULL, NULL, NULL, SHUTDOWN_CHECKPOINT);
1145
            if (r) {
1146
                err_msg = "Cannot close environment (error during checkpoint)\n";
1147
                toku_ydb_do_error(env, r, "%s", err_msg);
1148 1149
                goto panic_and_quit_early;
            }
1150
            toku_logger_close_rollback(env->i->logger);
1151
            //Do a second checkpoint now that the rollback cachefile is closed.
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1152
            r = toku_checkpoint(cp, env->i->logger, NULL, NULL, NULL, NULL, SHUTDOWN_CHECKPOINT);
1153
            if (r) {
1154
                err_msg = "Cannot close environment (error during checkpoint)\n";
1155
                toku_ydb_do_error(env, r, "%s", err_msg);
1156 1157
                goto panic_and_quit_early;
            }
1158
            toku_logger_shutdown(env->i->logger); 
1159
        }
1160
        toku_cachetable_close(&env->i->cachetable);
1161 1162
    }
    if (env->i->logger) {
1163
        r = toku_logger_close(&env->i->logger);
1164 1165
        if (r) {
            err_msg = "Cannot close environment (logger close error)\n";
1166
            env->i->logger = NULL;
1167
            toku_ydb_do_error(env, r, "%s", err_msg);
1168
            goto panic_and_quit_early;
1169
        }
1170 1171 1172
    }
    // Even if nothing else went wrong, but we were panicked, then raise an error.
    // But if something else went wrong then raise that error (above)
1173
    if (toku_env_is_panicked(env)) {
1174
        goto panic_and_quit_early;
1175 1176 1177
    } else {
        assert(env->i->panic_string == 0);
    }
1178

1179
    env_fs_destroy(env);
1180
    env_fsync_log_cron_destroy(env);
1181
    env->i->ltm.destroy();
1182 1183
    if (env->i->data_dir)
        toku_free(env->i->data_dir);
1184 1185
    if (env->i->lg_dir)
        toku_free(env->i->lg_dir);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1186 1187
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
1188
    if (env->i->real_data_dir)
1189
        toku_free(env->i->real_data_dir);
1190
    if (env->i->real_log_dir)
1191
        toku_free(env->i->real_log_dir);
1192
    if (env->i->real_tmp_dir)
1193
        toku_free(env->i->real_tmp_dir);
1194 1195
    if (env->i->open_dbs)
        toku_omt_destroy(&env->i->open_dbs);
1196
    if (env->i->dir)
1197
        toku_free(env->i->dir);
1198 1199 1200
    toku_mutex_destroy(&env->i->open_dbs_lock);

    // Immediately before freeing internal environment unlock the directories
1201
    unlock_single_process(env);
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
1202 1203
    toku_free(env->i);
    toku_free(env);
1204
    toku_sync_fetch_and_add(&tokudb_num_envs, -1);
1205
    if (flags != 0) {
1206
        r = EINVAL;
1207
    }
1208 1209 1210
    return r;

panic_and_quit_early:
Yoni Fogel's avatar
Yoni Fogel committed
1211 1212
    //release lock files.
    unlock_single_process(env);
1213 1214 1215 1216 1217
    //r is the panic error
    if (toku_env_is_panicked(env)) {
        char *panic_string = env->i->panic_string;
        r = toku_ydb_do_error(env, toku_env_is_panicked(env), "Cannot close environment due to previous error: %s\n", panic_string);
    }
1218
    else {
1219
        env_panic(env, r, err_msg);
1220
    }
1221
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1222
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1223

1224
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1225
env_log_archive(DB_ENV * env, char **list[], uint32_t flags) {
1226
    return toku_logger_log_archive(env->i->logger, list, flags);
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1227
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1228

1229
static int 
1230
env_log_flush(DB_ENV * env, const DB_LSN * lsn __attribute__((__unused__))) {
1231
    HANDLE_PANICKED_ENV(env);
1232 1233 1234 1235 1236 1237
    // do nothing if no logger
    if (env->i->logger) {
        // We just flush everything. MySQL uses lsn == 0 which means flush everything. 
        // For anyone else using the log, it is correct to flush too much, so we are OK.
        toku_logger_fsync(env->i->logger);
    }
1238
    return 0;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1239
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1240

1241
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1242
env_set_cachesize(DB_ENV * env, uint32_t gbytes, uint32_t bytes, int ncache) {
1243
    HANDLE_PANICKED_ENV(env);
1244
    if (ncache != 1) {
1245
        return EINVAL;
1246
    }
Yoni Fogel's avatar
Yoni Fogel committed
1247
    uint64_t cs64 = ((uint64_t) gbytes << 30) + bytes;
1248
    unsigned long cs = cs64;
1249
    if (cs64 > cs) {
1250
        return EINVAL;
1251
    }
1252
    env->i->cachetable_size = cs;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1253 1254 1255
    return 0;
}

1256 1257
static int env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, uint32_t flags);

// Wrapper around env_dbremove().  When the environment uses transactions the
// removal runs in a child of txn, which is committed if the removal succeeds
// and aborted if it fails.  The multi-operation client lock is held across the
// removal.  Returns the result of env_dbremove().
static int
locked_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, uint32_t flags) {
    int txn_r, r;
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);
    HANDLE_READ_ONLY_TXN(txn);

    DB_TXN *child_txn = NULL;
    int using_txns = env->i->open_flags & DB_INIT_TXN;
    if (using_txns) {
        txn_r = toku_txn_begin(env, txn, &child_txn, 0);
        lazy_assert_zero(txn_r);
    }

    // cannot begin a checkpoint
    toku_multi_operation_client_lock();
    r = env_dbremove(env, child_txn, fname, dbname, flags);
    toku_multi_operation_client_unlock();

    if (using_txns) {
        // resolve the child according to the outcome of the removal
        txn_r = (r == 0) ? locked_txn_commit(child_txn, 0)
                         : locked_txn_abort(child_txn);
        lazy_assert_zero(txn_r);
    }
    return r;
}

1288 1289
static int env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, uint32_t flags);

// Wrapper around env_dbrename().  When the environment uses transactions the
// rename runs in a child of txn, which is committed if the rename succeeds and
// aborted if it fails.  The multi-operation client lock is held across the
// rename.  Returns the result of env_dbrename().
static int
locked_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, uint32_t flags) {
    int txn_r, r;
    HANDLE_READ_ONLY_TXN(txn);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);

    DB_TXN *child_txn = NULL;
    int using_txns = env->i->open_flags & DB_INIT_TXN;
    if (using_txns) {
        txn_r = toku_txn_begin(env, txn, &child_txn, 0);
        lazy_assert_zero(txn_r);
    }

    // cannot begin a checkpoint
    toku_multi_operation_client_lock();
    r = env_dbrename(env, child_txn, fname, dbname, newname, flags);
    toku_multi_operation_client_unlock();

    if (using_txns) {
        // resolve the child according to the outcome of the rename
        txn_r = (r == 0) ? locked_txn_commit(child_txn, 0)
                         : locked_txn_abort(child_txn);
        lazy_assert_zero(txn_r);
    }
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
1320 1321
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3

// Report the configured cachetable size, split into whole gigabytes
// (*gbytes) and the remainder below 2^30 (*bytes).  *ncache is always 1.
// Returns 0 once past the panic check.
static int 
env_get_cachesize(DB_ENV * env, uint32_t *gbytes, uint32_t *bytes, int *ncache) {
    HANDLE_PANICKED_ENV(env);
    *ncache = 1;
    *gbytes = env->i->cachetable_size >> 30;           // whole gigabytes
    *bytes = env->i->cachetable_size & ((1<<30)-1);    // low 30 bits
    return 0;
}

#endif

1333
static int 
1334
env_set_data_dir(DB_ENV * env, const char *dir) {
1335
    HANDLE_PANICKED_ENV(env);
Yoni Fogel's avatar
Yoni Fogel committed
1336 1337
    int r;
    
1338
    if (env_opened(env) || !dir) {
1339
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir after opening the env\n");
1340
    }
1341
    else if (env->i->data_dir)
1342
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir more than once.\n");
1343 1344 1345
    else {
        env->i->data_dir = toku_strdup(dir);
        if (env->i->data_dir==NULL) {
1346
            assert(get_error_errno() == ENOMEM);
1347
            r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
Yoni Fogel's avatar
Yoni Fogel committed
1348
        }
1349
        else r = 0;
Yoni Fogel's avatar
Yoni Fogel committed
1350
    }
1351
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1352
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1353

1354
static void 
1355
env_set_errcall(DB_ENV * env, toku_env_errcall_t errcall) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1356
    env->i->errcall = errcall;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1357
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1358

1359
static void 
1360
env_set_errfile(DB_ENV*env, FILE*errfile) {
1361 1362 1363
    env->i->errfile = errfile;
}

1364
static void 
1365
env_set_errpfx(DB_ENV * env, const char *errpfx) {
1366
    env->i->errpfx = errpfx;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1367
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1368

1369
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1370
env_set_flags(DB_ENV * env, uint32_t flags, int onoff) {
1371
    HANDLE_PANICKED_ENV(env);
Yoni Fogel's avatar
Yoni Fogel committed
1372

Yoni Fogel's avatar
Yoni Fogel committed
1373
    uint32_t change = 0;
Yoni Fogel's avatar
Yoni Fogel committed
1374 1375 1376 1377
    if (flags & DB_AUTO_COMMIT) {
        change |=  DB_AUTO_COMMIT;
        flags  &= ~DB_AUTO_COMMIT;
    }
1378
    if (flags != 0 && onoff) {
1379
        return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support any nonzero ENV flags other than DB_AUTO_COMMIT\n");
1380
    }
Yoni Fogel's avatar
Yoni Fogel committed
1381 1382
    if   (onoff) env->i->open_flags |=  change;
    else         env->i->open_flags &= ~change;
Rich Prohaska's avatar
Rich Prohaska committed
1383
    return 0;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1384
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1385

1386
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1387
env_set_lg_bsize(DB_ENV * env, uint32_t bsize) {
1388
    HANDLE_PANICKED_ENV(env);
1389
    return toku_logger_set_lg_bsize(env->i->logger, bsize);
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1390
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1391

1392
static int 
1393
env_set_lg_dir(DB_ENV * env, const char *dir) {
1394
    HANDLE_PANICKED_ENV(env);
1395
    if (env_opened(env)) {
1396
        return toku_ydb_do_error(env, EINVAL, "Cannot set log dir after opening the env\n");
1397
    }
1398 1399

    if (env->i->lg_dir) toku_free(env->i->lg_dir);
1400 1401
    if (dir) {
        env->i->lg_dir = toku_strdup(dir);
1402
        if (!env->i->lg_dir) {
1403 1404
            return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
        }
1405
    }
1406 1407
    else env->i->lg_dir = NULL;
    return 0;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1408
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1409

1410
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1411
env_set_lg_max(DB_ENV * env, uint32_t lg_max) {
1412
    HANDLE_PANICKED_ENV(env);
1413 1414 1415
    return toku_logger_set_lg_max(env->i->logger, lg_max);
}

1416
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1417
env_get_lg_max(DB_ENV * env, uint32_t *lg_maxp) {
1418 1419
    HANDLE_PANICKED_ENV(env);
    return toku_logger_get_lg_max(env->i->logger, lg_maxp);
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1420
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1421

1422
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1423
env_set_lk_detect(DB_ENV * env, uint32_t UU(detect)) {
1424
    HANDLE_PANICKED_ENV(env);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1425
    return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support set_lk_detect\n");
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1426
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1427

1428
static int 
1429
env_set_lk_max_memory(DB_ENV *env, uint64_t lock_memory_limit) {
1430
    HANDLE_PANICKED_ENV(env);
1431
    int r = 0;
1432 1433 1434
    if (env_opened(env)) {
        r = EINVAL;
    } else {
1435
        r = env->i->ltm.set_max_lock_memory(lock_memory_limit);
1436
    }
1437 1438 1439
    return r;
}

1440
static int 
1441
env_get_lk_max_memory(DB_ENV *env, uint64_t *lk_maxp) {
1442
    HANDLE_PANICKED_ENV(env);
1443 1444 1445
    uint32_t max_lock_memory = env->i->ltm.get_max_lock_memory();
    *lk_maxp = max_lock_memory;
    return 0;
1446 1447
}

Yoni Fogel's avatar
Yoni Fogel committed
1448
//void toku__env_set_noticecall (DB_ENV *env, void (*noticecall)(DB_ENV *, db_notices)) {
Bradley C. Kuszmaul's avatar
Fixup  
Bradley C. Kuszmaul committed
1449 1450
//    env->i->noticecall = noticecall;
//}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1451

1452
static int 
1453
env_set_tmp_dir(DB_ENV * env, const char *tmp_dir) {
1454
    HANDLE_PANICKED_ENV(env);
1455
    if (env_opened(env)) {
1456
        return toku_ydb_do_error(env, EINVAL, "Cannot set the tmp dir after opening an env\n");
1457 1458
    }
    if (!tmp_dir) {
1459
        return toku_ydb_do_error(env, EINVAL, "Tmp dir bust be non-null\n");
1460
    }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1461 1462
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
Yoni Fogel's avatar
Yoni Fogel committed
1463
    env->i->tmp_dir = toku_strdup(tmp_dir);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1464
    return env->i->tmp_dir ? 0 : ENOMEM;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1465
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1466

1467
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1468
env_set_verbose(DB_ENV * env, uint32_t UU(which), int UU(onoff)) {
1469
    HANDLE_PANICKED_ENV(env);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1470
    return 1;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1471
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1472

1473
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1474
toku_env_txn_checkpoint(DB_ENV * env, uint32_t kbyte __attribute__((__unused__)), uint32_t min __attribute__((__unused__)), uint32_t flags __attribute__((__unused__))) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1475 1476
    CHECKPOINTER cp = toku_cachetable_get_checkpointer(env->i->cachetable);
    int r = toku_checkpoint(cp, env->i->logger,
1477 1478 1479
                            checkpoint_callback_f,  checkpoint_callback_extra,
                            checkpoint_callback2_f, checkpoint_callback2_extra,
                            CLIENT_CHECKPOINT);
1480
    if (r) {
1481 1482
        // Panicking the whole environment may be overkill, but I'm not sure what else to do.
        env_panic(env, r, "checkpoint error\n");
1483
        toku_ydb_do_error(env, r, "Checkpoint\n");
1484 1485
    }
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1486 1487
}

1488
static int 
Yoni Fogel's avatar
Yoni Fogel committed
1489
env_txn_stat(DB_ENV * env, DB_TXN_STAT ** UU(statp), uint32_t UU(flags)) {
1490
    HANDLE_PANICKED_ENV(env);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1491
    return 1;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1492 1493
}

1494 1495 1496 1497 1498 1499 1500
//
// We can assume the client calls this function right after recovery 
// to return a list of prepared transactions to the user. When called,
// we can assume that no other work is being done in the system, 
// as we are in the state of being after recovery, 
// but before client operations should commence
//
1501
static int
Yoni Fogel's avatar
Yoni Fogel committed
1502
env_txn_xa_recover (DB_ENV *env, TOKU_XA_XID xids[/*count*/], long count, /*out*/ long *retp, uint32_t flags) {
1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514
    struct tokulogger_preplist *MALLOC_N(count,preps);
    int r = toku_logger_recover_txn(env->i->logger, preps, count, retp, flags);
    if (r==0) {
        assert(*retp<=count);
        for (int i=0; i<*retp; i++) {
            xids[i] = preps[i].xid;
        }
    }
    toku_free(preps);
    return r;
}

1515 1516 1517 1518 1519 1520 1521
//
// We can assume the client calls this function right after recovery 
// to return a list of prepared transactions to the user. When called,
// we can assume that no other work is being done in the system, 
// as we are in the state of being after recovery, 
// but before client operations should commence
//
1522
static int
Yoni Fogel's avatar
Yoni Fogel committed
1523
env_txn_recover (DB_ENV *env, DB_PREPLIST preplist[/*count*/], long count, /*out*/ long *retp, uint32_t flags) {
1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
    struct tokulogger_preplist *MALLOC_N(count,preps);
    int r = toku_logger_recover_txn(env->i->logger, preps, count, retp, flags);
    if (r==0) {
        assert(*retp<=count);
        for (int i=0; i<*retp; i++) {
            preplist[i].txn = preps[i].txn;
            memcpy(preplist[i].gid, preps[i].xid.data, preps[i].xid.gtrid_length + preps[i].xid.bqual_length);
        }
    }
    toku_free(preps);
    return r;
1535 1536 1537
}

static int
1538
env_get_txn_from_xid (DB_ENV *env, /*in*/ TOKU_XA_XID *xid, /*out*/ DB_TXN **txnp) {
1539
    return toku_txn_manager_get_root_txn_from_xid(toku_logger_get_txn_manager(env->i->logger), xid, txnp);
1540 1541
}

1542
static int
Yoni Fogel's avatar
Yoni Fogel committed
1543
env_checkpointing_set_period(DB_ENV * env, uint32_t seconds) {
1544
    HANDLE_PANICKED_ENV(env);
1545 1546 1547 1548 1549 1550
    int r = 0;
    if (!env_opened(env)) {
        r = EINVAL;
    } else {
        toku_set_checkpoint_period(env->i->cachetable, seconds);
    }
1551 1552 1553
    return r;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
1554
static int
Yoni Fogel's avatar
Yoni Fogel committed
1555
env_cleaner_set_period(DB_ENV * env, uint32_t seconds) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1556
    HANDLE_PANICKED_ENV(env);
1557 1558 1559 1560 1561 1562
    int r = 0;
    if (!env_opened(env)) {
        r = EINVAL;
    } else {
        toku_set_cleaner_period(env->i->cachetable, seconds);
    }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1563 1564 1565 1566
    return r;
}

static int
Yoni Fogel's avatar
Yoni Fogel committed
1567
env_cleaner_set_iterations(DB_ENV * env, uint32_t iterations) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1568
    HANDLE_PANICKED_ENV(env);
1569 1570 1571 1572 1573 1574
    int r = 0;
    if (!env_opened(env)) {
        r = EINVAL;
    } else {
        toku_set_cleaner_iterations(env->i->cachetable, iterations);
    }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1575 1576 1577
    return r;
}

1578
static int
1579
env_create_loader(DB_ENV *env,
1580 1581 1582 1583 1584 1585 1586 1587
                  DB_TXN *txn,
                  DB_LOADER **blp,
                  DB *src_db,
                  int N,
                  DB *dbs[],
                  uint32_t db_flags[/*N*/],
                  uint32_t dbt_flags[/*N*/],
                  uint32_t loader_flags) {
1588
    int r = toku_loader_create_loader(env, txn, blp, src_db, N, dbs, db_flags, dbt_flags, loader_flags, true);
1589 1590 1591
    return r;
}

1592
static int
Yoni Fogel's avatar
Yoni Fogel committed
1593
env_checkpointing_get_period(DB_ENV * env, uint32_t *seconds) {
1594 1595 1596 1597
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (!env_opened(env)) r = EINVAL;
    else 
1598
        *seconds = toku_get_checkpoint_period_unlocked(env->i->cachetable);
1599 1600 1601
    return r;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
1602
static int
Yoni Fogel's avatar
Yoni Fogel committed
1603
env_cleaner_get_period(DB_ENV * env, uint32_t *seconds) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1604 1605 1606 1607
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (!env_opened(env)) r = EINVAL;
    else 
1608
        *seconds = toku_get_cleaner_period_unlocked(env->i->cachetable);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1609 1610 1611 1612
    return r;
}

static int
Yoni Fogel's avatar
Yoni Fogel committed
1613
env_cleaner_get_iterations(DB_ENV * env, uint32_t *iterations) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1614 1615 1616 1617 1618 1619 1620 1621
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (!env_opened(env)) r = EINVAL;
    else 
        *iterations = toku_get_cleaner_iterations(env->i->cachetable);
    return r;
}

1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668
// Take the checkpoint-safe client lock (toku_checkpoint_safe_client_lock()).
// The environment must already be open; otherwise EINVAL is returned and no
// lock is taken.
static int
env_checkpointing_postpone(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_lock();
    return 0;
}

// Release the checkpoint-safe client lock (toku_checkpoint_safe_client_unlock()).
// The environment must already be open; otherwise EINVAL is returned and no
// unlock is performed.
static int
env_checkpointing_resume(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_unlock();
    return 0;
}

// Take the multi-operation client lock (toku_multi_operation_client_lock()).
// The environment must already be open; otherwise EINVAL is returned and no
// lock is taken.
static int
env_checkpointing_begin_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_lock();
    return 0;
}

// Release the multi-operation client lock (toku_multi_operation_client_unlock()).
// The environment must already be open; otherwise EINVAL is returned and no
// unlock is performed.
static int
env_checkpointing_end_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_unlock();
    return 0;
}

// Store bt_compare as the environment's key comparison function.
// Only allowed before the environment is opened (EINVAL afterwards).
static int
env_set_default_bt_compare(DB_ENV * env, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->bt_compare = bt_compare;
    return 0;
}

1669 1670 1671 1672 1673
// Store update_function as the environment's update callback.
// No state checks are made here; the pointer is simply recorded.
static void
env_set_update (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra))
{
    env->i->update_function = update_function;
}

1674
static int
1675
env_set_generate_row_callback_for_put(DB_ENV *env, generate_row_for_put_func generate_row_for_put) {
1676 1677 1678 1679
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
1680
        env->i->generate_row_for_put = generate_row_for_put;
1681 1682 1683 1684 1685
    }
    return r;
}

static int
1686 1687 1688 1689 1690 1691 1692 1693 1694
env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_func generate_row_for_del) {
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
        env->i->generate_row_for_del = generate_row_for_del;
    }
    return r;
}
1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707
// Store the redzone setting in the environment.
// Only allowed before the environment is opened (EINVAL afterwards).
static int
env_set_redzone(DB_ENV *env, int redzone) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->redzone = redzone;
    return 0;
}

1708
static int
1709
env_get_lock_timeout(DB_ENV *env, uint64_t *lock_timeout_msec) {
1710
    *lock_timeout_msec = env->i->ltm.get_lock_wait_time();
1711 1712 1713 1714
    return 0;
}

static int
1715
env_set_lock_timeout(DB_ENV *env, uint64_t lock_timeout_msec) {
1716
    env->i->ltm.set_lock_wait_time(lock_timeout_msec);
1717 1718 1719
    return 0;
}

1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
// Write the ctime_r() text form of *timer into buf, with any trailing
// '\n' / '\r' characters removed.
// buf must have room for at least 26 bytes, as ctime_r() requires.
// If ctime_r() fails, buf is set to the empty string instead of being
// read uninitialized.
static void
format_time(const time_t *timer, char *buf) {
    if (ctime_r(timer, buf) == NULL) {
        buf[0] = '\0';
        return;
    }
    size_t len = strlen(buf);
    assert(len < 26);
    // Stop at length 0 rather than asserting, so an all-newline or empty
    // result simply becomes the empty string.
    while (len > 0 && (buf[len-1] == '\n' || buf[len-1] == '\r')) {
        buf[--len] = '\0';
    }
}
1736

1737 1738 1739 1740 1741
////////////////////////////////////////////////////////////////////////////////////////////////
// Local definition of status information from portability layer, which should not include db.h.
// Local status structs are used to concentrate file system information collected from various places
// and memory information collected from memory.c.
//
1742 1743 1744 1745 1746 1747
// Row indexes into fsstat.status[].
typedef enum {
    // ENOSPC (disk full) tracking
    FS_ENOSPC_REDZONE_STATE = 0,  // possible values are enumerated by fs_redzone_state
    FS_ENOSPC_THREADS_BLOCKED,    // how many threads currently blocked on ENOSPC
    FS_ENOSPC_REDZONE_CTR,        // number of operations rejected by enospc prevention (red zone)
    FS_ENOSPC_MOST_RECENT,        // most recent time that file system was completely full
    FS_ENOSPC_COUNT,              // total number of times ENOSPC was returned from an attempt to write
    // fsync accounting
    FS_FSYNC_TIME,                // fsync time
    FS_FSYNC_COUNT,               // fsync count
    FS_LONG_FSYNC_TIME,           // long fsync time
    FS_LONG_FSYNC_COUNT,          // long fsync count
    FS_STATUS_NUM_ROWS,           // must be last
} fs_status_entry;

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
1756
    bool initialized;
1757 1758 1759 1760 1761
    TOKU_ENGINE_STATUS_ROW_S status[FS_STATUS_NUM_ROWS];
} FS_STATUS_S, *FS_STATUS;

static FS_STATUS_S fsstat;

1762
#define FS_STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(fsstat, k, c, t, "filesystem: " l, inc)
1763 1764 1765

static void
fs_status_init(void) {
1766
    FS_STATUS_INIT(FS_ENOSPC_REDZONE_STATE,   nullptr, FS_STATE, "ENOSPC redzone state", TOKU_ENGINE_STATUS);
1767
    FS_STATUS_INIT(FS_ENOSPC_THREADS_BLOCKED, FILESYSTEM_THREADS_BLOCKED_BY_FULL_DISK, UINT64,   "threads currently blocked by full disk", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
1768 1769 1770
    FS_STATUS_INIT(FS_ENOSPC_REDZONE_CTR,     nullptr, UINT64,   "number of operations rejected by enospc prevention (red zone)", TOKU_ENGINE_STATUS);
    FS_STATUS_INIT(FS_ENOSPC_MOST_RECENT,     nullptr, UNIXTIME, "most recent disk full", TOKU_ENGINE_STATUS);
    FS_STATUS_INIT(FS_ENOSPC_COUNT,           nullptr, UINT64,   "number of write operations that returned ENOSPC", TOKU_ENGINE_STATUS);
1771
    FS_STATUS_INIT(FS_FSYNC_TIME,             FILESYSTEM_FSYNC_TIME, UINT64,   "fsync time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
1772
    FS_STATUS_INIT(FS_FSYNC_COUNT,            FILESYSTEM_FSYNC_NUM, UINT64,   "fsync count", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
1773 1774
    FS_STATUS_INIT(FS_LONG_FSYNC_TIME,        FILESYSTEM_LONG_FSYNC_TIME, UINT64,   "long fsync time", TOKU_ENGINE_STATUS);
    FS_STATUS_INIT(FS_LONG_FSYNC_COUNT,       FILESYSTEM_LONG_FSYNC_NUM, UINT64,   "long fsync count", TOKU_ENGINE_STATUS);
1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792
    fsstat.initialized = true;
}
#undef FS_STATUS_INIT

// Refresh the numeric values of fsstat from the portability layer and from
// the environment, and report the current redzone state in *redzone_state.
// The redzone state is FS_BLOCKED whenever any thread is blocked on ENOSPC,
// otherwise it is the environment's own fs_state.
static void
fs_get_status(DB_ENV * env, fs_redzone_state * redzone_state) {
    if (!fsstat.initialized) {
        fs_status_init();
    }
    TOKU_ENGINE_STATUS_ROW_S *const rows = fsstat.status;

    // Disk-full information
    time_t   most_recent_enospc;
    uint64_t threads_blocked, enospc_count;
    toku_fs_get_write_info(&most_recent_enospc, &threads_blocked, &enospc_count);
    if (threads_blocked) {
        rows[FS_ENOSPC_REDZONE_STATE].value.num = FS_BLOCKED;
    } else {
        rows[FS_ENOSPC_REDZONE_STATE].value.num = env->i->fs_state;
    }
    *redzone_state = (fs_redzone_state) rows[FS_ENOSPC_REDZONE_STATE].value.num;
    rows[FS_ENOSPC_THREADS_BLOCKED].value.num = threads_blocked;
    rows[FS_ENOSPC_REDZONE_CTR].value.num = env->i->enospc_redzone_ctr;
    rows[FS_ENOSPC_MOST_RECENT].value.num = most_recent_enospc;
    rows[FS_ENOSPC_COUNT].value.num = enospc_count;

    // fsync information (the long-fsync threshold is fetched but not reported)
    uint64_t n_fsync, t_fsync, long_threshold, n_long_fsync, t_long_fsync;
    toku_get_fsync_times(&n_fsync, &t_fsync, &long_threshold, &n_long_fsync, &t_long_fsync);
    rows[FS_FSYNC_COUNT].value.num = n_fsync;
    rows[FS_FSYNC_TIME].value.num = t_fsync;
    rows[FS_LONG_FSYNC_COUNT].value.num = n_long_fsync;
    rows[FS_LONG_FSYNC_TIME].value.num = t_long_fsync;
}

1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824
// Local status struct used to get information from memory.c
// Row indexes into memory_status.status[]; the per-row notes repeat the
// legends assigned in memory_status_init().
typedef enum {
    MEMORY_MALLOC_COUNT = 0,      // number of malloc operations
    MEMORY_FREE_COUNT,            // number of free operations
    MEMORY_REALLOC_COUNT,         // number of realloc operations
    MEMORY_MALLOC_FAIL,           // number of malloc operations that failed
    MEMORY_REALLOC_FAIL,          // number of realloc operations that failed
    MEMORY_REQUESTED,             // number of bytes requested
    MEMORY_USED,                  // number of bytes used (requested + overhead)
    MEMORY_FREED,                 // number of bytes freed
    MEMORY_MAX_IN_USE,            // estimated maximum memory footprint
    MEMORY_MALLOCATOR_VERSION,    // mallocator version (string-valued row)
    MEMORY_MMAP_THRESHOLD,        // mmap threshold
    MEMORY_STATUS_NUM_ROWS        // row count; sizes the status array, so it must stay last
} memory_status_entry;

typedef struct {
Yoni Fogel's avatar
Yoni Fogel committed
1825
    bool initialized;
1826 1827 1828 1829 1830
    TOKU_ENGINE_STATUS_ROW_S status[MEMORY_STATUS_NUM_ROWS];
} MEMORY_STATUS_S, *MEMORY_STATUS;

static MEMORY_STATUS_S memory_status;

1831
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(memory_status, k, c, t, "memory: " l, inc)
1832 1833 1834 1835 1836

static void
memory_status_init(void) {
    // Note, this function initializes the keyname, type, and legend fields.
    // Value fields are initialized to zero by compiler.
1837 1838 1839 1840 1841 1842 1843 1844
    STATUS_INIT(MEMORY_MALLOC_COUNT,       nullptr, UINT64,  "number of malloc operations", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_FREE_COUNT,         nullptr, UINT64,  "number of free operations", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_REALLOC_COUNT,      nullptr, UINT64,  "number of realloc operations", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_MALLOC_FAIL,        nullptr, UINT64,  "number of malloc operations that failed", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_REALLOC_FAIL,       nullptr, UINT64,  "number of realloc operations that failed" , TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_REQUESTED,          nullptr, UINT64,  "number of bytes requested", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_USED,               nullptr, UINT64,  "number of bytes used (requested + overhead)", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_FREED,              nullptr, UINT64,  "number of bytes freed", TOKU_ENGINE_STATUS);
1845
    STATUS_INIT(MEMORY_MAX_IN_USE,         MEM_ESTIMATED_MAXIMUM_MEMORY_FOOTPRINT, UINT64,  "estimated maximum memory footprint", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
1846 1847
    STATUS_INIT(MEMORY_MALLOCATOR_VERSION, nullptr, CHARSTR, "mallocator version", TOKU_ENGINE_STATUS);
    STATUS_INIT(MEMORY_MMAP_THRESHOLD,     nullptr, UINT64,  "mmap threshold", TOKU_ENGINE_STATUS);
1848 1849 1850 1851 1852 1853 1854 1855 1856
    memory_status.initialized = true;  
}
#undef STATUS_INIT

#define MEMORY_STATUS_VALUE(x) memory_status.status[x].value.num

static void
memory_get_status(void) {
    if (!memory_status.initialized)
1857
        memory_status_init();
1858
    LOCAL_MEMORY_STATUS_S local_memstat;
1859
    toku_memory_get_status(&local_memstat);
1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873
    MEMORY_STATUS_VALUE(MEMORY_MALLOC_COUNT) = local_memstat.malloc_count;
    MEMORY_STATUS_VALUE(MEMORY_FREE_COUNT) = local_memstat.free_count;  
    MEMORY_STATUS_VALUE(MEMORY_REALLOC_COUNT) = local_memstat.realloc_count;
    MEMORY_STATUS_VALUE(MEMORY_MALLOC_FAIL) = local_memstat.malloc_fail;
    MEMORY_STATUS_VALUE(MEMORY_REALLOC_FAIL) = local_memstat.realloc_fail;
    MEMORY_STATUS_VALUE(MEMORY_REQUESTED) = local_memstat.requested; 
    MEMORY_STATUS_VALUE(MEMORY_USED) = local_memstat.used;
    MEMORY_STATUS_VALUE(MEMORY_FREED) = local_memstat.freed;
    MEMORY_STATUS_VALUE(MEMORY_MAX_IN_USE) = local_memstat.max_in_use;
    MEMORY_STATUS_VALUE(MEMORY_MMAP_THRESHOLD) = local_memstat.mmap_threshold;
    memory_status.status[MEMORY_MALLOCATOR_VERSION].value.str = local_memstat.mallocator_version;
}
#undef MEMORY_STATUS_VALUE

1874 1875 1876 1877 1878
// How many rows are in engine status?
// Stores in *num_rowsp the sum of the row counts of every status table that
// env_get_engine_status() reports.  The env argument is unused.
// Always returns 0.
static int
env_get_engine_status_num_rows (DB_ENV * UU(env), uint64_t * num_rowsp) {
    uint64_t total = (uint64_t) YDB_LAYER_STATUS_NUM_ROWS
                   + YDB_C_LAYER_STATUS_NUM_ROWS
                   + YDB_WRITE_LAYER_STATUS_NUM_ROWS
                   + LE_STATUS_NUM_ROWS
                   + CP_STATUS_NUM_ROWS
                   + CT_STATUS_NUM_ROWS
                   + LTM_STATUS_NUM_ROWS
                   + FT_STATUS_NUM_ROWS
                   + FT_FLUSHER_STATUS_NUM_ROWS
                   + FT_HOT_STATUS_NUM_ROWS
                   + TXN_STATUS_NUM_ROWS
                   + LOGGER_STATUS_NUM_ROWS
                   + MEMORY_STATUS_NUM_ROWS
                   + FS_STATUS_NUM_ROWS
                   + INDEXER_STATUS_NUM_ROWS
                   + LOADER_STATUS_NUM_ROWS;
#if 0
    // enable when upgrade is supported
    total += FT_UPGRADE_STATUS_NUM_ROWS;
    total += PERSISTENT_UPGRADE_STATUS_NUM_ROWS;
#endif
    *num_rowsp = total;
    return 0;
}

1903 1904
// Do not take ydb lock or any other lock around or in this function.  
// If the engine is blocked because some thread is holding a lock, this function
1905 1906 1907
// can help diagnose the problem.
// This function only collects information, and it does not matter if something gets garbled
// because of a race condition.  
1908
// Note, engine status is still collected even if the environment or logger is panicked
1909
static int
1910
env_get_engine_status (DB_ENV * env, TOKU_ENGINE_STATUS_ROW engstat, uint64_t maxrows,  uint64_t *num_rows, fs_redzone_state* redzone_state, uint64_t * env_panicp, char * env_panic_string_buf, int env_panic_string_length, toku_engine_status_include_type include_flags) {
1911
    int r;
1912

1913
    if (env_panic_string_buf) {
1914 1915 1916 1917 1918 1919
        if (env && env->i && env->i->is_panicked && env->i->panic_string) {
            strncpy(env_panic_string_buf, env->i->panic_string, env_panic_string_length);
            env_panic_string_buf[env_panic_string_length - 1] = '\0';  // just in case
        }
        else 
            *env_panic_string_buf = '\0';
1920 1921
    }

1922 1923 1924 1925 1926
    if ( !(env)     ||
         !(env->i)  ||
         !(env_opened(env)) ||
         !num_rows ||
         !include_flags)
1927
        r = EINVAL;
1928
    else {
1929 1930
        r = 0;
        uint64_t row = 0;  // which row to fill next
1931 1932
        *env_panicp = env->i->is_panicked;

1933 1934
        {
            YDB_LAYER_STATUS_S ydb_stat;
1935
            ydb_layer_get_status(env, &ydb_stat);
1936
            for (int i = 0; i < YDB_LAYER_STATUS_NUM_ROWS && row < maxrows; i++) {
1937 1938 1939
                if (ydb_stat.status[i].include & include_flags) {
                    engstat[row++] = ydb_stat.status[i];
                }
1940 1941
            }
        }
1942 1943 1944 1945
        {
            YDB_C_LAYER_STATUS_S ydb_c_stat;
            ydb_c_layer_get_status(&ydb_c_stat);
            for (int i = 0; i < YDB_C_LAYER_STATUS_NUM_ROWS && row < maxrows; i++) {
1946 1947 1948
                if (ydb_c_stat.status[i].include & include_flags) {
                    engstat[row++] = ydb_c_stat.status[i];
                }
1949
            }
1950
        }
1951 1952 1953 1954
        {
            YDB_WRITE_LAYER_STATUS_S ydb_write_stat;
            ydb_write_layer_get_status(&ydb_write_stat);
            for (int i = 0; i < YDB_WRITE_LAYER_STATUS_NUM_ROWS && row < maxrows; i++) {
1955 1956 1957
                if (ydb_write_stat.status[i].include & include_flags) {
                    engstat[row++] = ydb_write_stat.status[i];
                }
1958
            }
1959 1960 1961 1962 1963
        }
        {
            LE_STATUS_S lestat;                    // Rice's vampire
            toku_le_get_status(&lestat);
            for (int i = 0; i < LE_STATUS_NUM_ROWS && row < maxrows; i++) {
1964 1965 1966
                if (lestat.status[i].include & include_flags) {
                    engstat[row++] = lestat.status[i];
                }
1967 1968 1969
            }
        }
        {
1970
            CHECKPOINT_STATUS_S cpstat;
1971
            toku_checkpoint_get_status(env->i->cachetable, &cpstat);
1972
            for (int i = 0; i < CP_STATUS_NUM_ROWS && row < maxrows; i++) {
1973 1974 1975
                if (cpstat.status[i].include & include_flags) {
                    engstat[row++] = cpstat.status[i];
                }
1976 1977 1978 1979 1980 1981
            }
        }
        {
            CACHETABLE_STATUS_S ctstat;
            toku_cachetable_get_status(env->i->cachetable, &ctstat);
            for (int i = 0; i < CT_STATUS_NUM_ROWS && row < maxrows; i++) {
1982 1983 1984
                if (ctstat.status[i].include & include_flags) {
                    engstat[row++] = ctstat.status[i];
                }
1985 1986
            }
        }
1987 1988 1989 1990
        {
            LTM_STATUS_S ltmstat;
            env->i->ltm.get_status(&ltmstat);
            for (int i = 0; i < LTM_STATUS_NUM_ROWS && row < maxrows; i++) {
1991 1992 1993
                if (ltmstat.status[i].include & include_flags) {
                    engstat[row++] = ltmstat.status[i];
                }
1994 1995
            }
        }
1996
        {
1997 1998 1999
            FT_STATUS_S ftstat;
            toku_ft_get_status(&ftstat);
            for (int i = 0; i < FT_STATUS_NUM_ROWS && row < maxrows; i++) {
2000 2001 2002
                if (ftstat.status[i].include & include_flags) {
                    engstat[row++] = ftstat.status[i];
                }
2003
            }
Leif Walsh's avatar
Leif Walsh committed
2004 2005
        }
        {
2006 2007 2008
            FT_FLUSHER_STATUS_S flusherstat;
            toku_ft_flusher_get_status(&flusherstat);
            for (int i = 0; i < FT_FLUSHER_STATUS_NUM_ROWS && row < maxrows; i++) {
2009 2010 2011
                if (flusherstat.status[i].include & include_flags) {
                    engstat[row++] = flusherstat.status[i];
                }
2012
            }
Leif Walsh's avatar
Leif Walsh committed
2013
        }
2014
        {
2015 2016 2017
            FT_HOT_STATUS_S hotstat;
            toku_ft_hot_get_status(&hotstat);
            for (int i = 0; i < FT_HOT_STATUS_NUM_ROWS && row < maxrows; i++) {
2018 2019 2020
                if (hotstat.status[i].include & include_flags) {
                    engstat[row++] = hotstat.status[i];
                }
2021 2022 2023 2024
            }
        }
        {
            TXN_STATUS_S txnstat;
2025
            toku_txn_get_status(&txnstat);
2026
            for (int i = 0; i < TXN_STATUS_NUM_ROWS && row < maxrows; i++) {
2027 2028 2029
                if (txnstat.status[i].include & include_flags) {
                    engstat[row++] = txnstat.status[i];
                }
2030 2031 2032 2033 2034 2035
            }
        }
        {
            LOGGER_STATUS_S loggerstat;
            toku_logger_get_status(env->i->logger, &loggerstat);
            for (int i = 0; i < LOGGER_STATUS_NUM_ROWS && row < maxrows; i++) {
2036 2037 2038
                if (loggerstat.status[i].include & include_flags) {
                    engstat[row++] = loggerstat.status[i];
                }
2039 2040 2041 2042 2043 2044 2045
            }
        }

        {
            INDEXER_STATUS_S indexerstat;
            toku_indexer_get_status(&indexerstat);
            for (int i = 0; i < INDEXER_STATUS_NUM_ROWS && row < maxrows; i++) {
2046 2047 2048
                if (indexerstat.status[i].include & include_flags) {
                    engstat[row++] = indexerstat.status[i];
                }
2049 2050 2051 2052 2053 2054
            }
        }
        {
            LOADER_STATUS_S loaderstat;
            toku_loader_get_status(&loaderstat);
            for (int i = 0; i < LOADER_STATUS_NUM_ROWS && row < maxrows; i++) {
2055 2056 2057
                if (loaderstat.status[i].include & include_flags) {
                    engstat[row++] = loaderstat.status[i];
                }
2058 2059 2060
            }
        }

2061
        {
2062
            // memory_status is local to this file
2063
            memory_get_status();
2064
            for (int i = 0; i < MEMORY_STATUS_NUM_ROWS && row < maxrows; i++) {
2065 2066 2067
                if (memory_status.status[i].include & include_flags) {
                    engstat[row++] = memory_status.status[i];
                }
2068
            }
2069 2070
        }
        {
2071 2072 2073 2074
            // Note, fs_get_status() and the fsstat structure are local to this file because they
            // are used to concentrate file system information collected from various places.
            fs_get_status(env, redzone_state);
            for (int i = 0; i < FS_STATUS_NUM_ROWS && row < maxrows; i++) {
2075 2076 2077
                if (fsstat.status[i].include & include_flags) {
                    engstat[row++] = fsstat.status[i];
                }
2078
            }
2079
        }
2080 2081
#if 0
        // enable when upgrade is supported
2082
        {
2083
            for (int i = 0; i < PERSISTENT_UPGRADE_STATUS_NUM_ROWS && row < maxrows; i++) {
2084 2085 2086
                if (persistent_upgrade_status.status[i].include & include_flags) {
                    engstat[row++] = persistent_upgrade_status.status[i];
                }
2087
            }
2088
            FT_UPGRADE_STATUS_S ft_upgradestat;
2089
            toku_ft_upgrade_get_status(&ft_upgradestat);
2090
            for (int i = 0; i < FT_UPGRADE_STATUS_NUM_ROWS && row < maxrows; i++) {
2091 2092 2093
                if (ft_upgradestat.status[i].include & include_flags) {
                    engstat[row++] = ft_upgradestat.status[i];
                }
2094 2095
            }

2096
        }
2097
#endif
2098 2099 2100
        if (r==0) {
            *num_rows = row;
        }
2101 2102 2103 2104
    }
    return r;
}

2105
// Fill buff with text description of engine status up to bufsiz bytes.
2106 2107
// Intended for use by test programs that do not have the handlerton available,
// and for use by toku_assert logic to print diagnostic info on crash.
2108 2109
static int
env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
2110
    uint32_t stringsize = 1024;
2111
    uint64_t panic;
2112
    char panicstring[stringsize];
2113
    int n = 0;  // number of characters printed so far
2114
    uint64_t num_rows;
2115
    uint64_t max_rows;
2116
    fs_redzone_state redzone_state;
2117

2118 2119
    n = snprintf(buff, bufsiz - n, "BUILD_ID = %d\n", BUILD_ID);

2120 2121 2122 2123
    (void) env_get_engine_status_num_rows (env, &max_rows);
    TOKU_ENGINE_STATUS_ROW_S mystat[max_rows];
    int r = env->get_engine_status (env, mystat, max_rows, &num_rows, &redzone_state, &panic, panicstring, stringsize, TOKU_ENGINE_STATUS);

2124
    if (r) {
2125
        n += snprintf(buff + n, bufsiz - n, "Engine status not available: ");
2126
        if (!env) {
2127
            n += snprintf(buff + n, bufsiz - n, "no environment\n");
2128 2129
        }
        else if (!(env->i)) {
2130
            n += snprintf(buff + n, bufsiz - n, "environment internal struct is null\n");
2131 2132 2133 2134
        }
        else if (!env_opened(env)) {
            n += snprintf(buff + n, bufsiz - n, "environment is not open\n");
        }
2135 2136
    }
    else {
2137
        if (panic) {
2138
            n += snprintf(buff + n, bufsiz - n, "Env panic code: %" PRIu64 "\n", panic);
2139 2140 2141 2142 2143 2144 2145 2146 2147 2148
            if (strlen(panicstring)) {
                invariant(strlen(panicstring) <= stringsize);
                n += snprintf(buff + n, bufsiz - n, "Env panic string: %s\n", panicstring);
            }
        }

        for (uint64_t row = 0; row < num_rows; row++) {
            n += snprintf(buff + n, bufsiz - n, "%s: ", mystat[row].legend);
            switch (mystat[row].type) {
            case FS_STATE:
2149
                n += snprintf(buff + n, bufsiz - n, "%" PRIu64 "\n", mystat[row].value.num);
2150 2151
                break;
            case UINT64:
2152
                n += snprintf(buff + n, bufsiz - n, "%" PRIu64 "\n", mystat[row].value.num);
2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169
                break;
            case CHARSTR:
                n += snprintf(buff + n, bufsiz - n, "%s\n", mystat[row].value.str);
                break;
            case UNIXTIME:
                {
                    char tbuf[26];
                    format_time((time_t*)&mystat[row].value.num, tbuf);
                    n += snprintf(buff + n, bufsiz - n, "%s\n", tbuf);
                }
                break;
            case TOKUTIME:
                {
                    double t = tokutime_to_seconds(mystat[row].value.num);
                    n += snprintf(buff + n, bufsiz - n, "%.6f\n", t);
                }
                break;
2170 2171 2172 2173 2174
            case PARCOUNT:
                {
                    uint64_t v = read_partitioned_counter(mystat[row].value.parcount);
                    n += snprintf(buff + n, bufsiz - n, "%" PRIu64 "\n", v);
                }
2175
                break;
2176 2177 2178 2179 2180
            default:
                n += snprintf(buff + n, bufsiz - n, "UNKNOWN STATUS TYPE: %d\n", mystat[row].type);
                break;                
            }
        }
2181
    }
2182
        
2183
    if (n > bufsiz) {
2184
        const char * errmsg = "BUFFER TOO SMALL\n";
2185 2186
        int len = strlen(errmsg) + 1;
        (void) snprintf(buff + (bufsiz - 1) - len, len, "%s", errmsg);
2187 2188 2189 2190 2191
    }

    return r;
}

2192 2193 2194 2195
// intended for use by toku_assert logic, when env is not known
static int 
toku_maybe_get_engine_status_text (char * buff, int buffsize) {
    DB_ENV * env = most_recent_env;
2196
    int r;
2197
    if (engine_status_enable && env != NULL) {
2198
        r = env_get_engine_status_text(env, buff, buffsize);
2199 2200
    }
    else {
2201
        r = EOPNOTSUPP;
2202
        snprintf(buff, buffsize, "Engine status not available: disabled by user.  This should only happen in test programs.\n");
2203
    }
2204 2205 2206
    return r;
}

2207 2208 2209
// Set panic code and panic string if not already panicked,
// intended for use by toku_assert when about to abort().
static void 
2210
toku_maybe_set_env_panic(int code, const char * msg) {
2211
    if (code == 0) 
2212
        code = -1;
2213
    if (msg == NULL)
2214
        msg = "Unknown cause from abort (failed assert)\n";
2215
    env_is_panicked = code;  // disable library destructor no matter what
2216
    DB_ENV * env = most_recent_env;
2217
    if (env && 
2218 2219 2220
        env->i &&
        (env->i->is_panicked == 0)) {
        env_panic(env, code, msg);
2221 2222
    }
}
2223

2224 2225 2226 2227 2228 2229 2230
// handlerton's call to fractal tree layer on failed assert in handlerton
// Forwards the failure description (message, function, file, line and the
// caller's errno) to toku_do_assert_fail().  The db_env argument is wrapped
// in UU() and is never referenced in the body.
static int 
env_crash(DB_ENV * UU(db_env), const char* msg, const char * fun, const char* file, int line, int caller_errno) {
    toku_do_assert_fail(msg, fun, file, line, caller_errno);
    return -1;  // placate compiler
}

2231 2232 2233 2234 2235 2236 2237 2238
// Open a cursor (through toku_db_cursor, flags 0) on env->i->persistent_environment.
// Returns EINVAL when env_opened(env) is false.
static int
env_get_cursor_for_persistent_environment(DB_ENV* env, DB_TXN* txn, DBC** c) {
    if (env_opened(env)) {
        return toku_db_cursor(env->i->persistent_environment, txn, c, 0);
    }
    return EINVAL;
}

2239 2240 2241 2242 2243 2244 2245 2246
// Open a cursor (through toku_db_cursor, flags 0) on env->i->directory.
// Returns EINVAL when env_opened(env) is false.
static int
env_get_cursor_for_directory(DB_ENV* env, DB_TXN* txn, DBC** c) {
    if (env_opened(env)) {
        return toku_db_cursor(env->i->directory, txn, c, 0);
    }
    return EINVAL;
}

2247
static int 
Yoni Fogel's avatar
Yoni Fogel committed
2248
toku_env_create(DB_ENV ** envp, uint32_t flags) {
Yoni Fogel's avatar
Yoni Fogel committed
2249 2250 2251 2252 2253 2254
    int r = ENOSYS;
    DB_ENV* result = NULL;

    if (flags!=0)    { r = EINVAL; goto cleanup; }
    MALLOC(result);
    if (result == 0) { r = ENOMEM; goto cleanup; }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2255
    memset(result, 0, sizeof *result);
2256 2257

    // locked methods
2258
    result->err = (void (*)(const DB_ENV * env, int error, const char *fmt, ...)) toku_env_err;
2259 2260 2261 2262
#define SENV(name) result->name = locked_env_ ## name
    SENV(dbremove);
    SENV(dbrename);
    //SENV(set_noticecall);
2263
#undef SENV
2264
#define USENV(name) result->name = env_ ## name
2265
    // methods with locking done internally
2266 2267 2268 2269 2270 2271
    USENV(put_multiple);
    USENV(del_multiple);
    USENV(update_multiple);
    // unlocked methods
    USENV(open);
    USENV(close);
2272 2273 2274 2275
    USENV(set_default_bt_compare);
    USENV(set_update);
    USENV(set_generate_row_callback_for_put);
    USENV(set_generate_row_callback_for_del);
2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286
    USENV(set_lg_bsize);
    USENV(set_lg_dir);
    USENV(set_lg_max);
    USENV(get_lg_max);
    USENV(set_lk_max_memory);
    USENV(get_lk_max_memory);
    USENV(get_iname);
    USENV(set_errcall);
    USENV(set_errfile);
    USENV(set_errpfx);
    USENV(set_data_dir);
2287 2288 2289 2290 2291 2292 2293
    USENV(checkpointing_set_period);
    USENV(checkpointing_get_period);
    USENV(cleaner_set_period);
    USENV(cleaner_get_period);
    USENV(cleaner_set_iterations);
    USENV(cleaner_get_iterations);
    USENV(set_cachesize);
2294 2295 2296 2297 2298 2299 2300 2301 2302 2303
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3
    USENV(get_cachesize);
#endif
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
    USENV(set_lk_max);
#endif
    USENV(set_lk_detect);
    USENV(set_flags);
    USENV(set_tmp_dir);
    USENV(set_verbose);
2304 2305 2306 2307 2308 2309 2310
    USENV(txn_recover);
    USENV(txn_xa_recover);
    USENV(get_txn_from_xid);
    USENV(txn_stat);
    USENV(get_lock_timeout);
    USENV(set_lock_timeout);
    USENV(set_redzone);
2311
    USENV(log_flush);
2312
    USENV(log_archive);
2313
    USENV(create_loader);
2314
    USENV(get_cursor_for_persistent_environment);
2315
    USENV(get_cursor_for_directory);
2316
    USENV(change_fsync_log_period);
2317
#undef USENV
2318
    
2319
    // unlocked methods
2320
    result->create_indexer = toku_indexer_create_indexer;
2321 2322 2323 2324 2325 2326 2327 2328 2329
    result->txn_checkpoint = toku_env_txn_checkpoint;
    result->checkpointing_postpone = env_checkpointing_postpone;
    result->checkpointing_resume = env_checkpointing_resume;
    result->checkpointing_begin_atomic_operation = env_checkpointing_begin_atomic_operation;
    result->checkpointing_end_atomic_operation = env_checkpointing_end_atomic_operation;
    result->get_engine_status_num_rows = env_get_engine_status_num_rows;
    result->get_engine_status = env_get_engine_status;
    result->get_engine_status_text = env_get_engine_status_text;
    result->crash = env_crash;  // handlerton's call to fractal tree layer on failed assert
2330
    result->txn_begin = toku_txn_begin;
2331

2332
    MALLOC(result->i);
Yoni Fogel's avatar
Yoni Fogel committed
2333
    if (result->i == 0) { r = ENOMEM; goto cleanup; }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2334
    memset(result->i, 0, sizeof *result->i);
Yoni Fogel's avatar
Yoni Fogel committed
2335 2336 2337 2338
    result->i->envdir_lockfd  = -1;
    result->i->datadir_lockfd = -1;
    result->i->logdir_lockfd  = -1;
    result->i->tmpdir_lockfd  = -1;
2339
    env_fs_init(result);
2340
    env_fsync_log_init(result);
Yoni Fogel's avatar
Yoni Fogel committed
2341

2342 2343 2344
    result->i->bt_compare = toku_builtin_compare_fun;

    r = toku_logger_create(&result->i->logger);
2345
    if (r!=0) goto cleanup; // In particular, logger_create can return the huge page error.
2346 2347
    assert(result->i->logger);

2348 2349 2350 2351
    // Create the locktree manager, passing in the create/destroy/escalate callbacks.
    // The extra parameter for escalation is simply a pointer to this environment.
    // The escalate callback will need it to translate txnids to DB_TXNs
    result->i->ltm.create(toku_db_lt_on_create_callback, toku_db_lt_on_destroy_callback, toku_db_txn_escalate_callback, result);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2352

2353
    r = toku_omt_create(&result->i->open_dbs);
2354
    toku_mutex_init(&result->i->open_dbs_lock, NULL);
2355 2356
    assert_zero(r);
    assert(result->i->open_dbs);
2357

Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2358
    *envp = result;
Yoni Fogel's avatar
Yoni Fogel committed
2359
    r = 0;
2360
    toku_sync_fetch_and_add(&tokudb_num_envs, 1);
Yoni Fogel's avatar
Yoni Fogel committed
2361 2362 2363
cleanup:
    if (r!=0) {
        if (result) {
2364
            toku_free(result->i);
Yoni Fogel's avatar
Yoni Fogel committed
2365 2366 2367 2368
            toku_free(result);
        }
    }
    return r;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2369 2370
}

2371
int 
Yoni Fogel's avatar
Yoni Fogel committed
2372
DB_ENV_CREATE_FUN (DB_ENV ** envp, uint32_t flags) {
2373 2374
    int r = toku_env_create(envp, flags); 
    return r;
2375 2376
}

2377 2378 2379 2380 2381
// OMT comparison callback: orders DB handles by dname first and, for equal
// dnames, by the handle's address.
// return 0 if v and dbv refer to same db (including same dname)
// return <0 if v is earlier in omt than dbv
// return >0 if v is later in omt than dbv
static int
find_db_by_db (OMTVALUE v, void *dbv) {
    DB * const stored = (DB *) v;      // DB* that is stored in the omt
    DB * const wanted = (DB *) dbv;    // extra, to be compared to v

    const int by_name = strcmp(stored->i->dname, wanted->i->dname);
    if (by_name != 0) {
        return by_name;
    }
    // same dname: fall back to address order (-1, 0 or 1)
    return (stored > wanted) - (stored < wanted);
}

// Tell env that there is a new db handle (with non-unique dname in db->i-dname)
2395
void
2396
env_note_db_opened(DB_ENV *env, DB *db) {
2397 2398
    toku_mutex_lock(&env->i->open_dbs_lock);
    assert(db->i->dname); // internal (non-user) dictionary has no dname
2399 2400 2401
    int r;
    OMTVALUE dbv;
    uint32_t idx;
2402
    STATUS_VALUE(YDB_LAYER_NUM_OPEN_DBS) = toku_omt_size(env->i->open_dbs);
2403 2404
    STATUS_VALUE(YDB_LAYER_NUM_DB_OPEN)++;
    if (STATUS_VALUE(YDB_LAYER_NUM_OPEN_DBS) > STATUS_VALUE(YDB_LAYER_MAX_OPEN_DBS))
2405
        STATUS_VALUE(YDB_LAYER_MAX_OPEN_DBS) = STATUS_VALUE(YDB_LAYER_NUM_OPEN_DBS);
2406
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
2407 2408
    assert(r==DB_NOTFOUND); //Must not already be there.
    r = toku_omt_insert_at(env->i->open_dbs, db, idx);
2409
    assert_zero(r);
2410
    toku_mutex_unlock(&env->i->open_dbs_lock);
2411 2412
}

2413
// Effect: Tell the DB_ENV that the DB is no longer in use by the user of the API.  The DB may still be in use by the fractal tree internals.
2414 2415 2416 2417 2418
void
env_note_db_closed(DB_ENV *env, DB *db) {
    toku_mutex_lock(&env->i->open_dbs_lock);
    assert(db->i->dname); // internal (non-user) dictionary has no dname
    assert(toku_omt_size(env->i->open_dbs) > 0);
2419 2420 2421
    int r;
    OMTVALUE dbv;
    uint32_t idx;
2422
    STATUS_VALUE(YDB_LAYER_NUM_DB_CLOSE)++;
2423
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
2424
    assert_zero(r); //Must already be there.
2425 2426
    assert((DB*)dbv == db);
    r = toku_omt_delete_at(env->i->open_dbs, idx);
2427
    STATUS_VALUE(YDB_LAYER_NUM_OPEN_DBS) = toku_omt_size(env->i->open_dbs);
2428
    assert_zero(r);
2429
    toku_mutex_unlock(&env->i->open_dbs_lock);
2430 2431 2432 2433
}

// OMT comparison callback: compares the dname of the stored DB (v) with the
// C string passed as the extra argument; returns the strcmp() result.
static int
find_open_db_by_dname (OMTVALUE v, void *dnamev) {
    DB * const stored = (DB *) v;                   // DB* that is stored in the omt
    const char * const wanted = (char *) dnamev;    // dname being searched for
    return strcmp(stored->i->dname, wanted);
}

// return true if there is any db open with the given dname
Yoni Fogel's avatar
Yoni Fogel committed
2443
static bool
2444 2445
env_is_db_with_dname_open(DB_ENV *env, const char *dname) {
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
2446
    bool rval;
2447 2448
    OMTVALUE dbv;
    uint32_t idx;
2449
    toku_mutex_lock(&env->i->open_dbs_lock);
2450
    r = toku_omt_find_zero(env->i->open_dbs, find_open_db_by_dname, (void*)dname, &dbv, &idx);
2451
    if (r==0) {
2452
        DB *db = (DB *) dbv;
2453
        assert(strcmp(dname, db->i->dname) == 0);
Yoni Fogel's avatar
Yoni Fogel committed
2454
        rval = true;
2455 2456 2457
    }
    else {
        assert(r==DB_NOTFOUND);
Yoni Fogel's avatar
Yoni Fogel committed
2458
        rval = false;
2459
    }
2460
    toku_mutex_unlock(&env->i->open_dbs_lock);
2461 2462 2463
    return rval;
}

2464 2465 2466 2467 2468 2469 2470 2471 2472 2473
//We do not (yet?) support deleting subdbs by deleting the enclosing 'fname'
// Builds the dname "fname/dbname" and removes it through env_dbremove().
// Returns EINVAL when fname or dbname is NULL.
static int
env_dbremove_subdb(DB_ENV * env, DB_TXN * txn, const char *fname, const char *dbname, int32_t flags) {
    if (fname == NULL || dbname == NULL) {
        return EINVAL;
    }
    // sizeof("/") is 2, which covers both the separator and the terminating NUL
    char full_dname[strlen(fname) + sizeof("/") + strlen(dbname)];
    const int written = snprintf(full_dname, sizeof(full_dname), "%s/%s", fname, dbname);
    assert(written==(int)sizeof(full_dname)-1);
    return env_dbremove(env, txn, full_dname, NULL, flags);
}

2479 2480 2481 2482 2483 2484
// see if we can acquire a table lock for the given dname.
// requires: write lock on dname in the directory. dictionary
//          open, close, and begin checkpoint cannot occur.
// returns: true if we could open, lock, and close a dictionary
//          with the given dname, false otherwise.
static bool
2485
can_acquire_table_lock(DB_ENV *env, DB_TXN *txn, const char *iname_in_env) {
2486 2487 2488 2489 2490 2491
    int r;
    bool got_lock = false;
    DB *db;

    r = toku_db_create(&db, env, 0);
    assert_zero(r);
2492
    r = toku_db_open_iname(db, txn, iname_in_env, 0, 0);
2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505
    assert_zero(r);
    r = toku_db_pre_acquire_table_lock(db, txn);
    if (r == 0) {
        got_lock = true;
    } else {
        got_lock = false;
    }
    r = toku_db_close(db);
    assert_zero(r);

    return got_lock;
}

2506 2507
// Remove the dictionary named fname (or, when dbname is non-NULL, the
// sub-dictionary "fname/dbname") from the environment's directory and unlink
// its underlying fractal tree.
//
// Returns:
//   EINVAL              env is not open, flags != 0, fname is NULL, or a
//                       handle with this dname is currently open
//   ENOENT              the dname is not present in the directory
//   DB_LOCK_NOTGRANTED  a txn was given and the table lock could not be taken
//   other non-zero      error from the directory get/del
//   0                   success (with a txn the ft is unlinked on commit,
//                       without one it is unlinked immediately)
// HANDLE_PANICKED_ENV / HANDLE_READ_ONLY_TXN may also return early; their
// definitions are outside this section.
static int
env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, uint32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env) || flags != 0) {
        return EINVAL;
    }
    HANDLE_READ_ONLY_TXN(txn);
    if (dbname != NULL) {
        // env_dbremove_subdb() converts (fname, dbname) to dname
        return env_dbremove_subdb(env, txn, fname, dbname, flags);
    }
    if (fname == NULL) {
        // A NULL name would be handed to strcmp()/strlen() below; reject it
        // the same way env_dbremove_subdb() rejects NULL names.
        return EINVAL;
    }

    const char * dname = fname;
    assert(dbname == NULL);

    // We check for an open db here as a "fast path" to error.
    // We'll need to check again below to be sure.
    if (env_is_db_with_dname_open(env, dname)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");
    }

    DBT dname_dbt;
    DBT iname_dbt;
    toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
    toku_init_dbt_flags(&iname_dbt, DB_DBT_REALLOC);

    // get iname
    r = toku_db_get(env->i->directory, txn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
    char *iname = (char *) iname_dbt.data;
    DB *db = NULL;
    if (r != 0) {
        if (r == DB_NOTFOUND) {
            r = ENOENT;
        }
        goto exit;
    }
    // remove (dname,iname) from directory
    r = toku_db_del(env->i->directory, txn, &dname_dbt, DB_DELETE_ANY, true);
    if (r != 0) {
        goto exit;
    }
    // open an internal handle on the iname; it is closed at exit
    r = toku_db_create(&db, env, 0);
    lazy_assert_zero(r);
    r = toku_db_open_iname(db, txn, iname, 0, 0);
    lazy_assert_zero(r);
    if (txn) {
        // Now that we have a writelock on dname, verify that there are still no handles open. (to prevent race conditions)
        if (env_is_db_with_dname_open(env, dname)) {
            r = toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");
            goto exit;
        }
        // we know a live db handle does not exist.
        //
        // use the internally opened db to try and get a table lock
        //
        // if we can't get it, then some txn needs the ft and we
        // should return lock not granted.
        //
        // otherwise, we're okay in marking this ft as remove on
        // commit. no new handles can open for this dictionary
        // because the txn has directory write locks on the dname
        r = toku_db_pre_acquire_table_lock(db, txn);
        if (r != 0) {
            r = DB_LOCK_NOTGRANTED;
            goto exit;
        }
        // The ft will be unlinked when the txn commits
        toku_ft_unlink_on_commit(db->i->ft_handle, db_txn_struct_i(txn)->tokutxn);
    }
    else {
        // unlink the ft without a txn
        toku_ft_unlink(db->i->ft_handle);
    }

exit:
    if (db) {
        int ret = toku_db_close(db);
        assert(ret == 0);
    }
    if (iname) {
        toku_free(iname);
    }
    return r;
}

2592
static int
Yoni Fogel's avatar
Yoni Fogel committed
2593
env_dbrename_subdb(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, uint32_t flags) {
2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607
    int r;
    if (!fname || !dbname || !newname) r = EINVAL;
    else {
        char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        {
            int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
            assert(bytes==(int)sizeof(subdb_full_name)-1);
        }
        char new_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        {
            int bytes = snprintf(new_full_name, sizeof(new_full_name), "%s/%s", fname, dbname);
            assert(bytes==(int)sizeof(new_full_name)-1);
        }
        const char *null_subdbname = NULL;
2608
        r = env_dbrename(env, txn, subdb_full_name, null_subdbname, new_full_name, flags);
2609
    }
2610 2611 2612
    return r;
}

2613 2614
// Rename the dictionary fname (or, when dbname is non-NULL, the
// sub-dictionary "fname/dbname") to newname by rewriting its entry in the
// environment's directory.  Nothing is done at the ft or cachetable layer.
//
// Returns:
//   EINVAL              env is not open, flags != 0, fname or newname is NULL,
//                       or a handle is open on the old or the new dname
//   ENOENT              the old dname is not present in the directory
//   EEXIST              the new dname is already present in the directory
//   DB_LOCK_NOTGRANTED  a txn was given and the table lock could not be taken
//   other non-zero      error from the directory get/del/put
//   0                   success
// HANDLE_PANICKED_ENV / HANDLE_READ_ONLY_TXN may also return early; their
// definitions are outside this section.
static int
env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, uint32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env) || flags != 0) {
        return EINVAL;
    }
    HANDLE_READ_ONLY_TXN(txn);
    if (dbname != NULL) {
        // env_dbrename_subdb() converts (fname, dbname) to dname and (fname, newname) to newdname
        return env_dbrename_subdb(env, txn, fname, dbname, newname, flags);
    }
    if (fname == NULL || newname == NULL) {
        // NULL names would be handed to strcmp()/strlen() below; reject them
        // the same way env_dbrename_subdb() rejects NULL names.
        return EINVAL;
    }

    const char * dname = fname;
    assert(dbname == NULL);

    // We check for open dnames for the old and new name as a "fast path" to error.
    // We will need to check these again later.
    if (env_is_db_with_dname_open(env, dname)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
    }
    if (env_is_db_with_dname_open(env, newname)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
    }

    DBT old_dname_dbt;
    DBT new_dname_dbt;
    DBT iname_dbt;
    toku_fill_dbt(&old_dname_dbt, dname, strlen(dname)+1);
    toku_fill_dbt(&new_dname_dbt, newname, strlen(newname)+1);
    toku_init_dbt_flags(&iname_dbt, DB_DBT_REALLOC);

    // get iname
    r = toku_db_get(env->i->directory, txn, &old_dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
    char *iname = (char *) iname_dbt.data;
    if (r == DB_NOTFOUND) {
        r = ENOENT;
    } else if (r == 0) {
        // verify that newname does not already exist
        r = db_getf_set(env->i->directory, txn, DB_SERIALIZABLE, &new_dname_dbt, ydb_getf_do_nothing, NULL);
        if (r == 0) {
            r = EEXIST;
        }
        else if (r == DB_NOTFOUND) {
            // remove old (dname,iname) and insert (newname,iname) in directory
            r = toku_db_del(env->i->directory, txn, &old_dname_dbt, DB_DELETE_ANY, true);
            if (r != 0) { goto exit; }
            r = toku_db_put(env->i->directory, txn, &new_dname_dbt, &iname_dbt, 0, true);
            if (r != 0) { goto exit; }

            //Now that we have writelocks on both dnames, verify that there are still no handles open. (to prevent race conditions)
            if (env_is_db_with_dname_open(env, dname)) {
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
                goto exit;
            }
            if (env_is_db_with_dname_open(env, newname)) {
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
                goto exit;
            }

            // we know a live db handle does not exist.
            //
            // use the internally opened db to try and get a table lock
            //
            // if we can't get it, then some txn needs the ft and we
            // should return lock not granted.
            //
            // no new handles can open for this dictionary
            // because the txn has directory write locks on the dname
            if (txn && !can_acquire_table_lock(env, txn, iname)) {
                r = DB_LOCK_NOTGRANTED;
            }
            // We don't do anything at the ft or cachetable layer for rename.
            // We just update entries in the environment's directory.
        }
    }

exit:
    if (iname) {
        toku_free(iname);
    }
    return r;
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
2697

2698
int 
Yoni Fogel's avatar
Yoni Fogel committed
2699
DB_CREATE_FUN (DB ** db, DB_ENV * env, uint32_t flags) {
2700 2701
    int r = toku_db_create(db, env, flags); 
    return r;
2702 2703 2704 2705
}

/* need db_strerror_r for multiple threads */

2706
const char *
2707
db_strerror(int error) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2708 2709 2710 2711 2712 2713 2714
    char *errorstr;
    if (error >= 0) {
        errorstr = strerror(error);
        if (errorstr)
            return errorstr;
    }
    
2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729
    switch (error) {
        case DB_BADFORMAT:
            return "Database Bad Format (probably a corrupted database)";
        case DB_NOTFOUND:
            return "Not found";
        case TOKUDB_OUT_OF_LOCKS:
            return "Out of locks";
        case TOKUDB_DICTIONARY_TOO_OLD:
            return "Dictionary too old for this version of TokuDB";
        case TOKUDB_DICTIONARY_TOO_NEW:
            return "Dictionary too new for this version of TokuDB";
        case TOKUDB_CANCELED:
            return "User cancelled operation";
        case TOKUDB_NO_DATA:
            return "Ran out of data (not EOF)";
2730 2731
        case TOKUDB_HUGE_PAGES_ENABLED:
            return "Transparent huge pages are enabled but TokuDB's memory allocator will oversubscribe main memory with transparent huge pages.  This check can be disabled by setting the environment variable TOKU_HUGE_PAGES_OK.";
2732
    }
2733

2734
    static char unknown_result[100];    // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string.
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2735 2736 2737 2738 2739
    errorstr = unknown_result;
    snprintf(errorstr, sizeof unknown_result, "Unknown error code: %d", error);
    return errorstr;
}

2740 2741
const char *
db_version(int *major, int *minor, int *patch) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2742 2743 2744 2745 2746 2747
    if (major)
        *major = DB_VERSION_MAJOR;
    if (minor)
        *minor = DB_VERSION_MINOR;
    if (patch)
        *patch = DB_VERSION_PATCH;
2748
    return toku_product_name_strings.db_version;
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
2749
}
2750
 
Yoni Fogel's avatar
Yoni Fogel committed
2751 2752 2753 2754 2755 2756 2757 2758 2759 2760
// HACK: To ensure toku_pthread_yield gets included in the .so
// non-static would require a prototype in a header
// static (since unused) would give a warning
// static + unused would not actually help toku_pthread_yield get in the .so
// static + used avoids all the warnings and makes sure toku_pthread_yield is in the .so
// The body only calls toku_pthread_yield() so that the symbol is referenced
// from this translation unit; nothing in this section calls this function.
static void __attribute__((__used__))
include_toku_pthread_yield (void) {
    toku_pthread_yield();
}

2761
// For test purposes only, translate dname to iname
2762 2763
// YDB lock is NOT held when this function is called,
// as it is called by user
2764 2765 2766
static int 
env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) {
    DB *directory = env->i->directory;
2767
    int r = autotxn_db_get(directory, NULL, dname_dbt, iname_dbt, DB_SERIALIZABLE|DB_PRELOCKED); // allocates memory for iname
2768 2769 2770
    return r;
}

2771 2772 2773 2774
// TODO 2216:  Patch out this (dangerous) function when loader is working and 
//             we don't need to test the low-level redirect anymore.
// for use by test programs only, just a wrapper around brt call:
int
2775
toku_test_db_redirect_dictionary(DB * db, const char * dname_of_new_file, DB_TXN *dbtxn) {
2776 2777 2778 2779 2780
    int r;
    DBT dname_dbt;
    DBT iname_dbt;
    char * new_iname_in_env;

2781
    FT_HANDLE brt = db->i->ft_handle;
2782 2783 2784
    TOKUTXN tokutxn = db_txn_struct_i(dbtxn)->tokutxn;

    toku_fill_dbt(&dname_dbt, dname_of_new_file, strlen(dname_of_new_file)+1);
2785
    toku_init_dbt_flags(&iname_dbt, DB_DBT_REALLOC);
2786
    r = toku_db_get(db->dbenv->i->directory, dbtxn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
2787
    assert_zero(r);
2788
    new_iname_in_env = (char *) iname_dbt.data;
2789

2790
    toku_multi_operation_client_lock(); //Must hold MO lock for dictionary_redirect.
Yoni Fogel's avatar
Yoni Fogel committed
2791
    r = toku_dictionary_redirect(new_iname_in_env, brt, tokutxn);
2792
    toku_multi_operation_client_unlock();
2793 2794 2795 2796

    toku_free(new_iname_in_env);
    return r;
}
2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807

// Test-only function.
// Returns the logger's last LSN, or ZERO_LSN's value when env is NULL or the
// environment has no logger.
uint64_t
toku_test_get_latest_lsn(DB_ENV *env) {
    LSN latest = ZERO_LSN;
    const bool have_logger = env && env->i->logger;
    if (have_logger) {
        latest = toku_logger_last_lsn(env->i->logger);
    }
    return latest.lsn;
}

2808 2809
int 
toku_test_get_checkpointing_user_data_status (void) {
2810 2811
    return toku_cachetable_get_checkpointing_user_data_status();
}
2812

2813 2814
#undef STATUS_VALUE
#undef PERSISTENT_UPGRADE_STATUS_VALUE
2815

2816
#include <toku_race_tools.h>
2817
void __attribute__((constructor)) toku_ydb_helgrind_ignore(void);
2818
void
2819
toku_ydb_helgrind_ignore(void) {
2820
    TOKU_VALGRIND_HG_DISABLE_CHECKING(&ydb_layer_status, sizeof ydb_layer_status);
2821
}