// ha_tokudb.h: declarations of the ha_tokudb handler class and the table-level state it shares
// Emit the gcc "#pragma interface" marker only when the build defines
// USE_PRAGMA_INTERFACE.
#ifdef USE_PRAGMA_INTERFACE
#pragma interface               /* gcc class implementation */
#endif

#include <db.h>

typedef struct st_tokudb_share {
8 9 10 11 12 13
    char *table_name;
    uint table_name_length, use_count;
    pthread_mutex_t mutex;
    THR_LOCK lock;

    ulonglong auto_ident;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
14
    ulonglong last_auto_increment, auto_inc_create_value;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
15 16 17
    //
    // estimate on number of rows in table
    //
Zardosht Kasheff's avatar
Zardosht Kasheff committed
18
    ha_rows rows;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
19 20 21 22 23
    //
    // estimate on number of rows added in the process of a locked tables
    // this is so we can better estimate row count during a lock table
    //
    ha_rows rows_from_locked_table;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
24 25 26 27 28 29 30
    DB *status_block;
    //
    // DB that is indexed on the primary key
    //
    DB *file;
    //
    // array of all DB's that make up table, includes DB that
Zardosht Kasheff's avatar
Zardosht Kasheff committed
31 32
    // is indexed on the primary key, add 1 in case primary
    // key is hidden
Zardosht Kasheff's avatar
Zardosht Kasheff committed
33
    //
Zardosht Kasheff's avatar
Zardosht Kasheff committed
34 35
    DB *key_file[MAX_KEY +1];
    u_int32_t key_type[MAX_KEY +1];
Zardosht Kasheff's avatar
Zardosht Kasheff committed
36
    uint status, version, capabilities;
37
    uint ref_length;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
38 39 40 41 42 43 44 45 46
    bool fixed_length_primary_key, fixed_length_row; 
    //
    // whether table has an auto increment column
    //
    bool has_auto_inc;
    //
    // index of auto increment column in table->field, if auto_inc exists
    //
    uint ai_field_index;
47 48
} TOKUDB_SHARE;

//
// version number of the handler's on-disk metadata format
// NOTE(review): presumably the value stored under the hatoku_version
// metadata key; confirm against the .cc file
//
#define HA_TOKU_VERSION 2
//
// no capabilities yet
//
#define HA_TOKU_CAP 0


//
// These are keys that will be used for retrieving metadata in status.tokudb
// To get the version, one looks up the value associated with key hatoku_version
// in status.tokudb
//
// The numeric values are spelled out explicitly; they match what the
// implicit numbering produced before.
//
typedef enum {
    hatoku_version = 0,
    hatoku_capabilities = 1,
    hatoku_max_ai = 2,          // maximum auto increment value found so far
    hatoku_ai_create_value = 3
} HA_METADATA_KEY;

//
// values of the byte stored in keys to record whether a column is NULL
//
#define NULL_COL_VAL 0
#define NONNULL_COL_VAL 1

//
// values of the byte recording whether the rest of a key is
// -infinity (COL_NEG_INF) or +infinity (COL_POS_INF)
//
#define COL_NEG_INF 0
#define COL_POS_INF 1

Zardosht Kasheff's avatar
Zardosht Kasheff committed
80 81 82 83 84
typedef struct st_prim_key_part_info {
    uint offset;
    uint part_index;
} PRIM_KEY_PART_INFO;

//
// kind of table lock requested through ha_tokudb::acquire_table_lock
//
typedef enum {
    lock_read = 0,
    lock_write = 1
} TABLE_LOCK_TYPE;

90
class ha_tokudb : public handler {
91
private:
92 93 94
    THR_LOCK_DATA lock;         ///< MySQL lock
    TOKUDB_SHARE *share;        ///< Shared lock info

Zardosht Kasheff's avatar
Zardosht Kasheff committed
95 96 97 98 99 100 101 102 103
    //
    // last key returned by ha_tokudb's cursor
    //
    DBT last_key;
    //
    // current row pointed to by ha_tokudb's cursor
    // TODO: make sure current_row gets set properly
    //
    DBT current_row;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
104 105 106
    //
    // pointer used for multi_alloc of key_buff, key_buff2, primary_key_buff
    //
107
    void *alloc_ptr;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
108 109 110 111 112
    //
    // buffer used to temporarily store a "packed row" 
    // data pointer of a DBT will end up pointing to this
    // see pack_row for usage
    //
113
    uchar *rec_buff;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
    //
    // number of bytes allocated in rec_buff
    //
    ulong alloced_rec_buff_length;
    //
    // buffer used to temporarily store a "packed key" 
    // data pointer of a DBT will end up pointing to this
    //
    uchar *key_buff; 
    //
    // buffer used to temporarily store a "packed key" 
    // data pointer of a DBT will end up pointing to this
    // This is used in functions that require the packing
    // of more than one key
    //
    uchar *key_buff2; 
    //
    // buffer used to temporarily store a "packed key" 
    // data pointer of a DBT will end up pointing to this
    // currently this is only used for a primary key in
    // the function update_row, hence the name. It 
    // does not carry any state throughout the class.
    //
    uchar *primary_key_buff;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
138 139 140 141

    //
    // transaction used by ha_tokudb's cursor
    //
142
    DB_TXN *transaction;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
143

Zardosht Kasheff's avatar
Zardosht Kasheff committed
144 145 146
    //
    // instance of cursor being used for init_xxx and rnd_xxx functions
    //
147
    DBC *cursor;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
148 149 150
    //
    // flags that are returned in table_flags()
    //
Zardosht Kasheff's avatar
Zardosht Kasheff committed
151
    ulonglong int_table_flags;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
152
    // 
Zardosht Kasheff's avatar
Zardosht Kasheff committed
153 154
    // count on the number of rows that gets changed, such as when write_row occurs
    // this is meant to help keep estimate on number of elements in DB
Zardosht Kasheff's avatar
Zardosht Kasheff committed
155 156 157
    // 
    ulonglong added_rows;
    ulonglong deleted_rows;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172


    //
    // count on number of rows inserted by statement
    // this is to help give user progress on what is happening
    // the reason that the variables added_rows and deleted_rows
    // are not used is that those variables are also used to help
    // estimate the number of rows in the DB. There are tricky things that
    // can happen with "lock tables", so I do not want to couple these
    // two features together. There is a little duplicate work, but I think it is fine
    //
    ulonglong num_added_rows_in_stmt;
    ulonglong num_deleted_rows_in_stmt;
    ulonglong num_updated_rows_in_stmt;

Zardosht Kasheff's avatar
Zardosht Kasheff committed
173 174 175 176 177 178 179 180 181 182 183
    //
    // index into key_file that holds DB* that is indexed on
    // the primary_key. this->key_file[primary_index] == this->file
    //
    uint primary_key;
    uint last_dup_key;
    //
    // if set to 0, then the primary key is not hidden
    // if non-zero (not necessarily 1), primary key is hidden
    //
    uint hidden_primary_key;
184
    bool key_read, using_ignore;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
185

Zardosht Kasheff's avatar
Zardosht Kasheff committed
186 187 188 189 190 191 192
    //
    // After a cursor encounters an error, the cursor will be unusable
    // In case MySQL attempts to do a cursor operation (such as rnd_next
    // or index_prev), we will gracefully return this error instead of crashing
    //
    int last_cursor_error;

Zardosht Kasheff's avatar
Zardosht Kasheff committed
193 194 195 196 197
    //
    // For instances where we successfully prelock a range or a table,
    // we set this to TRUE so that successive cursor calls can know
    // know to limit the locking overhead in a call to the fractal tree
    //
Zardosht Kasheff's avatar
Zardosht Kasheff committed
198
    bool range_lock_grabbed;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
199

Zardosht Kasheff's avatar
Zardosht Kasheff committed
200
    PRIM_KEY_PART_INFO* primary_key_offsets;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
201

Zardosht Kasheff's avatar
Zardosht Kasheff committed
202 203 204 205 206 207 208 209
    //
    // buffer for updating the status of long insert, delete, and update
    // statements. Right now, the the messages are 
    // "[inserted|updated|deleted] about %llu rows",
    // so a buffer of 200 is good enough.
    //
    char write_status_msg[200]; //buffer of 200 should be a good upper bound.

210 211 212 213 214
    bool fix_rec_buff_for_blob(ulong length);
#define TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH 5 // QQQ why 5?
    uchar current_ident[TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH];

    ulong max_row_length(const uchar * buf);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
215 216
    int pack_row(DBT * row, const uchar * record, bool strip_pk);
    u_int32_t place_key_into_mysql_buff(uchar * record, uchar* data, uint index);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
217
    void unpack_key(uchar * record, DBT const *key, uint index);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
218
    u_int32_t place_key_into_dbt_buff(KEY* key_info, uchar * buff, const uchar * record, bool* has_null, int key_length);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
219 220
    DBT* create_dbt_key_from_key(DBT * key, KEY* key_info, uchar * buff, const uchar * record, bool* has_null, int key_length = MAX_KEY_LENGTH);
    DBT *create_dbt_key_from_table(DBT * key, uint keynr, uchar * buff, const uchar * record, bool* has_null, int key_length = MAX_KEY_LENGTH);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
221
    DBT *pack_key(DBT * key, uint keynr, uchar * buff, const uchar * key_ptr, uint key_length, uchar inf_byte);
222
    int remove_key(DB_TXN * trans, uint keynr, const uchar * record, DBT * prim_key);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
223
    int remove_keys(DB_TXN * trans, const uchar * record, DBT * prim_key, key_map * keys);
224
    int key_cmp(uint keynr, const uchar * old_row, const uchar * new_row);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
225
    int update_primary_key(DB_TXN * trans, bool primary_key_changed, const uchar * old_row, DBT * old_key, const uchar * new_row, DBT * prim_key);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
226
    int handle_cursor_error(int error, int err_to_return, uint keynr);
227
    DBT *get_pos(DBT * to, uchar * pos);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
228 229
 
    int open_secondary_table(DB** ptr, KEY* key_info, const char* name, int mode, u_int32_t* key_type);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
230
    int acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
231
    int estimate_num_rows(DB* db, u_int64_t* num_rows);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
232
    bool has_auto_increment_flag(uint* index);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
233
    int write_metadata(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size );
Zardosht Kasheff's avatar
Zardosht Kasheff committed
234 235 236 237
    int update_max_auto_inc(DB* db, ulonglong val);
    int write_auto_inc_create(DB* db, ulonglong val);
    void init_auto_increment();

Zardosht Kasheff's avatar
Zardosht Kasheff committed
238
 
239 240 241 242 243
public:
    ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg);
    ~ha_tokudb() {
    } 
    const char *table_type() const {
244
        return "TOKUDB";
245 246
    } 
    const char *index_type(uint inx) {
247 248 249
        return "BTREE";
    }
    const char **bas_ext() const;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
250 251 252 253 254

    //
    // Returns a bit mask of capabilities of storage engine. Capabilities 
    // defined in sql/handler.h
    //
255 256
    ulonglong table_flags(void) const {
        return int_table_flags;
257 258
    } 
    ulong index_flags(uint inx, uint part, bool all_parts) const;
259

Zardosht Kasheff's avatar
Zardosht Kasheff committed
260 261 262
    //
    // Returns limit on the number of keys imposed by tokudb.
    //
263
    uint max_supported_keys() const {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
264
        return MAX_KEY;
265
    } 
Zardosht Kasheff's avatar
Zardosht Kasheff committed
266

267
    uint extra_rec_buf_length() const {
268
        return TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
269 270
    } 
    ha_rows estimate_rows_upper_bound();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
271 272 273 274

    //
    // Returns the limit on the key length imposed by tokudb.
    //
275 276
    uint max_supported_key_length() const {
        return UINT_MAX32;
277
    } 
Zardosht Kasheff's avatar
Zardosht Kasheff committed
278 279 280 281

    //
    // Returns limit on key part length imposed by tokudb.
    //
282
    uint max_supported_key_part_length() const {
283
        return UINT_MAX32;
284 285
    } 
    const key_map *keys_to_use_for_scanning() {
286 287 288 289
        return &key_map_full;
    }

    double scan_time();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
290
    double read_time(uint index, uint ranges, ha_rows rows);
291 292 293

    int open(const char *name, int mode, uint test_if_locked);
    int close(void);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
294
    void update_create_info(HA_CREATE_INFO* create_info);
295 296 297
    int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
    int delete_table(const char *name);
    int rename_table(const char *from, const char *to);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
298
    int optimize(THD * thd, HA_CHECK_OPT * check_opt);
299
#if 0
300
    int analyze(THD * thd, HA_CHECK_OPT * check_opt);
301
#endif
302 303 304 305
    int write_row(uchar * buf);
    int update_row(const uchar * old_data, uchar * new_data);
    int delete_row(const uchar * buf);

Zardosht Kasheff's avatar
Zardosht Kasheff committed
306
    int prepare_index_scan();
307 308 309 310
    int index_init(uint index, bool sorted);
    int index_end();
    int index_read(uchar * buf, const uchar * key, uint key_len, enum ha_rkey_function find_flag);
    int index_read_idx(uchar * buf, uint index, const uchar * key, uint key_len, enum ha_rkey_function find_flag);
311
#if 0
312
    int index_read_last(uchar * buf, const uchar * key, uint key_len);
313
#endif
314 315 316 317 318 319 320 321 322 323 324
    int index_next(uchar * buf);
    int index_next_same(uchar * buf, const uchar * key, uint keylen);
    int index_prev(uchar * buf);
    int index_first(uchar * buf);
    int index_last(uchar * buf);

    int rnd_init(bool scan);
    int rnd_end();
    int rnd_next(uchar * buf);
    int rnd_pos(uchar * buf, uchar * pos);

Zardosht Kasheff's avatar
Zardosht Kasheff committed
325 326 327 328 329 330
    int read_range_first(const key_range *start_key,
                                 const key_range *end_key,
                                 bool eq_range, bool sorted);
    int read_range_next();


331 332 333 334 335 336 337 338 339 340 341
    void position(const uchar * record);
    int info(uint);
    int extra(enum ha_extra_function operation);
    int reset(void);
    int external_lock(THD * thd, int lock_type);
    int start_stmt(THD * thd, thr_lock_type lock_type);

    ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);

    THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_lock_type lock_type);

Zardosht Kasheff's avatar
Zardosht Kasheff committed
342
    int get_status();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
343
    void init_hidden_prim_key_info();
344 345 346 347 348 349 350
    inline void get_auto_primary_key(uchar * to) {
        pthread_mutex_lock(&share->mutex);
        share->auto_ident++;
        int5store(to, share->auto_ident);
        pthread_mutex_unlock(&share->mutex);
    }
    virtual void get_auto_increment(ulonglong offset, ulonglong increment, ulonglong nb_desired_values, ulonglong * first_value, ulonglong * nb_reserved_values);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
351
    bool is_auto_inc_singleton();
352 353 354 355 356 357 358
    void print_error(int error, myf errflag);
    uint8 table_cache_type() {
        return HA_CACHE_TBL_TRANSACT;
    }
    bool primary_key_is_clustered() {
        return true;
    }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
359 360 361
    bool supports_clustered_keys() {
        return true;
    }
362 363 364
    int cmp_ref(const uchar * ref1, const uchar * ref2);
    bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);

Zardosht Kasheff's avatar
Zardosht Kasheff committed
365 366 367 368
    int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys);
    int prepare_drop_index(TABLE *table_arg, uint *key_num, uint num_of_keys);
    int final_drop_index(TABLE *table_arg);

369 370 371
    // delete all rows from the table
    // effect: all dictionaries, including the main and indexes, should be empty
    int delete_all_rows();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
372 373 374 375
    void extract_hidden_primary_key(uint keynr, DBT const *row, DBT const *found_key);
    void read_key_only(uchar * buf, uint keynr, DBT const *row, DBT const *found_key);
    void read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key);
    int read_row(uchar * buf, uint keynr, DBT const *row, DBT const *found_key);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
376
    void unpack_row(uchar * record, DBT const *row, DBT const *key, bool pk_stripped);
377

Zardosht Kasheff's avatar
Zardosht Kasheff committed
378 379
    int heavi_ret_val;

380
private:
Zardosht Kasheff's avatar
Zardosht Kasheff committed
381
    int read_full_row(uchar * buf);
382
    int __close(int mutex_is_locked);
383
    int read_last();
384
};