diff --git a/btr/btr0btr.c b/btr/btr0btr.c index 2a165d3ce9172c5bc967320157482a1600c236f3..0f980c8735a4968b25cc45f008b7e54afbb77aad 100644 --- a/btr/btr0btr.c +++ b/btr/btr0btr.c @@ -108,7 +108,8 @@ btr_page_empty( /*===========*/ page_t* page, /* in: page to be emptied */ page_zip_des_t* page_zip,/* out: compressed page, or NULL */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr, /* in: mtr */ + dict_index_t* index); /* in: the index of the page */ /***************************************************************** Returns TRUE if the insert fits on the appropriate half-page with the chosen split_rec. */ @@ -261,7 +262,7 @@ btr_page_create( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); page_create(page, NULL, mtr, - UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); + UT_LIST_GET_FIRST(tree->tree_indexes)); buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, NULL, tree->id, mtr); @@ -498,8 +499,8 @@ void btr_node_ptr_set_child_page_no( /*===========================*/ rec_t* rec, /* in: node pointer record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 8 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint page_no,/* in: child node address */ mtr_t* mtr) /* in: mtr */ @@ -510,17 +511,18 @@ btr_node_ptr_set_child_page_no( ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr)); ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); - ut_ad(!page_zip || page_zip_available(page_zip, 8)); /* The child address is in the last field */ field = rec_get_nth_field(rec, offsets, rec_offs_n_fields(offsets) - 1, &len); - ut_ad(len == 4); + ut_ad(len == REC_NODE_PTR_SIZE); - mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr); if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, field, 4); + page_zip_write_node_ptr(page_zip, rec, + rec_offs_data_size(offsets), page_no, mtr); + } else { + mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr); } } @@ -658,13 +660,13 @@ Creates the root node for a new index tree. 
*/ ulint btr_create( /*=======*/ - /* out: page number of the created root, FIL_NULL if - did not succeed */ - ulint type, /* in: type of the index */ - ulint space, /* in: space where created */ - dulint index_id,/* in: index id */ - ulint comp, /* in: nonzero=compact page format */ - mtr_t* mtr) /* in: mini-transaction handle */ + /* out: page number of the created root, + FIL_NULL if did not succeed */ + ulint type, /* in: type of the index */ + ulint space, /* in: space where created */ + dulint index_id,/* in: index id */ + dict_index_t* index, /* in: index */ + mtr_t* mtr) /* in: mini-transaction handle */ { ulint page_no; buf_frame_t* ibuf_hdr_frame; @@ -732,7 +734,7 @@ btr_create( } /* Create a new index page on the the allocated segment page */ - page = page_create(frame, NULL, mtr, comp); + page = page_create(frame, NULL, mtr, index); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ @@ -759,7 +761,8 @@ btr_create( page_zip = buf_block_get_page_zip(buf_block_align(page)); if (UNIV_LIKELY_NULL(page_zip)) { - if (UNIV_UNLIKELY(page_zip_compress(page_zip, page))) { + if (UNIV_UNLIKELY(page_zip_compress( + page_zip, page, index, mtr))) { /* An empty page should always be compressible */ ut_error; } @@ -842,7 +845,7 @@ btr_free_root( /***************************************************************** Reorganizes an index page. */ static -void +ibool btr_page_reorganize_low( /*====================*/ ibool recovery,/* in: TRUE if called in recovery: @@ -861,6 +864,7 @@ btr_page_reorganize_low( ulint data_size2; ulint max_ins_size1; ulint max_ins_size2; + ibool success = FALSE; ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); @@ -888,7 +892,7 @@ btr_page_reorganize_low( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, NULL, mtr, page_is_comp(page)); + page_create(page, NULL, mtr, index); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; @@ -900,11 +904,13 @@ btr_page_reorganize_low( page_set_max_trx_id(page, NULL, page_get_max_trx_id(new_page)); if (UNIV_LIKELY_NULL(page_zip)) { - if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page))) { + if (UNIV_UNLIKELY(!page_zip_compress( + page_zip, page, index, mtr))) { - /* Reorganizing a page should reduce entropy, - making the compressed page occupy less space. */ - ut_error; + /* Restore the old page and exit. */ + buf_frame_copy(page, new_page); + + goto func_exit; } } @@ -927,27 +933,33 @@ btr_page_reorganize_low( (unsigned long) data_size1, (unsigned long) data_size2, (unsigned long) max_ins_size1, (unsigned long) max_ins_size2); + } else { + success = TRUE; } +func_exit: buf_frame_free(new_page); /* Restore logging mode */ mtr_set_log_mode(mtr, log_mode); + + return(success); } /***************************************************************** Reorganizes an index page. 
*/ -void +ibool btr_page_reorganize( /*================*/ + /* out: TRUE on success, FALSE on failure */ page_t* page, /* in: page to be reorganized */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { - btr_page_reorganize_low(FALSE, page, + return(btr_page_reorganize_low(FALSE, page, buf_block_get_page_zip(buf_block_align(page)), - index, mtr); + index, mtr)); } /*************************************************************** @@ -985,7 +997,8 @@ btr_page_empty( /*===========*/ page_t* page, /* in: page to be emptied */ page_zip_des_t* page_zip,/* out: compressed page, or NULL */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr, /* in: mtr */ + dict_index_t* index) /* in: index of the page */ { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); @@ -996,7 +1009,7 @@ btr_page_empty( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, page_zip, mtr, page_is_comp(page)); + page_create(page, page_zip, mtr, index); buf_block_align(page)->check_index_page_at_flush = TRUE; } @@ -1086,7 +1099,9 @@ btr_root_raise_and_insert( node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level); /* Reorganize the root to get free space */ - btr_page_reorganize_low(FALSE, root, NULL, cursor->index, mtr); + if (!btr_page_reorganize_low(FALSE, root, NULL, cursor->index, mtr)) { + ut_error; /* TODO: page_zip */ + } page_cursor = btr_cur_get_page_cur(cursor); @@ -1105,10 +1120,11 @@ btr_root_raise_and_insert( as there is no lower alphabetical limit to records in the leftmost node of a level: */ - btr_set_min_rec_mark(node_ptr_rec, NULL, mtr); + btr_set_min_rec_mark(node_ptr_rec, mtr); if (UNIV_LIKELY_NULL(page_zip) - && !UNIV_UNLIKELY(page_zip_compress(page_zip, root))) { + && !UNIV_UNLIKELY(page_zip_compress(page_zip, root, + cursor->index, mtr))) { /* The root page should only contain the node pointer to new_page at this point. Thus, the data should fit. */ @@ -1487,8 +1503,9 @@ btr_attach_half_pages( /*==================*/ dict_tree_t* tree, /* in: the index tree */ page_t* page, /* in/out: page to be split */ - page_zip_des_t* page_zip, /* in/out: compressed page with - at least 8 bytes available, or NULL */ + page_zip_des_t* page_zip, /* in/out: compressed page whose + uncompressed part will be updated, + or NULL */ rec_t* split_rec, /* in: first record on upper half page */ page_t* new_page, /* in: the new half page */ @@ -1515,7 +1532,6 @@ btr_attach_half_pages( MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page), MTR_MEMO_PAGE_X_FIX)); - ut_ad(!page_zip || page_zip_available(page_zip, 8)); ut_a(page_is_comp(page) == page_is_comp(new_page)); /* Create a memory heap where the data tuple is stored */ @@ -1663,11 +1679,6 @@ btr_page_split_and_insert( page = btr_cur_get_page(cursor); page_zip = buf_block_get_page_zip(buf_block_align(page)); - if (UNIV_LIKELY_NULL(page_zip)) { - if (UNIV_UNLIKELY(!page_zip_available(page_zip, 8))) { - ut_error; /* TODO: split the page */ - } - } ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); @@ -1719,7 +1730,7 @@ btr_page_split_and_insert( /* 4. 
Do first the modifications in the tree structure */ - btr_attach_half_pages(tree, page, page_zip/* 8 */, first_rec, + btr_attach_half_pages(tree, page, page_zip, first_rec, new_page, direction, mtr); /* If the split is made on the leaf level and the insert will fit @@ -1751,8 +1762,7 @@ btr_page_split_and_insert( page_move_rec_list_start(new_page, buf_block_get_page_zip( buf_block_align(new_page)), - move_limit, buf_block_get_page_zip( - buf_block_align(page)), + move_limit, page_zip, cursor->index, mtr); left_page = new_page; @@ -1764,8 +1774,7 @@ btr_page_split_and_insert( page_move_rec_list_end(new_page, buf_block_get_page_zip( buf_block_align(new_page)), - move_limit, buf_block_get_page_zip( - buf_block_align(page)), + move_limit, page_zip, cursor->index, mtr); left_page = page; right_page = new_page; @@ -1821,7 +1830,11 @@ btr_page_split_and_insert( /* 8. If insert did not fit, try page reorganization */ - btr_page_reorganize(insert_page, cursor->index, mtr); + if (UNIV_UNLIKELY(!btr_page_reorganize( + insert_page, cursor->index, mtr))) { + + goto insert_failed; + } page_cur_search(insert_page, cursor->index, tuple, PAGE_CUR_LE, page_cursor); @@ -1831,7 +1844,7 @@ btr_page_split_and_insert( if (UNIV_UNLIKELY(rec == NULL)) { /* The insert did not fit on the page: loop back to the start of the function for a new split */ - +insert_failed: /* We play safe and reset the free bits for new_page */ ibuf_reset_free_bits(cursor->index, new_page); @@ -1948,14 +1961,11 @@ btr_parse_set_min_rec_mark( } if (page) { - page_zip_des_t* page_zip = buf_block_get_page_zip( - buf_block_align(page)); - ut_a(!page_is_comp(page) == !comp); rec = page + mach_read_from_2(ptr); - btr_set_min_rec_mark(rec, page_zip, mtr); + btr_set_min_rec_mark(rec, mtr); } return(ptr + 2); @@ -1967,23 +1977,18 @@ Sets a record as the predefined minimum record. 
*/ void btr_set_min_rec_mark( /*=================*/ - rec_t* rec, /* in: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record */ + mtr_t* mtr) /* in: mtr */ { ulint info_bits; if (UNIV_LIKELY(page_rec_is_comp(rec))) { info_bits = rec_get_info_bits(rec, TRUE); - rec_set_info_bits_new(rec, page_zip, - info_bits | REC_INFO_MIN_REC_FLAG); + rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG); btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr); } else { - ut_ad(!page_zip); - info_bits = rec_get_info_bits(rec, FALSE); rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG); @@ -2058,7 +2063,7 @@ btr_lift_page_up( btr_search_drop_page_hash_index(page); /* Make the father empty */ - btr_page_empty(father_page, NULL, mtr); + btr_page_empty(father_page, NULL, mtr, index); /* Move records to the father */ if (!page_copy_rec_list_end(father_page, NULL, @@ -2070,7 +2075,7 @@ btr_lift_page_up( if (UNIV_LIKELY_NULL(father_page_zip)) { if (UNIV_UNLIKELY(!page_zip_compress( - father_page_zip, father_page))) { + father_page_zip, father_page, index, mtr))) { /* Restore the old page from temporary space */ if (UNIV_UNLIKELY(!page_zip_decompress( father_page_zip, father_page, mtr))) { @@ -2157,11 +2162,6 @@ btr_compress( is_left = left_page_no != FIL_NULL; - if (!is_left && UNIV_LIKELY_NULL(page_zip) - && !page_zip_available(page_zip, 8)) { - return(FALSE); - } - if (is_left) { merge_page = btr_page_get(space, left_page_no, RW_X_LATCH, @@ -2197,7 +2197,11 @@ btr_compress( /* We have to reorganize merge_page */ - btr_page_reorganize(merge_page, cursor->index, mtr); + if (UNIV_UNLIKELY(!btr_page_reorganize( + merge_page, cursor->index, mtr))) { + + return(FALSE); + } max_ins_size = page_get_max_insert_size(merge_page, n_recs); @@ -2228,7 +2232,7 @@ btr_compress( /* Replace the address of the old child node (= page) with the address of the merge page to the right */ - btr_node_ptr_set_child_page_no(node_ptr, page_zip/* 8 */, + btr_node_ptr_set_child_page_no(node_ptr, page_zip, rec_get_offsets(node_ptr, cursor->index, offsets_, ULINT_UNDEFINED, &heap), right_page_no, mtr); @@ -2315,13 +2319,14 @@ btr_discard_only_page_on_level( == dict_tree_get_page(tree))) { /* The father is the root page */ + dict_index_t* index = UT_LIST_GET_FIRST(tree->tree_indexes); + btr_page_empty(father_page, buf_block_get_page_zip(buf_block_align(father_page)), - mtr); + mtr, index); /* We play safe and reset the free bits for the father */ - ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), - father_page); + ibuf_reset_free_bits(index, father_page); } else { ut_ad(page_get_n_recs(father_page) == 1); @@ -2383,21 +2388,11 @@ btr_discard_page( /* We have to mark the leftmost node pointer on the right side page as the predefined minimum record */ - page_zip_des_t* merge_page_zip; - merge_page_zip = buf_block_get_page_zip( - buf_block_align(merge_page)); - - if (UNIV_LIKELY_NULL(merge_page_zip) - && UNIV_UNLIKELY(!page_zip_alloc( - merge_page_zip, merge_page, 5))) { - ut_error; /* TODO: handle this gracefully */ - } - node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); ut_ad(page_rec_is_user_rec(node_ptr)); - btr_set_min_rec_mark(node_ptr, merge_page_zip, mtr); + btr_set_min_rec_mark(node_ptr, mtr); } btr_node_ptr_delete(tree, page, mtr); diff --git a/btr/btr0cur.c b/btr/btr0cur.c index 84bdf3f6fa7ecd9d8c4847812b2082156bf415c8..ef96a7fcff4f467b6dca00cde49616b297da9931 100644 --- 
a/btr/btr0cur.c +++ b/btr/btr0cur.c @@ -74,9 +74,12 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr, /* in: mtr */ - const ulint* offsets);/* in: array returned by rec_get_offsets() */ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr); /* in: mtr, or NULL if not logged */ /*********************************************************************** Adds path information to the cursor for the current page, for which the binary search has been performed. */ @@ -98,9 +101,8 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least n_extern*12 bytes available, - or NULL */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a @@ -122,7 +124,7 @@ btr_rec_get_externally_stored_len( /********************************************************** The following function is used to set the deleted bit of a record. */ UNIV_INLINE -ibool +void btr_rec_set_deleted_flag( /*=====================*/ /* out: TRUE on success; @@ -132,20 +134,11 @@ btr_rec_set_deleted_flag( ulint flag) /* in: nonzero if delete marked */ { if (page_rec_is_comp(rec)) { - if (UNIV_LIKELY_NULL(page_zip) - && UNIV_UNLIKELY(!page_zip_alloc(page_zip, - ut_align_down(rec, UNIV_PAGE_SIZE), 5))) { - rec_set_deleted_flag_new(rec, NULL, flag); - return(FALSE); - } - rec_set_deleted_flag_new(rec, page_zip, flag); } else { ut_ad(!page_zip); rec_set_deleted_flag_old(rec, flag); } - - return(TRUE); } /*==================== B-TREE SEARCH =========================*/ @@ -826,7 +819,6 @@ btr_cur_insert_if_possible( page_zip_des_t* page_zip,/* in: compressed page of cursor */ dtuple_t* tuple, /* in: tuple to insert; the size info need not have been stored to tuple */ - ibool* reorg, /* out: TRUE if reorganization occurred */ mtr_t* mtr) /* in: mtr */ { page_cur_t* page_cursor; @@ -835,8 +827,6 @@ btr_cur_insert_if_possible( ut_ad(dtuple_check_typed(tuple)); - *reorg = FALSE; - page = btr_cur_get_page(cursor); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), @@ -850,15 +840,14 @@ btr_cur_insert_if_possible( if (UNIV_UNLIKELY(!rec)) { /* If record did not fit, reorganize */ - btr_page_reorganize(page, cursor->index, mtr); + if (btr_page_reorganize(page, cursor->index, mtr)) { - *reorg = TRUE; - - page_cur_search(page, cursor->index, tuple, + page_cur_search(page, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, page_zip, + rec = page_cur_tuple_insert(page_cursor, page_zip, tuple, cursor->index, mtr); + } } return(rec); @@ -1077,7 +1066,11 @@ btr_cur_optimistic_insert( entry, index, NULL, NULL, mtr); if (UNIV_UNLIKELY(!(*rec))) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, index, mtr); + if (UNIV_UNLIKELY(!btr_page_reorganize(page, index, mtr))) { + ut_a(page_zip); + + return(DB_FAIL); + } ut_ad(page_get_max_insert_size(page, 1) == max_size); @@ -1089,11 +1082,6 @@ btr_cur_optimistic_insert( entry, 
index, mtr); if (UNIV_UNLIKELY(!*rec)) { - if (UNIV_LIKELY_NULL(page_zip)) { - /* Likely a compressed page overflow */ - return(DB_FAIL); - } - fputs("InnoDB: Error: cannot insert tuple ", stderr); dtuple_print(stderr, entry); fputs(" into ", stderr); @@ -1449,14 +1437,7 @@ btr_cur_parse_update_in_place( pos, trx_id, roll_ptr); } - row_upd_rec_in_place(rec, offsets, update); - - if (UNIV_LIKELY_NULL(page_zip)) { - btr_cur_unmark_extern_fields(rec, NULL, offsets); - - page_zip_write(page_zip, rec - rec_offs_extra_size(offsets), - rec_offs_size(offsets)); - } + row_upd_rec_in_place(rec, offsets, update, page_zip); func_exit: mem_heap_free(heap); @@ -1507,6 +1488,17 @@ btr_cur_update_in_place( } #endif /* UNIV_DEBUG */ + block = buf_block_align(rec); + + /* Check that enough space is available on the compressed page. */ + page_zip = buf_block_get_page_zip(block); + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_UNLIKELY(!page_zip_alloc(page_zip, + buf_block_get_frame(block), index, mtr, + rec_offs_size(offsets), 0))) { + return(DB_ZIP_OVERFLOW); + } + /* Do lock checking and undo logging */ err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr, &roll_ptr); @@ -1518,16 +1510,6 @@ btr_cur_update_in_place( return(err); } - block = buf_block_align(rec); - - page_zip = buf_block_get_page_zip(block); - if (UNIV_LIKELY_NULL(page_zip) - && UNIV_UNLIKELY(!page_zip_alloc(page_zip, - buf_block_get_frame(block), - 4 + rec_offs_size(offsets)))) { - return(DB_OVERFLOW); - } - if (block->is_hashed) { /* The function row_upd_changes_ord_field_binary works only if the update vector was built for a clustered index, we must @@ -1554,33 +1536,26 @@ btr_cur_update_in_place( was_delete_marked = rec_get_deleted_flag(rec, page_is_comp(buf_block_get_frame(block))); - row_upd_rec_in_place(rec, offsets, update); + row_upd_rec_in_place(rec, offsets, update, page_zip); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); } - btr_cur_update_in_place_log(flags, rec, index, update, trx, roll_ptr, - mtr); + btr_cur_update_in_place_log(flags, rec, index, update, + trx, roll_ptr, mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_rec(page_zip, rec, offsets); + } + if (was_delete_marked && !rec_get_deleted_flag(rec, page_is_comp(buf_block_get_frame(block)))) { /* The new updated record owns its possible externally stored fields */ - if (UNIV_LIKELY_NULL(page_zip)) { - /* Do not log the btr_cur_unmark_extern_fields() - if the page is compressed. Do the operation in - crash recovery of MLOG_COMP_REC_UPDATE_IN_PLACE - in that case. 
*/ - mtr = NULL; - } - - btr_cur_unmark_extern_fields(rec, mtr, offsets); - } - - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - rec_offs_extra_size(offsets), - rec_offs_size(offsets)); + btr_cur_unmark_extern_fields( + page_zip, rec, index, offsets, mtr); } if (UNIV_LIKELY_NULL(heap)) { @@ -1601,7 +1576,9 @@ btr_cur_optimistic_update( /*======================*/ /* out: DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit, DB_UNDERFLOW - if the page would become too empty */ + if the page would become too empty, or + DB_ZIP_OVERFLOW if there is not enough + space left on the compressed page */ ulint flags, /* in: undo logging and locking flags */ btr_cur_t* cursor, /* in: cursor on the record to update; cursor stays valid and positioned on the @@ -1618,7 +1595,6 @@ btr_cur_optimistic_update( ulint err; page_t* page; page_zip_des_t* page_zip; - page_zip_des_t* page_zip_used; rec_t* rec; rec_t* orig_rec; ulint max_size; @@ -1628,7 +1604,6 @@ btr_cur_optimistic_update( dulint roll_ptr; trx_t* trx; mem_heap_t* heap; - ibool reorganized = FALSE; ulint i; ulint* offsets; @@ -1653,7 +1628,9 @@ btr_cur_optimistic_update( /* The simplest and the most common case: the update does not change the size of any field and none of the updated fields is - externally stored in rec or update */ + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + mem_heap_free(heap); return(btr_cur_update_in_place(flags, cursor, update, cmpl_info, thr, mtr)); @@ -1687,6 +1664,16 @@ btr_cur_optimistic_update( old_rec_size = rec_offs_size(offsets); new_rec_size = rec_get_converted_size(index, new_entry); + page_zip = buf_block_get_page_zip(buf_block_align(page)); + + if (UNIV_LIKELY_NULL(page_zip) + && !page_zip_alloc(page_zip, page, index, mtr, + new_rec_size, 0)) { + mem_heap_free(heap); + + return(DB_ZIP_OVERFLOW); + } + if (UNIV_UNLIKELY(new_rec_size >= page_get_free_space_of_empty( page_is_comp(page)) / 2)) { @@ -1695,9 +1682,6 @@ btr_cur_optimistic_update( return(DB_OVERFLOW); } - max_size = old_rec_size - + page_get_max_insert_size_after_reorganize(page, 1); - if (UNIV_UNLIKELY(page_get_data_size(page) - old_rec_size + new_rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT)) { @@ -1709,6 +1693,9 @@ btr_cur_optimistic_update( return(DB_UNDERFLOW); } + max_size = old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1); + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) && (max_size >= new_rec_size)) || (page_get_n_recs(page) <= 1))) { @@ -1740,18 +1727,7 @@ btr_cur_optimistic_update( btr_search_update_hash_on_delete(cursor); - page_zip = buf_block_get_page_zip(buf_block_align(page)); - if (UNIV_LIKELY(!page_zip) - || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { - /* If there is not enough space in the page - modification log, ignore the log and - try compressing the page afterwards. 
*/ - page_zip_used = NULL; - } else { - page_zip_used = page_zip; - } - - page_cur_delete_rec(page_cursor, index, offsets, page_zip_used, mtr); + page_cur_delete_rec(page_cursor, index, offsets, page_zip, mtr); page_cur_move_to_prev(page_cursor); @@ -1764,14 +1740,8 @@ btr_cur_optimistic_update( trx->id); } - rec = btr_cur_insert_if_possible(cursor, page_zip_used, - new_entry, &reorganized, mtr); - if (UNIV_UNLIKELY(!rec)) { - /* The above may only fail if page_zip_used != NULL */ - ut_a(page_zip_used); - - goto zip_overflow; - } + rec = btr_cur_insert_if_possible(cursor, page_zip, new_entry, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ if (!rec_get_deleted_flag(rec, page_is_comp(page))) { /* The new inserted record owns its possible externally @@ -1779,24 +1749,8 @@ btr_cur_optimistic_update( offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - btr_cur_unmark_extern_fields(rec, mtr, offsets); - } - - if (UNIV_LIKELY_NULL(page_zip) && UNIV_UNLIKELY(!page_zip_used)) { - if (!page_zip_compress(page_zip, page)) { - -zip_overflow: - if (UNIV_UNLIKELY(!page_zip_decompress( - page_zip, page, mtr))) { - ut_error; - } - /* TODO: is this correct? */ - lock_rec_restore_from_page_infimum(orig_rec, page); - - mem_heap_free(heap); - - return(DB_OVERFLOW); - } + btr_cur_unmark_extern_fields( + page_zip, rec, index, offsets, mtr); } /* Restore the old explicit lock state on the record */ @@ -1884,16 +1838,13 @@ btr_cur_pessimistic_update( mem_heap_t* heap; ulint err; ulint optim_err; - ibool dummy_reorganized; dulint roll_ptr; trx_t* trx; ibool was_first; - ibool success; ulint n_extents = 0; ulint n_reserved; ulint* ext_vect; ulint n_ext_vect; - ulint reserve_flag; ulint* offsets = NULL; *big_rec = NULL; @@ -1912,8 +1863,12 @@ btr_cur_pessimistic_update( optim_err = btr_cur_optimistic_update(flags, cursor, update, cmpl_info, thr, mtr); - if (optim_err != DB_UNDERFLOW && optim_err != DB_OVERFLOW) { - + switch (optim_err) { + case DB_UNDERFLOW: + case DB_OVERFLOW: + case DB_ZIP_OVERFLOW: + break; + default: return(optim_err); } @@ -1926,6 +1881,8 @@ btr_cur_pessimistic_update( } if (optim_err == DB_OVERFLOW) { + ulint reserve_flag; + /* First reserve enough free space for the file segments of the index tree, so that the update will not fail because of lack of space */ @@ -1938,13 +1895,9 @@ btr_cur_pessimistic_update( reserve_flag = FSP_NORMAL; } - success = fsp_reserve_free_extents(&n_reserved, - index->space, - n_extents, reserve_flag, mtr); - if (!success) { - err = DB_OUT_OF_FILE_SPACE; - - return(err); + if (!fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, reserve_flag, mtr)) { + return(DB_OUT_OF_FILE_SPACE); } } @@ -1973,8 +1926,8 @@ btr_cur_pessimistic_update( update it back again. 
*/ ut_a(big_rec_vec == NULL); - - btr_rec_free_updated_extern_fields(index, rec, 0/*TODO*/, + + btr_rec_free_updated_extern_fields(index, rec, page_zip, offsets, update, TRUE, mtr); } @@ -2020,8 +1973,8 @@ btr_cur_pessimistic_update( page_cur_move_to_prev(page_cursor); - rec = btr_cur_insert_if_possible(cursor, page_zip, new_entry, - &dummy_reorganized, mtr); + /* TODO: set extern flags in new_entry */ + rec = btr_cur_insert_if_possible(cursor, page_zip, new_entry, mtr); ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { @@ -2029,13 +1982,19 @@ btr_cur_pessimistic_update( ULINT_UNDEFINED, &heap); lock_rec_restore_from_page_infimum(rec, page); + /* TODO: set these before insert */ rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr, offsets); + btr_cur_unmark_extern_fields( + page_zip, rec, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_rec(page_zip, rec, offsets); } btr_cur_compress_if_useful(cursor, mtr); @@ -2044,14 +2003,9 @@ btr_cur_pessimistic_update( goto return_after_reservations; } - if (page_cur_is_before_first(page_cursor)) { - /* The record to be updated was positioned as the first user - record on its page */ - - was_first = TRUE; - } else { - was_first = FALSE; - } + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); /* The first parameter means that no lock checking and undo logging is made in the insert */ @@ -2065,6 +2019,7 @@ btr_cur_pessimistic_update( ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + /* TODO: set these before insert */ rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); @@ -2072,7 +2027,8 @@ btr_cur_pessimistic_update( /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr, offsets); + btr_cur_unmark_extern_fields( + page_zip, rec, index, offsets, mtr); } lock_rec_restore_from_page_infimum(rec, page); @@ -2203,20 +2159,15 @@ btr_cur_parse_del_mark_set_clust_rec( is only being recovered, and there cannot be a hash index to it. */ - if (UNIV_UNLIKELY(!btr_rec_set_deleted_flag(rec, - page_zip, val))) { - /* page_zip overflow should have been detected - before writing MLOG_COMP_REC_CLUST_DELETE_MARK */ - ut_error; - } + btr_rec_set_deleted_flag(rec, page_zip, val); if (!(flags & BTR_KEEP_SYS_FLAG)) { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; *offsets_ = (sizeof offsets_) / sizeof *offsets_; - /* TODO: page_zip_write(whole record)? 
*/ - row_upd_rec_sys_fields_in_recovery(rec, page_zip, + row_upd_rec_sys_fields_in_recovery(rec, + page_zip, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), pos, trx_id, roll_ptr); @@ -2274,20 +2225,6 @@ btr_cur_del_mark_set_clust_rec( ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); page_zip = buf_block_get_page_zip(buf_block_align(rec)); - if (UNIV_LIKELY_NULL(page_zip)) { - ulint size = 5; - - if (!(flags & BTR_KEEP_SYS_FLAG)) { - size += 21;/* row_upd_rec_sys_fields() */ - } - - if (UNIV_UNLIKELY(!page_zip_alloc(page_zip, - ut_align_down(rec, UNIV_PAGE_SIZE), size))) { - - err = DB_OVERFLOW; - goto func_exit; - } - } err = lock_clust_rec_modify_check_and_lock(flags, rec, index, offsets, thr); @@ -2311,15 +2248,12 @@ btr_cur_del_mark_set_clust_rec( rw_lock_x_lock(&btr_search_latch); } - if (!btr_rec_set_deleted_flag(rec, page_zip/* 5 bytes */, val)) { - /* page_zip_alloc() said there is enough space */ - ut_error; - } + btr_rec_set_deleted_flag(rec, page_zip, val); trx = thr_get_trx(thr); if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, page_zip/* 21 bytes */, + row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr); } @@ -2407,11 +2341,7 @@ btr_cur_parse_del_mark_set_sec_rec( is only being recovered, and there cannot be a hash index to it. */ - if (!btr_rec_set_deleted_flag(rec, page_zip, val)) { - /* page_zip overflow should have been detected - before writing MLOG_COMP_REC_SEC_DELETE_MARK */ - ut_error; - } + btr_rec_set_deleted_flag(rec, page_zip, val); } return(ptr); @@ -2462,17 +2392,7 @@ btr_cur_del_mark_set_sec_rec( rw_lock_x_lock(&btr_search_latch); } - if (!btr_rec_set_deleted_flag(rec, page_zip, val)) { - /* Reorganize to try to get more modification log space. */ - btr_page_reorganize(buf_block_get_frame(block), - cursor->index, mtr); - /* TODO: search for rec, invalidate hash index */ - - if (!btr_rec_set_deleted_flag(rec, page_zip, val)) { - /* TODO: could we do anything else than crash? */ - ut_error; - } - } + btr_rec_set_deleted_flag(rec, page_zip, val); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -2581,7 +2501,6 @@ btr_cur_optimistic_delete( if (no_compress_needed) { page_zip_des_t* page_zip; - page_zip_des_t* page_zip_used; lock_update_delete(rec); @@ -2592,31 +2511,12 @@ btr_cur_optimistic_delete( page_zip = buf_block_get_page_zip( buf_block_align(btr_cur_get_page(cursor))); - if (UNIV_LIKELY(!page_zip) - || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { - /* If there is not enough space in the page - modification log, ignore the log and - try compressing the page afterwards. */ - page_zip_used = NULL; - } else { - page_zip_used = page_zip; - } - page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, offsets, - page_zip_used, mtr); + page_zip, mtr); ibuf_update_free_bits_low(cursor->index, page, max_ins_size, mtr); - - if (UNIV_LIKELY_NULL(page_zip) - && UNIV_UNLIKELY(!page_zip_used)) { - /* Reorganize the page to ensure that the - compression succeeds after deleting the record. 
*/ - btr_page_reorganize(page, cursor->index, mtr); - - /* TODO: invalidate hash index, reposition cursor */ - } } if (UNIV_LIKELY_NULL(heap)) { @@ -2656,7 +2556,6 @@ btr_cur_pessimistic_delete( { page_t* page; page_zip_des_t* page_zip; - page_zip_des_t* page_zip_used; dict_tree_t* tree; rec_t* rec; dtuple_t* node_ptr; @@ -2694,6 +2593,7 @@ btr_cur_pessimistic_delete( heap = mem_heap_create(1024); rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(buf_block_align(page)); offsets = rec_get_offsets(rec, cursor->index, NULL, ULINT_UNDEFINED, &heap); @@ -2705,7 +2605,7 @@ btr_cur_pessimistic_delete( ? !rec_get_node_ptr_flag(rec) : !rec_get_1byte_offs_flag(rec)) { btr_rec_free_externally_stored_fields(cursor->index, rec, - 0/*TODO*/, offsets, in_rollback, mtr); + offsets, page_zip, in_rollback, mtr); } if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) @@ -2723,14 +2623,6 @@ btr_cur_pessimistic_delete( goto return_after_reservations; } - page_zip = buf_block_get_page_zip(buf_block_align(page)); - if (UNIV_LIKELY(!page_zip) - || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { - page_zip_used = NULL; - } else { - page_zip_used = page_zip; - } - lock_update_delete(rec); level = btr_page_get_level(page, mtr); @@ -2746,13 +2638,7 @@ btr_cur_pessimistic_delete( non-leaf level, we must mark the new leftmost node pointer as the predefined minimum record */ - if (UNIV_LIKELY_NULL(page_zip_used) - && UNIV_UNLIKELY(!page_zip_available( - page_zip_used, 5 + 32))) { - page_zip_used = NULL; - } - - btr_set_min_rec_mark(next_rec, page_zip_used, mtr); + btr_set_min_rec_mark(next_rec, mtr); } else { /* Otherwise, if we delete the leftmost node pointer on a page, we have to change the father node pointer @@ -2774,18 +2660,10 @@ btr_cur_pessimistic_delete( btr_search_update_hash_on_delete(cursor); page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, - offsets, page_zip_used, mtr); + offsets, page_zip, mtr); ut_ad(btr_check_node_ptr(tree, page, mtr)); - if (UNIV_LIKELY_NULL(page_zip) && UNIV_UNLIKELY(!page_zip_used)) { - /* Reorganize the page to ensure that the - compression succeeds after deleting the record. */ - btr_page_reorganize(page, cursor->index, mtr); - - /* TODO: invalidate hash index, reposition cursor */ - } - *err = DB_SUCCESS; return_after_reservations: @@ -3209,9 +3087,10 @@ static void btr_cur_set_ownership_of_extern_field( /*==================================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ rec_t* rec, /* in/out: clustered index record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + dict_index_t* index, /* in: index of the page */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint i, /* in: field number */ ibool val, /* in: value to set */ @@ -3236,6 +3115,7 @@ btr_cur_set_ownership_of_extern_field( } if (UNIV_LIKELY(mtr != NULL)) { + /* TODO: log this differently for page_zip */ mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, MLOG_1BYTE, mtr); } else { @@ -3243,7 +3123,8 @@ btr_cur_set_ownership_of_extern_field( } if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, data + local_len + BTR_EXTERN_LEN, 1); + page_zip_write_blob_ptr( + page_zip, rec, index, offsets, i, mtr); } } @@ -3256,9 +3137,10 @@ to free the field. 
*/ void btr_cur_mark_extern_inherited_fields( /*=================================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ rec_t* rec, /* in/out: record in a clustered index */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - n_extern * 5 bytes available, or NULL */ + dict_index_t* index, /* in: index of the page */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ upd_t* update, /* in: update vector */ mtr_t* mtr) /* in: mtr, or NULL if not logged */ @@ -3287,8 +3169,8 @@ btr_cur_mark_extern_inherited_fields( } } - btr_cur_set_ownership_of_extern_field(rec, page_zip, - offsets, i, FALSE, mtr); + btr_cur_set_ownership_of_extern_field(page_zip, rec, + index, offsets, i, FALSE, mtr); updated: ; } @@ -3361,9 +3243,12 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr, /* in: mtr, or NULL if not logged */ - const ulint* offsets)/* in: array returned by rec_get_offsets() */ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ { ulint n; ulint i; @@ -3374,8 +3259,9 @@ btr_cur_unmark_extern_fields( for (i = 0; i < n; i++) { if (rec_offs_nth_extern(offsets, i)) { - btr_cur_set_ownership_of_extern_field(rec, 0/*TODO*/, - offsets, i, TRUE, mtr); + btr_cur_set_ownership_of_extern_field(page_zip, rec, + index, offsets, + i, TRUE, mtr); } } } @@ -3505,7 +3391,8 @@ btr_blob_get_next_page_no( /*********************************************************************** Stores the fields in big_rec_vec to the tablespace and puts pointers to -them in rec. The fields are stored on pages allocated from leaf node +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node file segment of the index tree. 
*/ ulint @@ -3515,9 +3402,6 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip, /* in/out: compressed page with - at least 12*big_rec_vec->n_fields - bytes available, or NULL */ const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ @@ -3525,8 +3409,7 @@ btr_store_big_rec_extern_fields( containing the latch to rec and to the tree */ { - byte* data; - ulint local_len; + byte* field_ref; ulint extern_len; ulint store_len; ulint page_no; @@ -3544,8 +3427,6 @@ btr_store_big_rec_extern_fields( MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - ut_ad(!page_zip - || page_zip_available(page_zip, 12 * big_rec_vec->n_fields)); ut_a(index->type & DICT_CLUSTERED); space_id = buf_frame_get_space_id(rec); @@ -3555,10 +3436,14 @@ btr_store_big_rec_extern_fields( for (i = 0; i < big_rec_vec->n_fields; i++) { - data = rec_get_nth_field(rec, offsets, + { + ulint local_len; + field_ref = rec_get_nth_field(rec, offsets, big_rec_vec->fields[i].field_no, &local_len); - ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); - local_len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + field_ref += local_len; + } extern_len = big_rec_vec->fields[i].len; ut_a(extern_len > 0); @@ -3574,7 +3459,7 @@ btr_store_big_rec_extern_fields( hint_page_no = prev_page_no + 1; } - /* TODO: do not compress BLOB pages */ + /* TODO: allocate compressed BLOB storage */ page = btr_page_alloc(index->tree, hint_page_no, FSP_NO_DIR, 0, &mtr); if (page == NULL) { @@ -3611,6 +3496,7 @@ btr_store_big_rec_extern_fields( store_len = extern_len; } + /* TODO: log these writes differently for page_zip */ mlog_write_string(page + FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE, big_rec_vec->fields[i].data @@ -3627,44 +3513,56 @@ btr_store_big_rec_extern_fields( extern_len -= store_len; rec_page = buf_page_get(space_id, - buf_frame_get_page_no(data), + buf_frame_get_page_no( + field_ref), RW_X_LATCH, &mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ - mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0, + mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, MLOG_4BYTES, &mtr); - mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, + mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, MLOG_4BYTES, &mtr); - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, - data + local_len + BTR_EXTERN_LEN, 8); - } if (prev_page_no == FIL_NULL) { - mlog_write_ulint(data + local_len - + BTR_EXTERN_SPACE_ID, + page_zip_des_t* page_zip; + + mlog_write_ulint(field_ref + + BTR_EXTERN_SPACE_ID, space_id, MLOG_4BYTES, &mtr); - mlog_write_ulint(data + local_len - + BTR_EXTERN_PAGE_NO, + mlog_write_ulint(field_ref + + BTR_EXTERN_PAGE_NO, page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(data + local_len - + BTR_EXTERN_OFFSET, + mlog_write_ulint(field_ref + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, MLOG_4BYTES, &mtr); + ut_ad(rec_offs_nth_extern(offsets, + big_rec_vec->fields[i].field_no)); +#if 0 /* TODO:remove */ /* Set the bit denoting that this field in rec is stored externally */ rec_set_nth_field_extern_bit(rec, index, big_rec_vec->fields[i].field_no, &mtr); +#endif + page_zip = buf_block_get_page_zip( + buf_block_align(rec)); + + if (UNIV_LIKELY_NULL(page_zip)) 
{ + page_zip_write_blob_ptr(page_zip, rec, + index, offsets, + big_rec_vec-> + fields[i].field_no, + &mtr); + } } prev_page_no = page_no; @@ -3678,7 +3576,7 @@ btr_store_big_rec_extern_fields( /*********************************************************************** Frees the space in an externally stored field to the file space -management if the field in data is owned the externally stored field, +management if the field in data is owned by the externally stored field, in a rollback we may have the additional condition that the field must not be inherited. */ @@ -3693,12 +3591,12 @@ btr_free_externally_stored_field( from purge where 'data' is located on an undo log page, not an index page) */ - byte* data, /* in/out: internally stored data - + reference to the externally - stored part */ - ulint local_len, /* in: length of data */ - page_zip_des_t* page_zip, /* in/out: compressed page with - at least 12 bytes available, or NULL */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip, /* in: compressed page whose + uncompressed part will be updated, + or NULL */ + ulint i, /* in: field number */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -3708,41 +3606,41 @@ btr_free_externally_stored_field( { page_t* page; page_t* rec_page; + byte* field_ref; ulint space_id; ulint page_no; ulint offset; ulint extern_len; ulint next_page_no; ulint part_len; + ulint local_len; mtr_t mtr; - ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data), + ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - ut_ad(!page_zip || page_zip_available(page_zip, 12)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + field_ref = rec_get_nth_field(rec, offsets, i, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; + field_ref += local_len; for (;;) { mtr_start(&mtr); - rec_page = buf_page_get(buf_frame_get_space_id(data), - buf_frame_get_page_no(data), RW_X_LATCH, &mtr); + rec_page = buf_page_get(buf_frame_get_space_id(rec), + buf_frame_get_page_no(rec), RW_X_LATCH, &mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ - space_id = mach_read_from_4(data + local_len - + BTR_EXTERN_SPACE_ID); + space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID); - page_no = mach_read_from_4(data + local_len - + BTR_EXTERN_PAGE_NO); + page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); - offset = mach_read_from_4(data + local_len - + BTR_EXTERN_OFFSET); - extern_len = mach_read_from_4(data + local_len - + BTR_EXTERN_LEN + 4); + offset = mach_read_from_4(field_ref + BTR_EXTERN_OFFSET); + extern_len = mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4); /* If extern len is 0, then there is no external storage data at all */ @@ -3754,7 +3652,7 @@ btr_free_externally_stored_field( return; } - if (mach_read_from_1(data + local_len + BTR_EXTERN_LEN) + if (mach_read_from_1(field_ref + BTR_EXTERN_LEN) & BTR_EXTERN_OWNER_FLAG) { /* This field does not own the externally stored field: do not free! 
*/ @@ -3765,7 +3663,7 @@ btr_free_externally_stored_field( } if (do_not_free_inherited - && mach_read_from_1(data + local_len + BTR_EXTERN_LEN) + && mach_read_from_1(field_ref + BTR_EXTERN_LEN) & BTR_EXTERN_INHERITED_FLAG) { /* Rollback and inherited field: do not free! */ @@ -3791,16 +3689,13 @@ btr_free_externally_stored_field( btr_page_free_low(index->tree, page, 0, &mtr); - mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO, + /* TODO: log these writes differently for page_zip */ + mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO, next_page_no, MLOG_4BYTES, &mtr); - mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, + mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, extern_len - part_len, MLOG_4BYTES, &mtr); - if (page_zip) { - page_zip_write(page_zip, - data + local_len + BTR_EXTERN_LEN, 8); - } if (next_page_no == FIL_NULL) { ut_a(extern_len - part_len == 0); } @@ -3809,6 +3704,11 @@ btr_free_externally_stored_field( ut_a(next_page_no == FIL_NULL); } + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_blob_ptr(page_zip, rec, index, offsets, + i, &mtr); + } + mtr_commit(&mtr); } } @@ -3822,10 +3722,9 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least n_extern*12 bytes available, - or NULL */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -3834,8 +3733,6 @@ btr_rec_free_externally_stored_fields( tree */ { ulint n_fields; - byte* data; - ulint len; ulint i; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -3849,10 +3746,9 @@ btr_rec_free_externally_stored_fields( for (i = 0; i < n_fields; i++) { if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, offsets, i, &len); - btr_free_externally_stored_field(index, data, len, - page_zip, - do_not_free_inherited, mtr); + btr_free_externally_stored_field(index, rec, offsets, + page_zip, i, + do_not_free_inherited, mtr); } } } @@ -3867,9 +3763,8 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least n_extern*12 bytes available, - or NULL */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a @@ -3880,8 +3775,6 @@ btr_rec_free_updated_extern_fields( { upd_field_t* ufield; ulint n_fields; - byte* data; - ulint len; ulint i; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -3897,10 +3790,8 @@ btr_rec_free_updated_extern_fields( if (rec_offs_nth_extern(offsets, ufield->field_no)) { - data = rec_get_nth_field(rec, offsets, - ufield->field_no, &len); - btr_free_externally_stored_field(index, data, len, - page_zip, + btr_free_externally_stored_field(index, rec, offsets, + page_zip, ufield->field_no, do_not_free_inherited, mtr); } } diff --git a/dict/dict0boot.c b/dict/dict0boot.c index 18a707a1b9301e3172209355075ab0f39271a35c..8e79f5a9dab07b8b76d31aa9b9f164f4c271a178 100644 --- a/dict/dict0boot.c +++ b/dict/dict0boot.c @@ -149,7 +149,8 @@ dict_hdr_create( 
/*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_TABLES_ID, FALSE, mtr); + DICT_HDR_SPACE, DICT_TABLES_ID, + srv_sys->dummy_ind1, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -159,7 +160,8 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, - DICT_TABLE_IDS_ID, FALSE, mtr); + DICT_TABLE_IDS_ID, + srv_sys->dummy_ind1, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -169,7 +171,8 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_COLUMNS_ID, FALSE, mtr); + DICT_HDR_SPACE, DICT_COLUMNS_ID, + srv_sys->dummy_ind1, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -179,7 +182,8 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_INDEXES_ID, FALSE, mtr); + DICT_HDR_SPACE, DICT_INDEXES_ID, + srv_sys->dummy_ind1, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -189,7 +193,8 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_FIELDS_ID, FALSE, mtr); + DICT_HDR_SPACE, DICT_FIELDS_ID, + srv_sys->dummy_ind1, mtr); if (root_page_no == FIL_NULL) { return(FALSE); diff --git a/dict/dict0crea.c b/dict/dict0crea.c index 824ce6f09179637c2833dcfd17f9b03ae7e776d1..4b446c1273e62cc27ba01c14af5063de9fc085c0 100644 --- a/dict/dict0crea.c +++ b/dict/dict0crea.c @@ -634,7 +634,7 @@ dict_create_index_tree_step( btr_pcur_move_to_next_user_rec(&pcur, &mtr); node->page_no = btr_create(index->type, index->space, index->id, - table->comp, &mtr); + index, &mtr); /* printf("Created a new index tree in space %lu root page %lu\n", index->space, index->page_no); */ @@ -823,7 +823,7 @@ dict_truncate_index_tree( } } - root_page_no = btr_create(type, space, index_id, comp, mtr); + root_page_no = btr_create(type, space, index_id, index, mtr); if (index) { index->tree->page = root_page_no; } else { diff --git a/fsp/fsp0fsp.c b/fsp/fsp0fsp.c index ad4228f6797007b74da40ff4bf7bce7e008cf02f..8b920be6d5ea0f0020972adeef335bab197ae9bb 100644 --- a/fsp/fsp0fsp.c +++ b/fsp/fsp0fsp.c @@ -910,7 +910,8 @@ fsp_header_init( if (space == 0) { fsp_fill_free_list(FALSE, space, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, - ut_dulint_add(DICT_IBUF_ID_MIN, space), FALSE, mtr); + ut_dulint_add(DICT_IBUF_ID_MIN, space), + srv_sys->dummy_ind1, mtr); } else { fsp_fill_free_list(TRUE, space, header, mtr); } diff --git a/include/btr0btr.h b/include/btr0btr.h index 62ef2c8295ce3e8f78d6eb69dc3420a7b88068b1..0674f97bff7de909fb110d0de5ed9c2adf66eb60 100644 --- a/include/btr0btr.h +++ b/include/btr0btr.h @@ -153,13 +153,13 @@ Creates the root node for a new index tree. 
*/ ulint btr_create( /*=======*/ - /* out: page number of the created root, FIL_NULL if - did not succeed */ - ulint type, /* in: type of the index */ - ulint space, /* in: space where created */ - dulint index_id,/* in: index id */ - ulint comp, /* in: nonzero=compact page format */ - mtr_t* mtr); /* in: mini-transaction handle */ + /* out: page number of the created root, + FIL_NULL if did not succeed */ + ulint type, /* in: type of the index */ + ulint space, /* in: space where created */ + dulint index_id,/* in: index id */ + dict_index_t* index, /* in: index */ + mtr_t* mtr); /* in: mini-transaction handle */ /**************************************************************** Frees a B-tree except the root page, which MUST be freed after this by calling btr_free_root. */ @@ -199,12 +199,14 @@ btr_root_raise_and_insert( /***************************************************************** Reorganizes an index page. */ -void +ibool btr_page_reorganize( /*================*/ + /* out: TRUE on success, FALSE on failure */ page_t* page, /* in: page to be reorganized */ dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull, warn_unused_result)); /***************************************************************** Decides if the page should be split at the convergence point of inserts converging to left. */ @@ -265,10 +267,8 @@ Sets a record as the predefined minimum record. */ void btr_set_min_rec_mark( /*=================*/ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in/out: record */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes on the upper level the node pointer to a page. */ diff --git a/include/btr0btr.ic b/include/btr0btr.ic index 06ada1b0cea7cb3bab11f0b2c84899aa00ccb633..46fe49ac21d517b6b6f6ebca5e582df23e2cecb8 100644 --- a/include/btr0btr.ic +++ b/include/btr0btr.ic @@ -117,6 +117,7 @@ btr_page_set_level( ut_ad(page && mtr); ut_ad(level <= BTR_MAX_NODE_LEVEL); + /* TODO: log this differently for page_zip */ mlog_write_ulint(page + PAGE_HEADER + PAGE_LEVEL, level, MLOG_2BYTES, mtr); @@ -159,6 +160,7 @@ btr_page_set_next( { ut_ad(page && mtr); + /* TODO: log this differently for page_zip */ mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr); if (UNIV_LIKELY_NULL(page_zip)) { @@ -195,6 +197,7 @@ btr_page_set_prev( { ut_ad(page && mtr); + /* TODO: log this differently for page_zip */ mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr); if (UNIV_LIKELY_NULL(page_zip)) { diff --git a/include/btr0cur.h b/include/btr0cur.h index 014f99b6d1dacffae5f64f26309d97cbc45b1f8e..35cc6f6d3b1b19ed4b0e0c03f2ef4ba3a7908f11 100644 --- a/include/btr0cur.h +++ b/include/btr0cur.h @@ -214,7 +214,9 @@ btr_cur_optimistic_update( /*======================*/ /* out: DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit, DB_UNDERFLOW - if the page would become too empty */ + if the page would become too empty, or + DB_ZIP_OVERFLOW if there is not enough + space left on the compressed page */ ulint flags, /* in: undo logging and locking flags */ btr_cur_t* cursor, /* in: cursor on the record to update; cursor stays valid and positioned on the @@ -409,12 +411,13 @@ to free the field. 
*/ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - n_extern * 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ upd_t* update, /* in: update vector */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr); /* in: mtr, or NULL if not logged */ /*********************************************************************** The complement of the previous function: in an update entry may inherit some externally stored fields from a record. We must mark them as inherited @@ -441,7 +444,8 @@ btr_cur_unmark_dtuple_extern_fields( ulint n_ext_vec); /* in: number of elements in ext_vec */ /*********************************************************************** Stores the fields in big_rec_vec to the tablespace and puts pointers to -them in rec. The fields are stored on pages allocated from leaf node +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node file segment of the index tree. */ ulint @@ -451,9 +455,6 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ - page_zip_des_t* page_zip, /* in/out: compressed page with - at least 12*big_rec_vec->n_fields - bytes available, or NULL */ const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ @@ -476,12 +477,12 @@ btr_free_externally_stored_field( from purge where 'data' is located on an undo log page, not an index page) */ - byte* data, /* in: internally stored data - + reference to the externally - stored part */ - ulint local_len, /* in: length of data */ - page_zip_des_t* page_zip, /* in/out: compressed page with - at least 12 bytes available, or NULL */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip, /* in: compressed page whose + uncompressed part will be updated, + or NULL */ + ulint i, /* in: field number */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -497,10 +498,9 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least n_extern*12 bytes available, - or NULL */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -677,7 +677,7 @@ stored part. */ The 2 highest bits are reserved to the flags below. 
*/ /*--------------------------------------*/ -#define BTR_EXTERN_FIELD_REF_SIZE 20 +/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */ /* The highest bit of BTR_EXTERN_LEN (i.e., the highest bit of the byte at lowest address) is set to 1 if this field does not 'own' the externally diff --git a/include/btr0types.h b/include/btr0types.h index 03a61480e2e56894faa4dcaa883998afbb0c566e..b8b1b37e5fceb4b93d3fc8c74a43e5e2ee415f8b 100644 --- a/include/btr0types.h +++ b/include/btr0types.h @@ -18,4 +18,9 @@ typedef struct btr_pcur_struct btr_pcur_t; typedef struct btr_cur_struct btr_cur_t; typedef struct btr_search_struct btr_search_t; +/* The size of a reference to data stored on a different page. +The reference is stored at the end of the prefix of the field +in the index record. */ +#define BTR_EXTERN_FIELD_REF_SIZE 20 + #endif diff --git a/include/buf0buf.h b/include/buf0buf.h index af46d4ab308acc6d0453d74f2c57f832f1c4ae96..03273753b7e5c9e9be3d03730e3ec6fafeb1d3fe 100644 --- a/include/buf0buf.h +++ b/include/buf0buf.h @@ -863,9 +863,11 @@ struct buf_block_struct{ ulint curr_side; /* BTR_SEARCH_LEFT_SIDE or BTR_SEARCH_RIGHT_SIDE in hash indexing */ - page_zip_des_t page_zip; /* compressed page info */ dict_index_t* index; /* Index for which the adaptive hash index has been created. */ + /* TODO: how to protect this? */ + page_zip_des_t page_zip; /* compressed page info */ + /* 6. Debug fields */ #ifdef UNIV_SYNC_DEBUG rw_lock_t debug_latch; /* in the debug version, each thread diff --git a/include/db0err.h b/include/db0err.h index de5ac44e73f4a3e332439c898fe509086f54fdf0..1d515ef5210df8cacacee4b8c297d63d5210d6b1 100644 --- a/include/db0err.h +++ b/include/db0err.h @@ -63,6 +63,7 @@ Created 5/24/1996 Heikki Tuuri #define DB_OVERFLOW 1001 #define DB_UNDERFLOW 1002 #define DB_STRONG_FAIL 1003 +#define DB_ZIP_OVERFLOW 1004 #define DB_RECORD_NOT_FOUND 1500 #define DB_END_OF_INDEX 1501 diff --git a/include/mtr0mtr.h b/include/mtr0mtr.h index 048270085872faaed7e80a810edc9cc4f624752f..62f596c46353440940827e50b861b1952f7925ba 100644 --- a/include/mtr0mtr.h +++ b/include/mtr0mtr.h @@ -129,11 +129,23 @@ flag value must give the length also! 
*/ /* copy compact record list end to a new created index page */ #define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ -#define MLOG_COMP_DECOMPRESS ((byte)47) /* decompress a page +#define MLOG_ZIP_WRITE_NODE_PTR ((byte)47) /* write the node pointer of + a record on a compressed + non-leaf B-tree page */ +#define MLOG_ZIP_WRITE_TRX_ID ((byte)48) /* write the trx_id of + a record on a compressed + leaf B-tree page */ +#define MLOG_ZIP_WRITE_ROLL_PTR ((byte)49) /* write the roll_ptr of + a record on a compressed + leaf B-tree page */ +#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)50) /* write the BLOB pointer + of an externally stored column + on a compressed page */ +#define MLOG_ZIP_COMPRESS ((byte)51) /* compress a page */ +#define MLOG_ZIP_DECOMPRESS ((byte)52) /* decompress a page to undo a compressed page overflow */ - -#define MLOG_BIGGEST_TYPE ((byte)47) /* biggest value (used in +#define MLOG_BIGGEST_TYPE ((byte)52) /* biggest value (used in asserts) */ /******************************************************************* diff --git a/include/page0cur.h b/include/page0cur.h index 212f7e625809d7f6e447da76fe922c50d61a979e..298642f69c9dee3502362f90a8ef6bfa16c9f08e 100644 --- a/include/page0cur.h +++ b/include/page0cur.h @@ -130,8 +130,7 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 25 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ @@ -146,8 +145,7 @@ page_cur_rec_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 25 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ rec_t* rec, /* in: record to insert */ dict_index_t* index, /* in: record descriptor */ ulint* offsets,/* in: rec_get_offsets(rec, index) */ @@ -164,8 +162,7 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 37 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ dict_index_t* index, /* in: record descriptor */ rec_t* rec, /* in: pointer to a physical record or NULL */ @@ -192,8 +189,7 @@ page_cur_delete_rec( page_cur_t* cursor, /* in/out: a page cursor */ dict_index_t* index, /* in: record descriptor */ const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 32 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed, or NULL */ mtr_t* mtr); /* in: mini-transaction handle */ /******************************************************************** Searches the right position for a page cursor. 
*/ @@ -253,8 +249,7 @@ page_cur_parse_insert_rec( byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ page_t* page, /* in/out: page or NULL */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 37 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /************************************************************** Parses a log record of copying a record list end to a new created page. */ @@ -280,8 +275,7 @@ page_cur_parse_delete_rec( byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ page_t* page, /* in/out: page or NULL */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 32 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /* Index page cursor */ diff --git a/include/page0cur.ic b/include/page0cur.ic index aac873d24d060160789aa1f81aff476c0ad9c30f..12e54a0726895e165322a3b69f587d97f1e536d7 100644 --- a/include/page0cur.ic +++ b/include/page0cur.ic @@ -181,8 +181,7 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 25 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ @@ -202,8 +201,7 @@ page_cur_rec_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 25 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ rec_t* rec, /* in: record to insert */ dict_index_t* index, /* in: record descriptor */ ulint* offsets,/* in: rec_get_offsets(rec, index) */ diff --git a/include/page0page.h b/include/page0page.h index 4a1c541c2b705eeac6b6f5436def09de24bfc68e..428005473249c6200f7d3df3df63c390937557d8 100644 --- a/include/page0page.h +++ b/include/page0page.h @@ -295,7 +295,10 @@ page_dir_set_n_heap( /*================*/ page_t* page, /* in/out: index page */ page_zip_des_t* page_zip,/* in/out: compressed page whose - uncompressed part will be updated, or NULL */ + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ ulint n_heap);/* in: number of records */ /***************************************************************** Gets the number of dir slots in directory. */ @@ -347,8 +350,6 @@ void page_dir_slot_set_rec( /*==================*/ page_dir_slot_t* slot, /* in: directory slot */ - page_zip_des_t* page_zip,/* in/out: compressed page whose - uncompressed part will be updated, or NULL */ rec_t* rec); /* in: record on the page */ /******************************************************************* Gets the number of records owned by a directory slot. 
*/ @@ -365,8 +366,7 @@ void page_dir_slot_set_n_owned( /*======================*/ page_dir_slot_t*slot, /* in/out: directory slot */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint n); /* in: number of records owned by the slot */ /**************************************************************** Calculates the space reserved for directory slots of a given @@ -404,6 +404,15 @@ page_rec_is_comp( /* out: nonzero if in compact format */ const rec_t* rec); /* in: record */ /**************************************************************** +Determine whether the page is a B-tree leaf. */ +UNIV_INLINE +ibool +page_is_leaf( +/*=========*/ + /* out: TRUE if the page is a B-tree leaf */ + const page_t* page) /* in: page */ + __attribute__((nonnull, pure)); +/**************************************************************** Gets the pointer to the next record on the page. */ UNIV_INLINE rec_t* @@ -418,12 +427,10 @@ UNIV_INLINE void page_rec_set_next( /*==============*/ - rec_t* rec, /* in: pointer to record, - must not be page supremum */ - rec_t* next, /* in: pointer to next record, - must not be page infimum */ - page_zip_des_t* page_zip);/* in/out: compressed page with at least - 6 bytes available, or NULL */ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next); /* in: pointer to next record, + must not be page infimum */ /**************************************************************** Gets the pointer to the previous record. */ UNIV_INLINE @@ -562,9 +569,11 @@ page_mem_alloc( page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint need, /* in: number of bytes needed */ dict_index_t* index, /* in: record descriptor */ - ulint* heap_no);/* out: this contains the heap number + ulint* heap_no,/* out: this contains the heap number of the allocated record if allocation succeeds */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL + if page_zip == NULL */ /**************************************************************** Puts a record to free list. */ UNIV_INLINE @@ -575,7 +584,10 @@ page_mem_free( page_zip_des_t* page_zip,/* in/out: compressed page with at least 6 bytes available, or NULL */ rec_t* rec, /* in: pointer to the (origin of) record */ - const ulint* offsets);/* in: array returned by rec_get_offsets() */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL + if page_zip==NULL */ /************************************************************** The index page creation function. */ @@ -587,7 +599,7 @@ page_create( created */ page_zip_des_t* page_zip, /* in/out: compressed page, or NULL */ mtr_t* mtr, /* in: mini-transaction handle */ - ulint comp); /* in: nonzero=compact page format */ + dict_index_t* index); /* in: the index of the page */ /***************************************************************** Differs from page_copy_rec_list_end, because this function does not touch the lock table and max trx id on page or compress the page. */ @@ -622,7 +634,9 @@ The records are copied to the end of the record list on new_page. 
*/ ibool page_copy_rec_list_start( /*=====================*/ - /* out: TRUE on success */ + /* out: TRUE on success; FALSE on + compression failure (new_page will + be decompressed from new_page_zip) */ page_t* new_page, /* in/out: index page to copy to */ page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ @@ -685,8 +699,8 @@ void page_dir_split_slot( /*================*/ page_t* page, /* in: index page */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 12 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be written, or NULL */ ulint slot_no)/* in: the directory slot */ __attribute__((nonnull(1))); /***************************************************************** @@ -699,8 +713,7 @@ void page_dir_balance_slot( /*==================*/ page_t* page, /* in/out: index page */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 15 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint slot_no)/* in: the directory slot */ __attribute__((nonnull(1))); /************************************************************** @@ -725,12 +738,12 @@ Parses a redo log record of creating a page. */ byte* page_parse_create( /*==============*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - ulint comp, /* in: nonzero=compact page format */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /**************************************************************** Prints record contents including the data relevant only in the index page context. */ diff --git a/include/page0page.ic b/include/page0page.ic index 39b792a45ef9ce18cb9e6f22787500bc2bc46660..0f8064fe384e76e4a1be7c975d33538cdba430b6 100644 --- a/include/page0page.ic +++ b/include/page0page.ic @@ -159,6 +159,7 @@ page_header_reset_last_insert( { ut_ad(page && mtr); + /* TODO: log this differently for page_zip */ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0, MLOG_2BYTES, mtr); if (UNIV_LIKELY_NULL(page_zip)) { @@ -206,6 +207,18 @@ page_rec_is_comp( return(page_is_comp(ut_align_down((rec_t*) rec, UNIV_PAGE_SIZE))); } +/**************************************************************** +Determine whether the page is a B-tree leaf. */ +UNIV_INLINE +ibool +page_is_leaf( +/*=========*/ + /* out: TRUE if the page is a B-tree leaf */ + const page_t* page) /* in: page */ +{ + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL))); +} + /**************************************************************** Gets the first record on the page. */ UNIV_INLINE @@ -433,17 +446,6 @@ page_dir_set_n_slots( uncompressed part will be updated, or NULL */ ulint n_slots)/* in: number of slots */ { -#ifdef UNIV_DEBUG - if (UNIV_LIKELY_NULL(page_zip)) { - /* Ensure that the modification log will not be overwritten. 
*/ - ulint n_slots_old = page_dir_get_n_slots(page); - if (n_slots > n_slots_old) { - ut_ad(page_zip_available_noninline(page_zip, - (n_slots - n_slots_old) - * PAGE_DIR_SLOT_SIZE)); - } - } -#endif /* UNIV_DEBUG */ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots); } @@ -467,7 +469,10 @@ page_dir_set_n_heap( /*================*/ page_t* page, /* in/out: index page */ page_zip_des_t* page_zip,/* in/out: compressed page whose - uncompressed part will be updated, or NULL */ + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ ulint n_heap) /* in: number of records */ { ut_ad(n_heap < 0x8000); @@ -532,18 +537,11 @@ void page_dir_slot_set_rec( /*==================*/ page_dir_slot_t* slot, /* in: directory slot */ - page_zip_des_t* page_zip,/* in/out: compressed page whose - uncompressed part will be updated, or NULL */ rec_t* rec) /* in: record on the page */ { ut_ad(page_rec_check(rec)); mach_write_to_2(slot, ut_align_offset(rec, UNIV_PAGE_SIZE)); -#if 0 /* TODO */ - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write_trailer(page_zip, slot, 2); - } -#endif } /******************************************************************* @@ -570,8 +568,7 @@ void page_dir_slot_set_n_owned( /*======================*/ page_dir_slot_t*slot, /* in/out: directory slot */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint n) /* in: number of records owned by the slot */ { rec_t* rec = page_dir_slot_get_rec(slot); @@ -643,12 +640,10 @@ UNIV_INLINE void page_rec_set_next( /*==============*/ - rec_t* rec, /* in: pointer to record, - must not be page supremum */ - rec_t* next, /* in: pointer to next record, - must not be page infimum */ - page_zip_des_t* page_zip) /* in/out: compressed page with - at least 6 bytes available, or NULL */ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next) /* in: pointer to next record, + must not be page infimum */ { ulint offs; @@ -666,10 +661,9 @@ page_rec_set_next( } if (page_rec_is_comp(rec)) { - rec_set_next_offs_new(rec, page_zip, offs); + rec_set_next_offs_new(rec, offs); } else { rec_set_next_offs_old(rec, offs); - ut_ad(!page_zip); } } @@ -880,31 +874,38 @@ page_mem_free( page_zip_des_t* page_zip,/* in/out: compressed page with at least 6 bytes available, or NULL */ rec_t* rec, /* in: pointer to the (origin of) record */ - const ulint* offsets)/* in: array returned by rec_get_offsets() */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL + if page_zip==NULL */ { rec_t* free; ulint garbage; - ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); free = page_header_get_ptr(page, PAGE_FREE); - page_rec_set_next(rec, free, page_zip); + page_rec_set_next(rec, free); page_header_set_ptr(page, page_zip, PAGE_FREE, rec); - if (rec_offs_comp(offsets)/* TODO: UNIV_LIKELY_NULL(page_zip) */) { + if (UNIV_LIKELY_NULL(page_zip)) { ut_ad(rec_offs_comp(offsets)); /* The compression algorithm expects info_bits and n_owned to be 0 for deleted records. */ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ - + + /* Update the dense page directory. 
*/ + page_zip_dir_delete(page_zip, rec, free); + /* Clear the data bytes of the deleted record in order - to improve the compression ratio of the page. The extra - bytes of the record cannot be cleared, because + to improve the compression ratio of the page. The fixed extra + bytes of the record, which will be omitted from the + stream compression algorithm, cannot be cleared, because page_mem_alloc() needs them in order to determine the size of the deleted record. */ - memset(rec, 0, rec_offs_data_size(offsets)); + page_zip_clear_rec(page_zip, rec, index, offsets, mtr); } garbage = page_header_get_field(page, PAGE_GARBAGE); diff --git a/include/page0types.h b/include/page0types.h index 142b3d5f720e87e37f5a886e4986454d06cdbbea..641ea73f9d25d039e45e634914b012828e5bffb1 100644 --- a/include/page0types.h +++ b/include/page0types.h @@ -10,6 +10,8 @@ Created 2/2/1994 Heikki Tuuri #define page0types_h #include "univ.i" +#include "dict0types.h" +#include "mtr0types.h" /* Type of the index page */ /* The following define eliminates a name collision on HP-UX */ @@ -30,6 +32,8 @@ struct page_zip_des_struct { page_zip_t* data; /* compressed page data */ ulint size; /* total size of compressed page */ + ulint n_blobs; /* number of externally stored + columns */ ulint m_start; /* start offset of modification log */ ulint m_end; /* end offset of modification log */ }; @@ -41,11 +45,27 @@ the uncompressed page. */ void page_zip_write( /*===========*/ - page_zip_des_t* page_zip,/* out: compressed page */ - const byte* str, /* in: address on the uncompressed page */ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record whose data is being written */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + lint offset, /* in: start address of the block, + relative to rec */ ulint length) /* in: length of the data */ __attribute__((nonnull)); +/************************************************************************** +Clear a record on the uncompressed and compressed page, if possible. */ + +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to clear */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction */ + __attribute__((nonnull)); + /************************************************************************** Write data to the uncompressed header portion of a page. The data must already have been written to the uncompressed page. */ @@ -58,6 +78,40 @@ page_zip_write_header( ulint length) /* in: length of the data */ __attribute__((nonnull)); +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ + +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. 
*/ + +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Shift the dense page directory when a record is deleted. */ + +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: deleted record */ + const byte* free) /* in: previous start of the free list */ + __attribute__((nonnull)); #ifdef UNIV_DEBUG /************************************************************************** @@ -69,7 +123,11 @@ page_zip_available_noninline( /* out: TRUE if enough space is available */ const page_zip_des_t* page_zip,/* in: compressed page */ - ulint size) + ulint length, /* in: sum of length in + page_zip_write() calls */ + ulint n_write,/* in: number of page_zip_write() */ + ulint n_heap) /* in: number of records that + will be allocated from the heap */ __attribute__((warn_unused_result, nonnull, pure)); #endif /* UNIV_DEBUG */ diff --git a/include/page0zip.h b/include/page0zip.h index 1088421186d5c4c3967fcd1fb528ddaf6f4bb7a2..c700473661d1a5b44ca64d4dc24cc9930d853083 100644 --- a/include/page0zip.h +++ b/include/page0zip.h @@ -16,6 +16,8 @@ Created June 2005 by Marko Makela #include "mtr0types.h" #include "page0types.h" +#include "dict0types.h" +#include "ut0byte.h" /************************************************************************** Initialize a compressed page descriptor. */ @@ -34,8 +36,13 @@ page_zip_compress( /*==============*/ /* out: TRUE on success, FALSE on failure; page_zip will be left intact on failure. */ - page_zip_des_t* page_zip,/* in: size; out: compressed page */ - const page_t* page); /* in: uncompressed page */ + page_zip_des_t* page_zip,/* in: size; out: data, n_blobs, + m_start, m_end */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((nonnull(1,2,3))); /************************************************************************** Decompress a page. This function should tolerate errors on the compressed @@ -46,7 +53,8 @@ ibool page_zip_decompress( /*================*/ /* out: TRUE on success, FALSE on failure */ - page_zip_des_t* page_zip,/* in: data, size; out: m_start, m_end */ + page_zip_des_t* page_zip,/* in: data, size; + out: m_start, m_end, n_blobs */ page_t* page, /* out: uncompressed page, may be trashed */ mtr_t* mtr) /* in: mini-transaction handle, or NULL if no logging is needed */ @@ -72,11 +80,53 @@ ibool page_zip_validate( /*==============*/ const page_zip_des_t* page_zip,/* in: compressed page */ - const page_t* page); /* in: uncompressed page */ + const page_t* page) /* in: uncompressed page */ + __attribute__((nonnull)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ /***************************************************************** -Gets the size of the compressed page trailer (the dense page directory). */ +Gets the number of records that have been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. 
*/ +UNIV_INLINE +ulint +page_zip_get_n_relocated( +/*=====================*/ + /* out: number of records + that have been relocated */ + const page_zip_des_t* page_zip) /* in: compressed page */ + __attribute__((pure)); + +/***************************************************************** +Sets the number of records that have been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. */ +UNIV_INLINE +void +page_zip_set_n_relocated( +/*=====================*/ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint n_relocated) /* in: number of records + that have been relocated */ + __attribute__((nonnull)); + +/***************************************************************** +Gets original offset of a record that has been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. */ +UNIV_INLINE +ulint +page_zip_get_relocated( +/*===================*/ + /* out: original offset + of the record */ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint i) /* in: ith record */ + __attribute__((pure)); + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list) and n_relocated. */ UNIV_INLINE ulint page_zip_dir_size( @@ -86,11 +136,45 @@ page_zip_dir_size( const page_zip_des_t* page_zip) /* in: compressed page */ __attribute__((pure)); /***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list and n_relocated). */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + /* out: length of dense page + directory, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ + __attribute__((pure)); + +/***************************************************************** +Find the slot of the given non-free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ + __attribute__((pure)); +/***************************************************************** +Find the slot of the given free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ + __attribute__((pure)); +/***************************************************************** Read a given slot in the dense page directory. */ UNIV_INLINE ulint page_zip_dir_get( -/*==============*/ +/*=============*/ /* out: record offset on the uncompressed page, possibly ORed with @@ -105,32 +189,12 @@ Write a given slot in the dense page directory. */ UNIV_INLINE void page_zip_dir_set( -/*==============*/ +/*=============*/ page_zip_des_t* page_zip, /* in: compressed page */ ulint slot, /* in: slot (0=first user record) */ ulint offs); /* in: offset, possibly ORed with PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */ -/************************************************************************** -Determine the encoded length of an integer in the modification log. 
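As a usage sketch only (the accessors are declared above; the loop itself is not from the patch), the relocation bookkeeping can be walked like this, assuming page_zip points to a valid compressed page:

        ulint   i;
        ulint   n = page_zip_get_n_relocated(page_zip);

        for (i = 0; i < n; i++) {
                /* Original offset of the i-th record that was
                allocated from the free list after compression. */
                ulint   offs = page_zip_get_relocated(page_zip, i);
                /* ... use offs to locate the record's old position ... */
        }
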
*/ -UNIV_INLINE -ulint -page_zip_ulint_size( -/*================*/ - /* out: length of the integer, in bytes */ - ulint num) /* in: the integer */ - __attribute__((const)); - -/************************************************************************** -Determine the size of a modification log entry. */ -UNIV_INLINE -ulint -page_zip_entry_size( -/*================*/ - /* out: length of the log entry, in bytes */ - ulint pos, /* in: offset of the uncompressed page */ - ulint length) /* in: length of the data */ - __attribute__((const)); /************************************************************************** Ensure that enough space is available in the modification log. @@ -144,51 +208,162 @@ page_zip_alloc( will only be modified if compression is needed and successful */ const page_t* page, /* in: uncompressed page */ - ulint size) /* in: size of modification log entries */ - __attribute__((nonnull)); + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr, /* in: mini-transaction handle, + or NULL if no logging is desired */ + ulint length, /* in: combined size of the record */ + ulint create) /* in: nonzero=add the record to the heap */ + __attribute__((warn_unused_result, nonnull(1,2,3))); /************************************************************************** -Determine if enough space is available in the modification log. */ +Determine if enough space is available for a page_zip_write_rec() call +in the modification log. */ UNIV_INLINE ibool page_zip_available( /*===============*/ - /* out: TRUE if enough space - is available */ + /* out: TRUE if page_zip_write_rec() + will succeed */ const page_zip_des_t* page_zip,/* in: compressed page */ - ulint size) /* in: requested size of - modification log entries */ + ulint length, /* in: combined size of the record */ + ulint is_leaf,/* in: nonzero=leaf node, + zero=node pointer page */ + ulint create) /* in: nonzero=add the record to + the heap */ __attribute__((warn_unused_result, nonnull, pure)); -#ifdef UNIV_DEBUG /************************************************************************** -Determine if enough space is available in the modification log. */ +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. */ -ibool -page_zip_available_noninline( -/*=========================*/ - /* out: TRUE if enough space - is available */ - const page_zip_des_t* page_zip,/* in: compressed page */ - ulint size) - __attribute__((warn_unused_result, nonnull, pure)); -#endif /* UNIV_DEBUG */ +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record being written */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ + __attribute__((nonnull)); /************************************************************************** -Write data to the compressed portion of a page. The data must already -have been written to the uncompressed page. */ +Write the BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
*/ void -page_zip_write( -/*===========*/ +page_zip_write_blob_ptr( +/*====================*/ page_zip_des_t* page_zip,/* in/out: compressed page */ - const byte* str, /* in: address on the uncompressed page */ - ulint length) /* in: length of the data */ - __attribute__((nonnull, deprecated)); + const byte* rec, /* in/out: record whose data is being + written */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint n, /* in: column index */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((nonnull(1,2,3,4))); + +/************************************************************************** +Write the node pointer of a record on a non-leaf compressed page. */ + +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + ulint ptr, /* in: node pointer */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Write the trx_id of a record on a B-tree leaf node page. */ + +void +page_zip_write_trx_id( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + dulint trx_id, /* in: transaction identifier */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Write the roll_ptr of a record on a B-tree leaf node page. */ + +void +page_zip_write_roll_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + dulint roll_ptr,/* in: roll_ptr */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Clear a record on the uncompressed and compressed page, if possible. */ + +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record to clear */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction */ + __attribute__((nonnull)); + +/************************************************************************** +Populate the dense page directory on the compressed page +from the sparse directory on the uncompressed row_format=compact page. */ +void +page_zip_dir_rewrite( +/*=================*/ + page_zip_des_t* page_zip,/* out: dense directory on compressed page */ + const page_t* page) /* in: uncompressed page */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ + +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. 
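For illustration, a hedged caller fragment (not from the patch) for the node-pointer writer declared above; rec, offsets, child_page_no and mtr are assumed to be set up by the caller, and the size argument follows the declaration's "data size of rec":

        if (UNIV_LIKELY_NULL(page_zip)) {
                /* Update the node pointer in the compressed page;
                with mtr != NULL this is presumably logged as the
                MLOG_ZIP_WRITE_NODE_PTR record type added above. */
                page_zip_write_node_ptr(page_zip, rec,
                                        rec_offs_data_size(offsets),
                                        child_page_no, mtr);
        }
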
The n_owned field +must already have been written on the uncompressed page. */ + +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Shift the dense page directory when a record is deleted. */ + +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: deleted record */ + const byte* free) /* in: previous start of the free list */ + __attribute__((nonnull)); /************************************************************************** Write data to the uncompressed header portion of a page. The data must -already have been written to the uncompressed page. */ +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_low(). */ UNIV_INLINE void page_zip_write_header( diff --git a/include/page0zip.ic b/include/page0zip.ic index 484c029ace2140818c4059c5d5da7652ea7575d5..299cac647dae0da4edcb3d5aa1266f3e851f004d 100644 --- a/include/page0zip.ic +++ b/include/page0zip.ic @@ -22,11 +22,12 @@ of the compressed page. At the end of the compressed page, there is a dense page directory pointing to every user record contained on the page, including deleted -records on the free list. The dense directory is indexed by the -record heap number. The infimum and supremum records are excluded. -The two most significant bits of the entries are allocated for the -delete-mark and an n_owned flag indicating the last record in a chain -of records pointed to from the sparse page directory on the +records on the free list. The dense directory is indexed in the +collation order, i.e., in the order in which the record list is +linked on the uncompressed page. The infimum and supremum records are +excluded. The two most significant bits of the entries are allocated +for the delete-mark and an n_owned flag indicating the last record in +a chain of records pointed to from the sparse page directory on the uncompressed page. The data between PAGE_ZIP_START and the last page directory entry will @@ -36,20 +37,50 @@ REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered from the dense page directory stored at the end of the compressed page. +The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and +roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of +externally stored columns are stored separately, in ascending order of +heap_no and column index, starting backwards from the dense page +directory. + The compressed data stream may be followed by a modification log covering the compressed portion of the page, as follows. MODIFICATION LOG ENTRY FORMAT -- length (1..2 bytes), not zero -- offset - PAGE_ZIP_START (1..2 bytes) -- data bytes - -The length and the offset are stored in a variable-length format: -- 0xxxxxxxx : 0..127 -- 10xxxxxxx xxxxxxxx: 0..16383 -- 11xxxxxxx xxxxxxxx: reserved - -The end of the modification log is marked by length=0. */ +- write record: + - heap_no-1 (1..2 bytes) + - extra bytes backwards + - data bytes + +The integer values are stored in a variable-length format: +- 0xxxxxxx: 0..127 +- 1xxxxxxx xxxxxxxx: 0..32767 + +The end of the modification log is marked by a 0 byte. 
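The 1..2 byte integer format described above is compact enough to sketch standalone; the byte order of the two-byte form (high bits carried in the flagged first byte) is an assumption, as the comment only gives the bit patterns:

        #include <assert.h>
        #include <stdio.h>

        /* Encode num (0..32767) into buf using the format above;
        returns the number of bytes written. */
        static int
        zip_ulint_put(unsigned char* buf, unsigned num)
        {
                assert(num < 32768);

                if (num < 128) {                /* 0xxxxxxx */
                        buf[0] = (unsigned char) num;
                        return(1);
                }

                buf[0] = (unsigned char) (0x80 | (num >> 8));
                buf[1] = (unsigned char) (num & 0xff);
                return(2);
        }

        /* Decode an integer written by zip_ulint_put(); returns the
        number of bytes consumed and stores the value in *num. */
        static int
        zip_ulint_get(const unsigned char* buf, unsigned* num)
        {
                if (!(buf[0] & 0x80)) {
                        *num = buf[0];
                        return(1);
                }

                *num = ((buf[0] & 0x7f) << 8) | buf[1];
                return(2);
        }

        int
        main(void)
        {
                unsigned char   buf[2];
                unsigned        val;

                zip_ulint_put(buf, 300);        /* e.g. heap_no-1 of a record */
                zip_ulint_get(buf, &val);
                printf("%u\n", val);            /* prints 300 */
                return(0);
        }
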
+ +In summary, the compressed page looks like this: + +(1) Uncompressed page header (PAGE_DATA bytes) +(2) Compressed index information +(3) Compressed page data +(4) Page modification log (page_zip->m_start..page_zip->m_end) +(5) Empty zero-filled space +(6) BLOB pointers + - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column + - in descending collation order +(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, + - indexed by heap_no + - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN if page_is_leaf(page_zip->data) + - REC_NODE_PTR_SIZE otherwise +(8) Original origins of records that have been relocated since + the page was compressed, in ascending order, 16 bits per entry +(9) dense page directory, stored backwards + - n_dense = n_heap - 2 + - existing records in ascending collation order + - deleted records (free list) in link order +(10) Number of records that have been relocated + since the page was compressed (16 bits), cf. (7) +*/ /* Start offset of the area that will be compressed */ #define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END @@ -74,43 +105,6 @@ page_zip_des_init( memset(page_zip, 0, sizeof *page_zip); } -/************************************************************************** -Determine the encoded length of an integer in the modification log. */ -UNIV_INLINE -ulint -page_zip_ulint_size( -/*================*/ - /* out: length of the integer, in bytes */ - ulint num) /* in: the integer */ -{ - if (num < 128) { /* 0xxxxxxx: 0..127 */ - return(1); - } - if (num < 16384) { /* 10xxxxxx xxxxxxxx: 0..16383 */ - return(2); - } - ut_ad(0); - return(0); -} - -/************************************************************************** -Determine the size of a modification log entry. */ -UNIV_INLINE -ulint -page_zip_entry_size( -/*================*/ - /* out: length of the log entry, in bytes */ - ulint pos, /* in: offset of the uncompressed page */ - ulint length) /* in: length of the data */ -{ - ut_ad(pos >= PAGE_ZIP_START); - ut_ad(pos + length <= UNIV_PAGE_SIZE - PAGE_ZIP_START - /* - trailer_len */); - return(page_zip_ulint_size(pos - PAGE_ZIP_START) - + page_zip_ulint_size(length) - + length); -} - #ifdef UNIV_DEBUG /************************************************************************** Validate a compressed page descriptor. */ @@ -128,12 +122,80 @@ page_zip_simple_validate( ut_ad(page_zip->size > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); ut_ad(page_zip->m_start <= page_zip->m_end); ut_ad(page_zip->m_end < page_zip->size); + ut_ad(page_zip->n_blobs < page_zip->size / BTR_EXTERN_FIELD_REF_SIZE); return(TRUE); } #endif /* UNIV_DEBUG */ /***************************************************************** -Gets the size of the compressed page trailer (the dense page directory). */ +Gets the number of records that have been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. */ +UNIV_INLINE +ulint +page_zip_get_n_relocated( +/*=====================*/ + /* out: number of records + that have been relocated */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + return(mach_read_from_2(page_zip->data + + page_zip->size - PAGE_ZIP_DIR_SLOT_SIZE)); +} + +/***************************************************************** +Sets the number of records that have been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. 
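A back-of-the-envelope sketch (not from the patch) of the space taken by items (6)-(10) of the layout above, i.e. the part of the trailer that grows backwards from the end of the compressed page; the numeric constants are assumptions matching the defines referenced elsewhere (PAGE_ZIP_DIR_SLOT_SIZE = 2, DATA_TRX_ID_LEN = 6, DATA_ROLL_PTR_LEN = 7):

        #include <stddef.h>
        #include <stdio.h>

        /* Assumed values; the authoritative defines live in the InnoDB headers. */
        enum {
                DIR_SLOT_SIZE   = 2,    /* PAGE_ZIP_DIR_SLOT_SIZE (16-bit entries) */
                TRX_ID_LEN      = 6,    /* DATA_TRX_ID_LEN */
                ROLL_PTR_LEN    = 7,    /* DATA_ROLL_PTR_LEN */
                NODE_PTR_SIZE   = 4,    /* REC_NODE_PTR_SIZE */
                FIELD_REF_SIZE  = 20    /* BTR_EXTERN_FIELD_REF_SIZE */
        };

        /* Bytes occupied by items (6)..(10) of the compressed page layout. */
        static size_t
        zip_trailer_size(size_t n_dense, size_t n_blobs,
                         size_t n_relocated, int is_leaf)
        {
                size_t  per_rec = is_leaf
                        ? TRX_ID_LEN + ROLL_PTR_LEN
                        : NODE_PTR_SIZE;

                return(n_blobs * FIELD_REF_SIZE         /* (6) BLOB pointers */
                       + n_dense * per_rec              /* (7) uncompressed columns */
                       + n_relocated * 2                /* (8) relocation origins */
                       + n_dense * DIR_SLOT_SIZE        /* (9) dense directory */
                       + 2);                            /* (10) n_relocated counter */
        }

        int
        main(void)
        {
                /* Example: leaf page, 50 user records, 2 externally stored
                columns, nothing relocated: 2*20 + 50*13 + 0 + 50*2 + 2 = 792. */
                printf("%zu\n", zip_trailer_size(50, 2, 0, 1));
                return(0);
        }
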
*/ +UNIV_INLINE +void +page_zip_set_n_relocated( +/*=====================*/ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint n_relocated) /* in: number of records + that have been relocated */ +{ + mach_write_to_2(page_zip->data + + page_zip->size - PAGE_ZIP_DIR_SLOT_SIZE, + n_relocated); +} + +/***************************************************************** +Gets original offset of a record that has been relocated, that is, +allocated from the free list since the page was compressed, +such that extra_size has grown. */ +UNIV_INLINE +ulint +page_zip_get_relocated( +/*===================*/ + /* out: original offset + of the record */ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint i) /* in: ith record */ +{ +#ifdef UNIV_DEBUG + ulint n = page_zip_get_n_relocated(page_zip); +#endif /* UNIV_DEBUG */ + ulint offset; + + ut_ad(i < n); + + /* Below, we subtract 2 from n_heap for the page infimum and supremum, + but add 1 for n_relocated, and index by i + 1 */ + offset = mach_read_from_2(page_zip->data + + page_zip->size - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + i)); + + ut_ad(offset >= PAGE_ZIP_START); + ut_ad(offset < page_zip->size - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + n - 1)); + + return(offset); +} + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list) and n_relocated. */ UNIV_INLINE ulint page_zip_dir_size( @@ -142,18 +204,97 @@ page_zip_dir_size( directory, in bytes */ const page_zip_des_t* page_zip) /* in: compressed page */ { + /* Exclude the page infimum and supremum from the record count. + Add 1 slot for n_relocated. */ ulint size = PAGE_ZIP_DIR_SLOT_SIZE - * (page_dir_get_n_heap((page_t*) page_zip->data) - 2); + * (page_dir_get_n_heap((page_t*) page_zip->data) - 1); ut_ad(page_zip->m_end + size < page_zip->size); return(size); } +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list and n_relocated). */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + /* out: length of dense page + directory comprising existing + records, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs((page_t*) page_zip->data); + ut_ad(size < page_zip_dir_size(page_zip)); + return(size); +} + +/***************************************************************** +Find the slot of the given non-free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* slot; + byte* end; + + ut_ad(page_zip_simple_validate(page_zip)); + + end = page_zip->data + page_zip->size - PAGE_ZIP_DIR_SLOT_SIZE; + slot = end - page_zip_dir_user_size(page_zip); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/***************************************************************** +Find the slot of the given free record in the dense page directory. 
*/ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* slot; + byte* end; + + ut_ad(page_zip_simple_validate(page_zip)); + + slot = end = page_zip->data + page_zip->size; + slot -= page_zip_dir_size(page_zip); + end -= PAGE_ZIP_DIR_SLOT_SIZE + page_zip_dir_user_size(page_zip); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + /***************************************************************** Read a given slot in the dense page directory. */ UNIV_INLINE ulint page_zip_dir_get( -/*==============*/ +/*=============*/ /* out: record offset on the uncompressed page, possibly ORed with @@ -164,16 +305,17 @@ page_zip_dir_get( (0=first user record) */ { ut_ad(page_zip_simple_validate(page_zip)); - ut_ad(slot + 2 < page_dir_get_n_heap((page_t*) page_zip->data)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + /* Add 1 for n_relocated */ return(mach_read_from_2(page_zip->data + page_zip->size - - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 2))); } /***************************************************************** Write a given slot in the dense page directory. */ UNIV_INLINE void page_zip_dir_set( -/*==============*/ +/*=============*/ page_zip_des_t* page_zip, /* in: compressed page */ ulint slot, /* in: slot (0=first user record) */ ulint offs) /* in: offset, possibly ORed with @@ -181,8 +323,9 @@ page_zip_dir_set( PAGE_ZIP_DIR_SLOT_OWNED */ { ut_ad(page_zip_simple_validate(page_zip)); + /* Add 1 for n_relocated */ mach_write_to_2(page_zip->data + page_zip->size - - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1), + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 2), offs); } @@ -198,15 +341,16 @@ page_zip_alloc( will only be modified if compression is needed and successful */ const page_t* page, /* in: uncompressed page */ - ulint size) /* in: size of modification log entries */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr, /* in: mini-transaction handle, + or NULL if no logging is desired */ + ulint length, /* in: combined size of the record */ + ulint create) /* in: nonzero=add the record to the heap */ { - ulint trailer_len = page_zip_dir_size(page_zip); + ut_ad(page_is_comp((page_t*) page)); + ut_ad(page_zip_validate(page_zip, page)); - ut_ad(page_zip_simple_validate(page_zip)); - ut_ad(size >= 3); /* modification log entries are >= 1+1+1 bytes */ - ut_ad(size < page_zip->size); - - if (size + page_zip->m_end + trailer_len < page_zip->size) { + if (page_zip_available(page_zip, length, page_is_leaf(page), create)) { return(TRUE); } @@ -216,7 +360,14 @@ page_zip_alloc( return(FALSE); } - return(page_zip_compress(page_zip, page)); + if (!page_zip_compress(page_zip, page, index, mtr)) { + /* Unable to compress the page */ + return(FALSE); + } + + /* Check if there is enough space available after compression. 
*/ + return(page_zip_available(page_zip, length, + page_is_leaf(page), create)); } /************************************************************************** @@ -228,21 +379,59 @@ page_zip_available( /* out: TRUE if enough space is available */ const page_zip_des_t* page_zip,/* in: compressed page */ - ulint size) /* in: requested size of - modification log entries */ + ulint length, /* in: combined size of the record */ + ulint is_leaf,/* in: nonzero=leaf node, + zero=node pointer page */ + ulint create) /* in: nonzero=add the record to + the heap */ { - ulint trailer_len = page_zip_dir_size(page_zip); + ulint uncompressed_size; + ulint trailer_len; ut_ad(page_zip_simple_validate(page_zip)); - ut_ad(size < page_zip->size); + ut_ad(length > REC_N_NEW_EXTRA_BYTES); + + if (is_leaf) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + REC_NODE_PTR_SIZE; + } + + trailer_len = page_get_n_recs((page_t*) page_zip->data) + * uncompressed_size + + page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + + /* Subtract the fixed extra bytes and add the maximum + space needed for identifying the record (encoded heap_no). */ + length -= REC_N_NEW_EXTRA_BYTES - 2; + + if (UNIV_UNLIKELY(create)) { + /* When a record is created, a pointer may be added to + the dense directory or to the list of relocated records. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE + uncompressed_size; + } return(UNIV_LIKELY( - size + page_zip->m_end + trailer_len < page_zip->size)); + length + + trailer_len + + page_zip->m_end + < page_zip->size)); } /************************************************************************** Write data to the uncompressed header portion of a page. The data must -already have been written to the uncompressed page. */ +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_low(). */ UNIV_INLINE void page_zip_write_header( @@ -262,7 +451,8 @@ page_zip_write_header( memcpy(page_zip + pos, str, length); - ut_ad(page_zip_validate(page_zip, str - pos)); + /* The following would fail in page_cur_insert_rec_low(). */ + /* ut_ad(page_zip_validate(page_zip, str - pos)); */ } #ifdef UNIV_MATERIALIZE diff --git a/include/rem0rec.h b/include/rem0rec.h index 6daf5c5aacf77a02032278f2af7a359c21050afe..c11721e2228d7844e17641cddb65ab19db2d073c 100644 --- a/include/rem0rec.h +++ b/include/rem0rec.h @@ -38,9 +38,8 @@ in addition to the data and the offsets */ #define REC_STATUS_INFIMUM 2 #define REC_STATUS_SUPREMUM 3 -/* The following two constants are needed in page0zip.c in order to -efficiently access heap_no and status when compressing and -decompressing pages. */ +/* The following four constants are needed in page0zip.c in order to +efficiently compress and decompress pages. */ /* The offset of heap_no in a compact record */ #define REC_NEW_HEAP_NO 4 @@ -48,6 +47,17 @@ decompressing pages. */ The status is stored in the low-order bits. 
*/ #define REC_HEAP_NO_SHIFT 3 +/* Length of a B-tree node pointer, in bytes */ +#define REC_NODE_PTR_SIZE 4 + +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + /* Number of elements that should be initially allocated for the offsets[] array, first passed to rec_get_offsets() */ #define REC_OFFS_NORMAL_SIZE 100 @@ -91,10 +101,8 @@ UNIV_INLINE void rec_set_next_offs_new( /*==================*/ - rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 6 bytes available, or NULL */ - ulint next); /* in: offset of the next record */ + rec_t* rec, /* in/out: new-style physical record */ + ulint next); /* in: offset of the next record */ /********************************************************** The following function is used to get the number of fields in an old-style record. */ @@ -147,10 +155,8 @@ UNIV_INLINE void rec_set_n_owned_new( /*================*/ - /* out: TRUE on success */ rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint n_owned);/* in: the number of owned */ /********************************************************** The following function is used to retrieve the info bits of @@ -176,10 +182,8 @@ UNIV_INLINE void rec_set_info_bits_new( /*==================*/ - rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits); /* in: info bits */ + rec_t* rec, /* in/out: new-style physical record */ + ulint bits); /* in: info bits */ /********************************************************** The following function retrieves the status bits of a new-style record. */ UNIV_INLINE @@ -195,10 +199,8 @@ UNIV_INLINE void rec_set_status( /*===========*/ - rec_t* rec, /* in/out: physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits); /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + ulint bits); /* in: info bits */ /********************************************************** The following function is used to retrieve the info and status @@ -217,10 +219,8 @@ UNIV_INLINE void rec_set_info_and_status_bits( /*=========================*/ - rec_t* rec, /* in/out: compact physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits); /* in: info bits */ + rec_t* rec, /* in/out: compact physical record */ + ulint bits); /* in: info bits */ /********************************************************** The following function tells if record is delete marked. */ @@ -246,8 +246,7 @@ void rec_set_deleted_flag_new( /*=====================*/ rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint flag); /* in: nonzero if delete marked */ /********************************************************** The following function tells if a new-style record is a node pointer. 
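For illustration only: how the heap number and the status bits share the same two header bytes, per REC_NEW_HEAP_NO and REC_HEAP_NO_SHIFT above; the example field value is hypothetical, and status = 0 for an ordinary user record is an assumption not restated in this hunk:

        #include <stdio.h>

        #define REC_NEW_HEAP_NO         4       /* as defined above */
        #define REC_HEAP_NO_SHIFT       3       /* as defined above */

        int
        main(void)
        {
                /* Hypothetical contents of the 16-bit header field located
                REC_NEW_HEAP_NO bytes before the record origin:
                heap_no = 5, status = 0 (ordinary user record). */
                unsigned        field   = (5 << REC_HEAP_NO_SHIFT) | 0;

                unsigned        heap_no = field >> REC_HEAP_NO_SHIFT;
                unsigned        status  = field
                        & ((1 << REC_HEAP_NO_SHIFT) - 1);

                printf("heap_no=%u status=%u\n", heap_no, status);
                return(0);
        }
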
*/ @@ -291,10 +290,8 @@ UNIV_INLINE void rec_set_heap_no_new( /*================*/ - rec_t* rec, /* in/out: physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 6 bytes available, or NULL */ - ulint heap_no);/* in: the heap number */ + rec_t* rec, /* in/out: physical record */ + ulint heap_no);/* in: the heap number */ /********************************************************** The following function is used to test whether the data offsets in the record are stored in one-byte or two-byte format. */ @@ -304,6 +301,19 @@ rec_get_1byte_offs_flag( /*====================*/ /* out: TRUE if 1-byte form */ rec_t* rec); /* in: physical record */ + +/********************************************************** +Determine how many of the first n columns in a compact +physical record are stored externally. */ + +ulint +rec_get_n_extern_new( +/*=================*/ + /* out: number of externally stored columns */ + const rec_t* rec, /* in: compact physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n); /* in: number of columns to scan */ + /********************************************************** The following function determines the offsets to each field in the record. It can reuse a previously allocated array. */ @@ -326,6 +336,21 @@ rec_get_offsets_func( #define rec_get_offsets(rec,index,offsets,n,heap) \ rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__) +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ + +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /* in: the extra bytes of a compact record + in reverse order, excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + dict_index_t* index, /* in: record descriptor */ + ibool node_ptr,/* in: TRUE=node pointer, FALSE=leaf node */ + ulint* offsets);/* in/out: array consisting of offsets[0] + allocated elements */ + /**************************************************************** Validates offsets returned by rec_get_offsets(). 
*/ UNIV_INLINE diff --git a/include/rem0rec.ic b/include/rem0rec.ic index 48c3d84655227aa7cb06f730c959cd3688eb2313..c60771553f96a196aea5dacab75e8142dedbae08 100644 --- a/include/rem0rec.ic +++ b/include/rem0rec.ic @@ -380,10 +380,8 @@ UNIV_INLINE void rec_set_next_offs_new( /*==================*/ - rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 6 bytes available, or NULL */ - ulint next) /* in: offset of the next record */ + rec_t* rec, /* in/out: new-style physical record */ + ulint next) /* in: offset of the next record */ { ut_ad(rec); ut_ad(UNIV_PAGE_SIZE > next); @@ -403,9 +401,6 @@ rec_set_next_offs_new( } mach_write_to_2(rec - REC_NEXT, field_value); - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - REC_NEXT, 2); - } } /********************************************************** @@ -546,16 +541,14 @@ UNIV_INLINE void rec_set_n_owned_new( /*================*/ - /* out: TRUE on success */ rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint n_owned)/* in: the number of owned */ { rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - REC_NEW_N_OWNED, 1); + page_zip_rec_set_owned(page_zip, rec, n_owned); } } @@ -592,16 +585,11 @@ UNIV_INLINE void rec_set_info_bits_new( /*==================*/ - rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits) /* in: info bits */ + rec_t* rec, /* in/out: new-style physical record */ + ulint bits) /* in: info bits */ { rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - REC_NEW_INFO_BITS, 1); - } } /********************************************************** @@ -610,16 +598,11 @@ UNIV_INLINE void rec_set_status( /*===========*/ - rec_t* rec, /* in/out: physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits) /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + ulint bits) /* in: info bits */ { rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - REC_NEW_STATUS, 1); - } } /********************************************************** @@ -653,17 +636,15 @@ UNIV_INLINE void rec_set_info_and_status_bits( /*=========================*/ - rec_t* rec, /* in/out: physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ - ulint bits) /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + ulint bits) /* in: info bits */ { #if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) # error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" #endif - rec_set_status(rec, page_zip, bits & REC_NEW_STATUS_MASK); - rec_set_info_bits_new(rec, page_zip, bits & ~REC_NEW_STATUS_MASK); + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK); } /********************************************************** @@ -716,8 +697,7 @@ void 
rec_set_deleted_flag_new( /*=====================*/ rec_t* rec, /* in/out: new-style physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 5 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint flag) /* in: nonzero if delete marked */ { ulint val; @@ -730,7 +710,11 @@ rec_set_deleted_flag_new( val &= ~REC_INFO_DELETED_FLAG; } - rec_set_info_bits_new(rec, page_zip, val); + rec_set_info_bits_new(rec, val); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_rec_set_deleted(page_zip, rec, flag); + } } /********************************************************** @@ -794,16 +778,11 @@ UNIV_INLINE void rec_set_heap_no_new( /*================*/ - rec_t* rec, /* in/out: physical record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 6 bytes available, or NULL */ - ulint heap_no)/* in: the heap number */ + rec_t* rec, /* in/out: physical record */ + ulint heap_no)/* in: the heap number */ { rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); - if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, rec - REC_NEW_HEAP_NO, 2); - } } /********************************************************** @@ -880,14 +859,6 @@ rec_2_get_field_end_info( return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); } -#ifdef UNIV_DEBUG -/* Length of the rec_get_offsets() header */ -# define REC_OFFS_HEADER_SIZE 4 -#else /* UNIV_DEBUG */ -/* Length of the rec_get_offsets() header */ -# define REC_OFFS_HEADER_SIZE 2 -#endif /* UNIV_DEBUG */ - /* Get the base address of offsets. The extra_size is stored at this position, and following positions hold the end offsets of the fields. */ @@ -1472,6 +1443,7 @@ rec_get_end( rec_t* rec, /* in: pointer to record */ const ulint* offsets)/* in: array returned by rec_get_offsets() */ { + ut_ad(rec_offs_validate(rec, NULL, offsets)); return(rec + rec_offs_data_size(offsets)); } @@ -1485,6 +1457,7 @@ rec_get_start( rec_t* rec, /* in: pointer to record */ const ulint* offsets)/* in: array returned by rec_get_offsets() */ { + ut_ad(rec_offs_validate(rec, NULL, offsets)); return(rec - rec_offs_extra_size(offsets)); } diff --git a/include/row0ins.h b/include/row0ins.h index a5b4b74e7fccecf18eeb396d80c3534cfe2f99a7..2aa1717759a7b1044ed13c6fcc6c0bb9697842e6 100644 --- a/include/row0ins.h +++ b/include/row0ins.h @@ -58,30 +58,6 @@ ins_node_set_new_row( ins_node_t* node, /* in: insert node */ dtuple_t* row); /* in: new row (or first row) for the node */ /******************************************************************* -Tries to insert an index entry to an index. If the index is clustered -and a record with the same unique key is found, the other record is -necessarily marked deleted by a committed transaction, or a unique key -violation error occurs. The delete marked record is then updated to an -existing record, and we must write an undo log record on the delete -marked record. If the index is secondary, and a record with exactly the -same fields is found, the other record is necessarily marked deleted. -It is then unmarked. Otherwise, the entry is just inserted to the index. 
*/ - -ulint -row_ins_index_entry_low( -/*====================*/ - /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL - if pessimistic retry needed, or error code */ - ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, - depending on whether we wish optimistic or - pessimistic descent down the index tree */ - dict_index_t* index, /* in: index */ - dtuple_t* entry, /* in: index entry to insert */ - ulint* ext_vec,/* in: array containing field numbers of - externally stored fields in entry, or NULL */ - ulint n_ext_vec,/* in: number of fields in ext_vec */ - que_thr_t* thr); /* in: query thread */ -/******************************************************************* Inserts an index entry to index. Tries first optimistic, then pessimistic descent down the tree. If the entry matches enough to a delete marked record, performs the insert by updating or delete unmarking the delete marked diff --git a/include/row0row.h b/include/row0row.h index 7083b14b966596c748c9fa55694dc77412ee4a86..50f0322b76b3136ac970804cc5fb169bea4f2e19 100644 --- a/include/row0row.h +++ b/include/row0row.h @@ -19,6 +19,17 @@ Created 4/20/1996 Heikki Tuuri #include "read0types.h" #include "btr0types.h" +/************************************************************************* +Gets the offset of the trx id field, in bytes relative to the origin of +a clustered index record. */ + +ulint +row_get_trx_id_offset( +/*==================*/ + /* out: offset of DATA_TRX_ID */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Reads the trx id field from a clustered index record. */ UNIV_INLINE @@ -39,30 +50,6 @@ row_get_rec_roll_ptr( rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ const ulint* offsets);/* in: rec_get_offsets(rec, index) */ -/************************************************************************* -Writes the trx id field to a clustered index record. */ -UNIV_INLINE -void -row_set_rec_trx_id( -/*===============*/ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 10 bytes available,, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint trx_id);/* in: value of the field */ -/************************************************************************* -Sets the roll pointer field in a clustered index record. */ -UNIV_INLINE -void -row_set_rec_roll_ptr( -/*=================*/ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 11 bytes available, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint roll_ptr);/* in: value of the field */ /********************************************************************* When an insert to a table is performed, this function builds the entry which has to be inserted to an index on the table. 
*/ diff --git a/include/row0row.ic b/include/row0row.ic index c56dd9a30f8dd6188449127d0e38e0d4ecb70b50..a8ab9bce56ccf7b408be33dcb142c77cc7b055a3 100644 --- a/include/row0row.ic +++ b/include/row0row.ic @@ -10,33 +10,6 @@ Created 4/20/1996 Heikki Tuuri #include "rem0rec.h" #include "trx0undo.h" -/************************************************************************* -Reads the trx id or roll ptr field from a clustered index record: this function -is slower than the specialized inline functions. */ - -dulint -row_get_rec_sys_field( -/*==================*/ - /* out: value of the field */ - ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ - rec_t* rec, /* in: record */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets);/* in: rec_get_offsets(rec, index) */ -/************************************************************************* -Sets the trx id or roll ptr field in a clustered index record: this function -is slower than the specialized inline functions. */ - -void -row_set_rec_sys_field( -/*==================*/ - ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 10 or 11 bytes available, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint val); /* in: value to set */ - /************************************************************************* Reads the trx id field from a clustered index record. */ UNIV_INLINE @@ -55,12 +28,11 @@ row_get_rec_trx_id( offset = index->trx_id_offset; - if (offset) { - return(trx_read_trx_id(rec + offset)); - } else { - return(row_get_rec_sys_field(DATA_TRX_ID, - rec, index, offsets)); + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); } + + return(trx_read_trx_id(rec + offset)); } /************************************************************************* @@ -81,69 +53,11 @@ row_get_rec_roll_ptr( offset = index->trx_id_offset; - if (offset) { - return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); - } else { - return(row_get_rec_sys_field(DATA_ROLL_PTR, - rec, index, offsets)); + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); } -} -/************************************************************************* -Writes the trx id field to a clustered index record. */ -UNIV_INLINE -void -row_set_rec_trx_id( -/*===============*/ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 10 bytes available, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint trx_id) /* in: value of the field */ -{ - ulint offset; - - ut_ad(index->type & DICT_CLUSTERED); - ut_ad(rec_offs_validate(rec, index, offsets)); - - offset = index->trx_id_offset; - - if (offset) { - trx_write_trx_id(rec + offset, page_zip, trx_id); - } else { - row_set_rec_sys_field(DATA_TRX_ID, - rec, page_zip, index, offsets, trx_id); - } -} - -/************************************************************************* -Sets the roll pointer field in a clustered index record. 
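For illustration, a minimal sketch of the calling pattern that replaces these removed setters, assuming only that DATA_ROLL_PTR is stored immediately after DATA_TRX_ID in a clustered index record; the compressed-page bookkeeping done in row_upd_rec_sys_fields() is omitted here, and the function name is hypothetical:

/* Illustrative sketch: write both system columns of a clustered
index record by computing the shared field offset once. */
static void
row_sys_fields_write_sketch(
	rec_t*		rec,	/* in/out: clustered index record */
	dict_index_t*	index,	/* in: clustered index */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	dulint		trx_id,	/* in: transaction id */
	dulint		roll_ptr)/* in: undo log pointer */
{
	ulint	offset = index->trx_id_offset;

	if (!offset) {
		/* No precomputed offset in the index object:
		derive it from the record and its offsets array. */
		offset = row_get_trx_id_offset(rec, index, offsets);
	}

	trx_write_trx_id(rec + offset, trx_id);
	trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
}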
*/ -UNIV_INLINE -void -row_set_rec_roll_ptr( -/*=================*/ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 11 bytes available, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint roll_ptr)/* in: value of the field */ -{ - ulint offset; - - ut_ad(index->type & DICT_CLUSTERED); - ut_ad(rec_offs_validate(rec, index, offsets)); - - offset = index->trx_id_offset; - - if (offset) { - trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, - page_zip, roll_ptr); - } else { - row_set_rec_sys_field(DATA_ROLL_PTR, - rec, page_zip, index, offsets, roll_ptr); - } + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); } /*********************************************************************** diff --git a/include/row0upd.h b/include/row0upd.h index 8ebbb4890fe5a4f81135d12da2b8921abbc94532..fdc87138f76c915a5183e27b35fd37f841451536 100644 --- a/include/row0upd.h +++ b/include/row0upd.h @@ -79,8 +79,8 @@ void row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 21 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ @@ -140,7 +140,9 @@ row_upd_rec_in_place( /*=================*/ rec_t* rec, /* in/out: record where replaced */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ - upd_t* update);/* in: update vector */ + upd_t* update, /* in: update vector */ + page_zip_des_t* page_zip);/* in: compressed page with enough space + available, or NULL */ /******************************************************************* Builds an update vector from those fields which in a secondary index entry differ from a record that has the equal ordering fields. 
NOTE: we compare diff --git a/include/row0upd.ic b/include/row0upd.ic index 1eb9bc4d23238527b34192b06e3cce8078eea329..5c24debbed69989ca2147f31f77903af94aa9937 100644 --- a/include/row0upd.ic +++ b/include/row0upd.ic @@ -106,21 +106,37 @@ void row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 21 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr)/* in: roll ptr of the undo log record */ { + ulint offset; + ut_ad(index->type & DICT_CLUSTERED); ut_ad(rec_offs_validate(rec, index, offsets)); #ifdef UNIV_SYNC_DEBUG ut_ad(!buf_block_align(rec)->is_hashed || rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!page_zip || page_zip_available(page_zip, 21)); - row_set_rec_trx_id(rec, page_zip, index, offsets, trx->id); - row_set_rec_roll_ptr(rec, page_zip, index, offsets, roll_ptr); + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); + } + + trx_write_trx_id(rec + offset, trx->id); + trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_trx_id( + page_zip, rec, rec_offs_data_size(offsets), + trx->id, NULL/* TODO: mtr */); + page_zip_write_roll_ptr( + page_zip, rec, rec_offs_data_size(offsets), + roll_ptr, NULL/* TODO: mtr */); + } } diff --git a/include/trx0sys.h b/include/trx0sys.h index 2a31e63db3543f5071ef2678b07ae16cab4b2e20..c8cca33708f1efaff523bb0ac38204dc305e51a9 100644 --- a/include/trx0sys.h +++ b/include/trx0sys.h @@ -211,10 +211,8 @@ UNIV_INLINE void trx_write_trx_id( /*=============*/ - byte* ptr, /* in: pointer to memory where written */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 10 bytes available, or NULL */ - dulint id); /* in: id */ + byte* ptr, /* in: pointer to memory where written */ + dulint id); /* in: id */ /********************************************************************* Reads a trx id from an index page. 
In case that the id size changes in some future version, this function should be used instead of diff --git a/include/trx0sys.ic b/include/trx0sys.ic index 11bb0534c41e66e8dd76e3e5ec94b0004998b69f..b6c8691f5cb0e08c7e83e7683c6ee88f391b57fb 100644 --- a/include/trx0sys.ic +++ b/include/trx0sys.ic @@ -214,18 +214,13 @@ UNIV_INLINE void trx_write_trx_id( /*=============*/ - byte* ptr, /* in: pointer to memory where written */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 10 bytes available, or NULL */ - dulint id) /* in: id */ + byte* ptr, /* in: pointer to memory where written */ + dulint id) /* in: id */ { - ut_ad(DATA_TRX_ID_LEN == 6); - +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif mach_write_to_6(ptr, id); - if (UNIV_LIKELY_NULL(page_zip)) { - ut_ad(page_zip_available(page_zip, 4 + DATA_TRX_ID_LEN)); - page_zip_write(page_zip, ptr, DATA_TRX_ID_LEN); - } } /********************************************************************* @@ -239,8 +234,9 @@ trx_read_trx_id( /* out: id */ byte* ptr) /* in: pointer to memory from where to read */ { - ut_ad(DATA_TRX_ID_LEN == 6); - +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif return(mach_read_from_6(ptr)); } diff --git a/include/trx0undo.h b/include/trx0undo.h index 0453b25567efc474005b0ebce0349f2f70a41d4b..bd7337e4f900d0463dffeada0a9a02f20760c1bb 100644 --- a/include/trx0undo.h +++ b/include/trx0undo.h @@ -55,8 +55,6 @@ void trx_write_roll_ptr( /*===============*/ byte* ptr, /* in: pointer to memory where written */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 11 bytes available, or NULL */ dulint roll_ptr); /* in: roll ptr */ /********************************************************************* Reads a roll ptr from an index page. 
In case that the roll ptr size diff --git a/include/trx0undo.ic b/include/trx0undo.ic index 2a1f539cee4d2e29f011c654aa006d22a776aef8..f9a505592d3b1c0d7ac075706303a1d3ac843141 100644 --- a/include/trx0undo.ic +++ b/include/trx0undo.ic @@ -88,18 +88,13 @@ UNIV_INLINE void trx_write_roll_ptr( /*===============*/ - byte* ptr, /* in: pointer to memory where written */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 11 bytes available, or NULL */ - dulint roll_ptr)/* in: roll ptr */ + byte* ptr, /* in: pointer to memory where written */ + dulint roll_ptr) /* in: roll ptr */ { - ut_ad(DATA_ROLL_PTR_LEN == 7); - +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif mach_write_to_7(ptr, roll_ptr); - if (UNIV_LIKELY_NULL(page_zip)) { - ut_ad(page_zip_available(page_zip, 4 + DATA_ROLL_PTR_LEN)); - page_zip_write(page_zip, ptr, DATA_ROLL_PTR_LEN); - } } /********************************************************************* diff --git a/log/log0recv.c b/log/log0recv.c index 49a2343cd967bacd07894b7e0af815b0b27dbfc9..731cdb1e446ed3fd4b7626453c414e2768f10183 100644 --- a/log/log0recv.c +++ b/log/log0recv.c @@ -838,7 +838,8 @@ recv_parse_or_apply_log_rec_body( break; case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: ptr = page_parse_create(ptr, end_ptr, - type == MLOG_COMP_PAGE_CREATE, page, mtr); + type == MLOG_COMP_PAGE_CREATE, + page, mtr); break; case MLOG_UNDO_INSERT: ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); @@ -885,8 +886,28 @@ recv_parse_or_apply_log_rec_body( ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, ULINT_UNDEFINED); break; - case MLOG_COMP_DECOMPRESS: - if (page) { + case MLOG_ZIP_WRITE_NODE_PTR: + case MLOG_ZIP_WRITE_TRX_ID: + case MLOG_ZIP_WRITE_ROLL_PTR: + ut_error; /* TODO */ + break; + case MLOG_ZIP_COMPRESS: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, TRUE, &index)) + && page) { + ut_a(page_is_comp(page)); + ut_a(page_zip); + if (UNIV_UNLIKELY(!page_zip_compress( + page_zip, page, index, NULL))) { + ut_error; + } + } + break; + case MLOG_ZIP_DECOMPRESS: + /* TODO: remove this? 
*/ + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, TRUE, &index)) + && page) { ut_a(page_is_comp(page)); ut_a(page_zip); if (UNIV_UNLIKELY(!page_zip_decompress( diff --git a/page/page0cur.c b/page/page0cur.c index 05f5e55ee3d7e0ad43c3f732421fbad050e1e44d..ea86e314986525eb35a9b0beed9222415b2e87f2 100644 --- a/page/page0cur.c +++ b/page/page0cur.c @@ -699,8 +699,7 @@ page_cur_parse_insert_rec( byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ page_t* page, /* in/out: page or NULL */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 25 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { ulint offset = 0; /* remove warning */ @@ -847,7 +846,7 @@ page_cur_parse_insert_rec( ut_memcpy(buf + mismatch_index, ptr, end_seg_len); if (page_is_comp(page)) { - rec_set_info_and_status_bits(buf + origin_offset, NULL, + rec_set_info_and_status_bits(buf + origin_offset, info_and_status_bits); } else { rec_set_info_bits_old(buf + origin_offset, @@ -889,8 +888,7 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 37 + rec_size bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ dict_index_t* index, /* in: record descriptor */ rec_t* rec, /* in: pointer to a physical record or NULL */ @@ -929,20 +927,11 @@ page_cur_insert_rec_low( rec_size = rec_offs_size(offsets); } - if (UNIV_LIKELY_NULL(page_zip)) { - if (UNIV_UNLIKELY(!page_zip_alloc( - page_zip, page, 37 + rec_size))) { - - goto err_exit; - } - } - /* 2. Try to find suitable space from page memory management */ insert_buf = page_mem_alloc(page, page_zip, rec_size, - index, &heap_no); + index, &heap_no, mtr); if (UNIV_UNLIKELY(insert_buf == NULL)) { -err_exit: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -978,8 +967,8 @@ page_cur_insert_rec_low( ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); } #endif - page_rec_set_next(insert_rec, next_rec, NULL); - page_rec_set_next(current_rec, insert_rec, page_zip); + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(current_rec, insert_rec); } page_header_set_field(page, page_zip, PAGE_N_RECS, @@ -989,7 +978,7 @@ page_cur_insert_rec_low( and set the heap_no field */ if (page_is_comp(page)) { rec_set_n_owned_new(insert_rec, NULL, 0); - rec_set_heap_no_new(insert_rec, NULL, heap_no); + rec_set_heap_no_new(insert_rec, heap_no); } else { rec_set_n_owned_old(insert_rec, 0); rec_set_heap_no_old(insert_rec, heap_no); @@ -1036,7 +1025,7 @@ page_cur_insert_rec_low( ulint n_owned; if (page_is_comp(page)) { n_owned = rec_get_n_owned_new(owner_rec); - rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + rec_set_n_owned_new(owner_rec, NULL, n_owned + 1); } else { n_owned = rec_get_n_owned_old(owner_rec); rec_set_n_owned_old(owner_rec, n_owned + 1); @@ -1047,15 +1036,16 @@ page_cur_insert_rec_low( we have to split the corresponding directory slot in two. 
*/ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { - page_dir_split_slot(page, page_zip/* 12 */, + page_dir_split_slot(page, NULL, page_dir_find_owner_slot(owner_rec)); } } if (UNIV_LIKELY_NULL(page_zip)) { - page_zip_write(page_zip, - insert_rec - rec_offs_extra_size(offsets), - rec_size); + /* TODO: something similar to page_zip_dir_delete() */ + page_zip_dir_rewrite(page_zip, page); + + page_zip_write_rec(page_zip, insert_rec, offsets); } /* 9. Write log record of the insert */ @@ -1221,11 +1211,11 @@ page_copy_rec_list_end_to_created_page( insert_rec = rec_copy(heap_top, rec, offsets); if (page_is_comp(new_page)) { - rec_set_next_offs_new(prev_rec, NULL, + rec_set_next_offs_new(prev_rec, ut_align_offset(insert_rec, UNIV_PAGE_SIZE)); rec_set_n_owned_new(insert_rec, NULL, 0); - rec_set_heap_no_new(insert_rec, NULL, 2 + n_recs); + rec_set_heap_no_new(insert_rec, 2 + n_recs); } else { rec_set_next_offs_old(prev_rec, ut_align_offset(insert_rec, UNIV_PAGE_SIZE)); @@ -1244,7 +1234,7 @@ page_copy_rec_list_end_to_created_page( slot = page_dir_get_nth_slot(new_page, slot_index); - page_dir_slot_set_rec(slot, NULL, insert_rec); + page_dir_slot_set_rec(slot, insert_rec); page_dir_slot_set_n_owned(slot, NULL, count); count = 0; @@ -1290,14 +1280,14 @@ page_copy_rec_list_end_to_created_page( mach_write_to_4(log_ptr, log_data_len); if (page_is_comp(new_page)) { - rec_set_next_offs_new(insert_rec, NULL, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM); } else { rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); } slot = page_dir_get_nth_slot(new_page, 1 + slot_index); - page_dir_slot_set_rec(slot, NULL, page_get_supremum_rec(new_page)); + page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); page_dir_slot_set_n_owned(slot, NULL, count + 1); page_dir_set_n_slots(new_page, NULL, 2 + slot_index); @@ -1357,8 +1347,7 @@ page_cur_parse_delete_rec( byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ page_t* page, /* in/out: page or NULL */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 32 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; @@ -1405,8 +1394,7 @@ page_cur_delete_rec( page_cur_t* cursor, /* in/out: a page cursor */ dict_index_t* index, /* in: record descriptor */ const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 32 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr) /* in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; @@ -1425,7 +1413,6 @@ page_cur_delete_rec( current_rec = cursor->rec; ut_ad(rec_offs_validate(current_rec, index, offsets)); ut_ad((ibool) !!page_is_comp(page) == index->table->comp); - ut_ad(!page_zip || page_zip_available(page_zip, 32)); /* The record must not be the supremum or infimum record. */ ut_ad(page_rec_is_user_rec(current_rec)); @@ -1469,7 +1456,7 @@ page_cur_delete_rec( /* 3. 
Remove the record from the linked list of records */ - page_rec_set_next(prev_rec, next_rec, page_zip); + page_rec_set_next(prev_rec, next_rec); page_header_set_field(page, page_zip, PAGE_N_RECS, (ulint)(page_get_n_recs(page) - 1)); @@ -1482,7 +1469,7 @@ page_cur_delete_rec( ut_ad(cur_n_owned > 1); if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { - page_dir_slot_set_rec(cur_dir_slot, page_zip, prev_rec); + page_dir_slot_set_rec(cur_dir_slot, prev_rec); } /* 5. Update the number of owned records of the slot */ @@ -1490,7 +1477,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - page_mem_free(page, page_zip, current_rec, offsets); + page_mem_free(page, page_zip, current_rec, index, offsets, mtr); /* 7. Now we have decremented the number of owned records of the slot. If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the diff --git a/page/page0page.c b/page/page0page.c index a74d3274967e6be2056bc20c45fabfaa7333e2af..766df3763c74bc57f4b15b1c0ab1ef1ea4302daf 100644 --- a/page/page0page.c +++ b/page/page0page.c @@ -242,9 +242,11 @@ page_mem_alloc( page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint need, /* in: number of bytes needed */ dict_index_t* index, /* in: record descriptor */ - ulint* heap_no)/* out: this contains the heap number + ulint* heap_no,/* out: this contains the heap number of the allocated record if allocation succeeds */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL + if page_zip == NULL */ { rec_t* rec; byte* block; @@ -252,7 +254,18 @@ page_mem_alloc( ulint garbage; ut_ad(page && heap_no); - ut_ad(!page_zip || page_zip_validate(page_zip, page)); + + /* TODO: add parameter n_extra */ + + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(page_is_comp(page)); + ut_ad(page_zip_validate(page_zip, page)); + + if (!page_zip_alloc(page_zip, page, index, mtr, need, 1)) { + + return(NULL); + } + } /* If there are records in the free list, look if the first is big enough */ @@ -324,10 +337,17 @@ page_create_write_log( buf_frame_t* frame, /* in: a buffer frame where the page is created */ mtr_t* mtr, /* in: mini-transaction handle */ - ulint comp) /* in: nonzero=compact page format */ + ibool comp) /* in: TRUE=compact page format */ { - mlog_write_initial_log_record(frame, - comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE, mtr); + ulint type; + + if (UNIV_LIKELY(comp)) { + type = MLOG_COMP_PAGE_CREATE; + } else { + type = MLOG_PAGE_CREATE; + } + + mlog_write_initial_log_record(frame, type, mtr); } /*************************************************************** @@ -336,20 +356,27 @@ Parses a redo log record of creating a page. 
*/ byte* page_parse_create( /*==============*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr __attribute__((unused)), /* in: buffer end */ - ulint comp, /* in: nonzero=compact page format */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ut_ad(ptr && end_ptr); /* The record is empty, except for the record initial part */ if (page) { - page_create(page, buf_block_get_page_zip( - buf_block_align(page)), mtr, comp); + dict_index_t* index; + + if (UNIV_LIKELY(comp)) { + index = srv_sys->dummy_ind2; + } else { + index = srv_sys->dummy_ind1; + } + + page_create(page, NULL, mtr, index); } return(ptr); @@ -366,7 +393,7 @@ page_create( page is created */ page_zip_des_t* page_zip, /* in/out: compressed page, or NULL */ mtr_t* mtr, /* in: mini-transaction handle */ - ulint comp) /* in: nonzero=compact page format */ + dict_index_t* index) /* in: the index of the page */ { page_dir_slot_t* slot; mem_heap_t* heap; @@ -376,19 +403,10 @@ page_create( rec_t* infimum_rec; rec_t* supremum_rec; page_t* page; - dict_index_t* index; ulint* offsets; -#if 1 /* testing */ - byte zip_data[512]; -#endif + const ibool comp = index->table->comp; - if (UNIV_LIKELY(comp)) { - index = srv_sys->dummy_ind2; - } else { - index = srv_sys->dummy_ind1; - ut_ad(!page_zip); - } - + ut_ad(!page_zip || comp); ut_ad(frame && mtr); ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE <= PAGE_DATA); @@ -435,7 +453,7 @@ page_create( ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); rec_set_n_owned_new(infimum_rec, NULL, 1); - rec_set_heap_no_new(infimum_rec, NULL, 0); + rec_set_heap_no_new(infimum_rec, 0); } else { ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); @@ -464,7 +482,7 @@ page_create( ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); rec_set_n_owned_new(supremum_rec, NULL, 1); - rec_set_heap_no_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, 1); } else { ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); @@ -501,41 +519,30 @@ page_create( /* Set the slots to point to infimum and supremum. */ slot = page_dir_get_nth_slot(page, 0); - page_dir_slot_set_rec(slot, NULL, infimum_rec); + page_dir_slot_set_rec(slot, infimum_rec); slot = page_dir_get_nth_slot(page, 1); - page_dir_slot_set_rec(slot, NULL, supremum_rec); + page_dir_slot_set_rec(slot, supremum_rec); /* Set the next pointers in infimum and supremum */ if (UNIV_LIKELY(comp)) { - rec_set_next_offs_new(infimum_rec, NULL, PAGE_NEW_SUPREMUM); - rec_set_next_offs_new(supremum_rec, NULL, 0); + rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(supremum_rec, 0); } else { rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM); rec_set_next_offs_old(supremum_rec, 0); } -#if 1 /* testing */ - if (UNIV_LIKELY(comp)) { - page_zip = &buf_block_align(page)->page_zip; - page_zip->data = zip_data; - page_zip->size = sizeof zip_data; - page_zip->m_start = page_zip->m_end = 0; - } -#endif if (UNIV_LIKELY_NULL(page_zip)) { ut_ad(comp); - if (!page_zip_compress(page_zip, page)) { + if (!page_zip_compress(page_zip, page, index, mtr)) { /* The compression of a newly created page should always succeed. 
*/ ut_error; } } -#if 1 /* testing */ - buf_block_align(page)->page_zip.data = 0; -#endif return(page); } @@ -644,7 +651,7 @@ page_copy_rec_list_end( if (UNIV_LIKELY_NULL(new_page_zip)) { if (UNIV_UNLIKELY(!page_zip_compress(new_page_zip, - new_page))) { + new_page, index, mtr))) { if (UNIV_UNLIKELY(!page_zip_decompress( new_page_zip, new_page, mtr))) { @@ -674,7 +681,9 @@ The records are copied to the end of the record list on new_page. */ ibool page_copy_rec_list_start( /*=====================*/ - /* out: TRUE on success */ + /* out: TRUE on success; FALSE on + compression failure (new_page will + be decompressed from new_page_zip) */ page_t* new_page, /* in/out: index page to copy to */ page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ @@ -725,7 +734,7 @@ page_copy_rec_list_start( if (UNIV_LIKELY_NULL(new_page_zip)) { if (UNIV_UNLIKELY(!page_zip_compress(new_page_zip, - new_page))) { + new_page, index, mtr))) { if (UNIV_UNLIKELY(!page_zip_decompress( new_page_zip, new_page, mtr))) { @@ -899,7 +908,7 @@ page_delete_rec_list_end( offsets = rec_get_offsets(rec2, index, offsets, ULINT_UNDEFINED, &heap); - if (1 /* TODO: UNIV_LIKELY_NULL(page_zip) */) { + if (UNIV_LIKELY_NULL(page_zip)) { /* Clear the data bytes of the deleted record in order to improve the compression ratio of the page. The @@ -948,7 +957,7 @@ page_delete_rec_list_end( slot_index = page_dir_find_owner_slot(rec2); slot = page_dir_get_nth_slot(page, slot_index); - if (1 /* TODO: UNIV_UNLIKELY(page_zip != NULL) */) { + if (UNIV_LIKELY_NULL(page_zip)) { ulint n_slots; rec2 = rec; do { @@ -957,14 +966,12 @@ page_delete_rec_list_end( for deleted records. */ rec2[-REC_N_NEW_EXTRA_BYTES] = 0; rec2 = rec_get_next_ptr(rec2, TRUE); - } - while (rec2); + } while (rec2); /* The compression algorithm expects the removed slots in the page directory to be cleared. */ n_slots = page_dir_get_n_slots(page) - slot_index - 1; - ut_ad(n_slots > 0); ut_ad(n_slots < UNIV_PAGE_SIZE / PAGE_DIR_SLOT_SIZE); memset(slot - (n_slots * PAGE_DIR_SLOT_SIZE), 0, @@ -987,19 +994,17 @@ page_delete_rec_list_end( slot = page_dir_get_nth_slot(page, slot_index); } - page_dir_slot_set_rec(slot, page_zip, - page_get_supremum_rec(page)); + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); page_dir_slot_set_n_owned(slot, page_zip, n_owned); page_dir_set_n_slots(page, page_zip, slot_index + 1); /* Remove the record chain segment from the record chain */ - page_rec_set_next(prev_rec, page_get_supremum_rec(page), page_zip); + page_rec_set_next(prev_rec, page_get_supremum_rec(page)); /* Catenate the deleted chain segment to the page free list */ - page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE), - page_zip); + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); page_header_set_ptr(page, page_zip, PAGE_FREE, rec); page_header_set_field(page, page_zip, PAGE_GARBAGE, @@ -1007,6 +1012,9 @@ page_delete_rec_list_end( page_header_set_field(page, page_zip, PAGE_N_RECS, (ulint)(page_get_n_recs(page) - n_recs)); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_dir_rewrite(page_zip, page); + } } /***************************************************************** @@ -1157,7 +1165,7 @@ page_move_rec_list_start( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) 
is preserved intact */ - page_create(page, NULL, mtr, TRUE); + page_create(page, NULL, mtr, index); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the @@ -1176,7 +1184,8 @@ page_move_rec_list_start( buf_frame_free(temp_page); mtr_set_log_mode(mtr, log_mode); - if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page))) { + if (UNIV_UNLIKELY(!page_zip_compress( + page_zip, page, index, mtr))) { /* Reorganizing a page should reduce entropy, making the compressed page occupy less space. */ @@ -1218,8 +1227,7 @@ void page_dir_delete_slot( /*=================*/ page_t* page, /* in/out: the index page */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 10 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint slot_no)/* in: slot to be deleted */ { page_dir_slot_t* slot; @@ -1227,7 +1235,6 @@ page_dir_delete_slot( ulint i; ulint n_slots; - ut_ad(!page_zip || page_zip_available(page_zip, 10)); ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); ut_ad(slot_no + 1 < page_dir_get_n_slots(page)); @@ -1250,8 +1257,7 @@ page_dir_delete_slot( for (i = slot_no + 1; i < n_slots; i++) { rec_t* rec; rec = page_dir_slot_get_rec(page_dir_get_nth_slot(page, i)); - page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), - page_zip, rec); + page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec); } /* 4. Zero out the last slot, which will be removed */ @@ -1270,9 +1276,7 @@ void page_dir_add_slots( /*===============*/ page_t* page, /* in/out: the index page */ - page_zip_des_t* page_zip,/* in/out: comprssed page with at least - n * PAGE_DIR_SLOT_SIZE bytes available, - or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint start, /* in: the slot above which the new slots are added */ ulint n) /* in: number of slots to add @@ -1289,9 +1293,6 @@ page_dir_add_slots( ut_ad(start < n_slots - 1); - ut_ad(!page_zip - || page_zip_available(page_zip, n * PAGE_DIR_SLOT_SIZE)); - /* Update the page header */ page_dir_set_n_slots(page, page_zip, n_slots + n); @@ -1303,7 +1304,7 @@ page_dir_add_slots( rec = page_dir_slot_get_rec(slot); slot = page_dir_get_nth_slot(page, i + n); - page_dir_slot_set_rec(slot, page_zip, rec); + page_dir_slot_set_rec(slot, rec); } } @@ -1314,8 +1315,8 @@ void page_dir_split_slot( /*================*/ page_t* page, /* in/out: index page */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 12 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be written, or NULL */ ulint slot_no)/* in: the directory slot */ { rec_t* rec; @@ -1326,7 +1327,6 @@ page_dir_split_slot( ulint n_owned; ut_ad(page); - ut_ad(!page_zip || page_zip_available(page_zip, 12)); ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); @@ -1350,7 +1350,7 @@ page_dir_split_slot( /* 2. We add one directory slot immediately below the slot to be split. */ - page_dir_add_slots(page, page_zip/* 2 */, slot_no - 1, 1); + page_dir_add_slots(page, page_zip, slot_no - 1, 1); /* The added slot is now number slot_no, and the old slot is now number slot_no + 1 */ @@ -1360,14 +1360,13 @@ page_dir_split_slot( /* 3. We store the appropriate values to the new slot. */ - page_dir_slot_set_rec(new_slot, page_zip, rec); - page_dir_slot_set_n_owned(new_slot, page_zip/* 5 */, n_owned / 2); + page_dir_slot_set_rec(new_slot, rec); + page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2); /* 4.
Finally, we update the number of records field of the original slot */ - page_dir_slot_set_n_owned(slot, page_zip/* 5 */, - n_owned - (n_owned / 2)); + page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2)); } /***************************************************************** @@ -1379,8 +1378,7 @@ void page_dir_balance_slot( /*==================*/ page_t* page, /* in/out: index page */ - page_zip_des_t* page_zip,/* in/out: compressed page with - at least 15 bytes available, or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint slot_no)/* in: the directory slot */ { page_dir_slot_t* slot; @@ -1391,7 +1389,6 @@ page_dir_balance_slot( rec_t* new_rec; ut_ad(page); - ut_ad(!page_zip || page_zip_available(page_zip, 15)); ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); @@ -1434,7 +1431,7 @@ page_dir_balance_slot( rec_set_n_owned_old(new_rec, n_owned + 1); } - page_dir_slot_set_rec(slot, page_zip, new_rec); + page_dir_slot_set_rec(slot, new_rec); page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1); } else { diff --git a/page/page0zip.c b/page/page0zip.c index c917f3d882032790d7c96168002089ac134dc0f3..8ccbcd193b424dfa0040fe3ac20ffe26c4d8a5a7 100644 --- a/page/page0zip.c +++ b/page/page0zip.c @@ -15,6 +15,9 @@ Created June 2005 by Marko Makela #include "page0page.h" #include "mtr0log.h" #include "ut0sort.h" +#include "dict0boot.h" +#include "btr0cur.h" +#include "page0types.h" #include "zlib.h" /* Please refer to ../include/page0zip.ic for a description of the @@ -40,6 +43,146 @@ static const byte supremum_extra_data[] = { 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ }; +/************************************************************************** +Encode the length of a fixed-length column. */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + /* out: buf + length of encoded val */ + byte* buf, /* in: pointer to buffer where to write */ + ulint val) /* in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = val; + } else { + *buf++ = 0x80 | val >> 7; + *buf++ = 0xff & val; + } + + return(buf); +} + +/************************************************************************** +Write the index information for the compressed page. 
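The one-byte and two-byte codes written by page_zip_fixed_field_encode() can be read back with a short helper; the following sketch mirrors the decoding loop of page_zip_fields_decode() further below and assumes the same format (values below 126 fit in one byte, larger values set the 0x80 bit and continue into a second byte); the function name is illustrative only:

/* Illustrative sketch: decode one field code produced by
page_zip_fixed_field_encode().  Returns the number of bytes
consumed and stores the decoded value in *val. */
static ulint
page_zip_field_decode_sketch(
	const byte*	b,	/* in: encoded field information */
	ulint*		val)	/* out: decoded value */
{
	*val = *b;

	if (*val & 0x80) {
		/* two-byte code: 0x80 | val >> 7, then 0xff & val */
		*val = (*val & 0x7f) << 7 | b[1];

		return(2);
	}

	/* one-byte code: 0, 1, 0x7e, 0x7f, or a fixed length below 126 */
	return(1);
}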
*/ +static +ulint +page_zip_fields_encode( +/*===================*/ + /* out: used size of buf */ + ulint n, /* in: number of fields to compress */ + dict_index_t* index, /* in: index comprising at least n fields */ + ulint trx_id_pos,/* in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /* out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dtype_get_prtype(dict_col_get_type( + dict_field_get_col(field))) + & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + + dtype_t* type = dict_col_get_type( + dict_field_get_col(field)); + + if (UNIV_UNLIKELY(dtype_get_len(type) > 255) + || UNIV_UNLIKELY(dtype_get_mtype(type) + == DATA_BLOB)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode(buf, + fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode(buf, + fixed_sum << 1 | 1); + } + + fixed_sum = field->fixed_len; + trx_id_col = ++col; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode(buf, + fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode(buf, + field->fixed_len << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + if (trx_id_col < 128) { + *buf++ = trx_id_col; + } else { + *buf++ = 0x80 | trx_id_col >> 7; + *buf++ = 0xff & trx_id_col; + } + } + + ut_ad((ulint) (buf - buf_start) <= (n + 1) * 2); + return((ulint) (buf - buf_start)); +} + /************************************************************************** Populate the dense page directory from the sparse directory. 
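As a concrete illustration of the encoding, consider a hypothetical node-pointer page of an index whose two unique-in-tree columns are a NOT NULL CHAR(4) and a nullable VARCHAR(10). With trx_id_pos == ULINT_UNDEFINED the function emits just two bytes, and no trx_id position byte follows:

/* Illustrative example: expected contents of buf after
page_zip_fields_encode(2, index, ULINT_UNDEFINED, buf) for a
hypothetical index on (CHAR(4) NOT NULL, VARCHAR(10) NULL). */
static const byte page_zip_fields_example[] = {
	0x09,	/* 4 << 1 | 1: run of non-nullable fixed fields, 4 bytes */
	0x00	/* nullable variable-length field of at most 255 bytes */
};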
*/ static @@ -48,8 +191,9 @@ page_zip_dir_encode( /*================*/ const page_t* page, /* in: compact page */ page_zip_des_t* page_zip,/* out: dense directory on compressed page */ - const rec_t** recs) /* in: array of 0, out: dense page directory - sorted by ascending address (and heap_no) */ + const rec_t** recs) /* in: pointer to array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ { byte* rec; ulint status; @@ -61,14 +205,14 @@ page_zip_dir_encode( min_mark = 0; - if (mach_read_from_2((page_t*) page + (PAGE_HEADER + PAGE_LEVEL))) { + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { status = REC_STATUS_NODE_PTR; if (UNIV_UNLIKELY(mach_read_from_4((page_t*) page + FIL_PAGE_PREV) == FIL_NULL)) { min_mark = REC_INFO_MIN_REC_FLAG; } - } else { - status = REC_STATUS_ORDINARY; } n_heap = page_dir_get_n_heap((page_t*) page); @@ -109,9 +253,12 @@ page_zip_dir_encode( page_zip_dir_set(page_zip, i++, offs); - /* Ensure that each heap_no occurs at most once. */ - ut_a(!recs[heap_no - 2]); /* exclude infimum and supremum */ - recs[heap_no - 2] = rec; + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - 2]); + /* exclude infimum and supremum */ + recs[heap_no - 2] = rec; + } ut_a(rec_get_status(rec) == status); } @@ -132,9 +279,12 @@ page_zip_dir_encode( page_zip_dir_set(page_zip, i++, offs); - /* Ensure that each heap_no occurs at most once. */ - ut_a(!recs[heap_no - 2]); /* exclude infimum and supremum */ - recs[heap_no - 2] = rec; + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - 2]); + /* exclude infimum and supremum */ + recs[heap_no - 2] = rec; + } offs = rec_get_next_offs(rec, TRUE); } @@ -151,16 +301,25 @@ page_zip_compress( /*==============*/ /* out: TRUE on success, FALSE on failure; page_zip will be left intact on failure. */ - page_zip_des_t* page_zip,/* in: size; out: compressed page */ - const page_t* page) /* in: uncompressed page */ + page_zip_des_t* page_zip,/* in: size; out: data, n_blobs, + m_start, m_end */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ { z_stream c_stream; int err; - byte* buf; + ulint n_fields;/* number of index fields needed */ + byte* fields; /* index field information */ + byte* buf; /* compressed payload of the page */ ulint n_dense; - const byte* src; - const byte** recs; /* dense page directory, sorted by address */ + const rec_t** recs; /* dense page directory, sorted by address */ mem_heap_t* heap; + ulint trx_id_col; + ulint* offsets = NULL; + ulint n_blobs = 0; + byte* storage;/* storage of uncompressed columns */ ut_a(page_is_comp((page_t*) page)); ut_ad(page_simple_validate_new((page_t*) page)); @@ -182,21 +341,31 @@ page_zip_compress( == PAGE_NEW_SUPREMUM); } + if (page_is_leaf(page)) { + n_fields = dict_index_get_n_fields(index); + } else { + n_fields = dict_index_get_n_unique_in_tree(index); + } + /* The dense directory excludes the infimum and supremum records. 
*/ n_dense = page_dir_get_n_heap((page_t*) page) - 2; - ut_a(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip->size); + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip->size)) { + return(FALSE); + } heap = mem_heap_create(page_zip->size + + n_fields * (2 + sizeof *offsets) + n_dense * ((sizeof *recs) - PAGE_ZIP_DIR_SLOT_SIZE)); recs = mem_heap_alloc(heap, n_dense * sizeof *recs); memset(recs, 0, n_dense * sizeof *recs); + fields = mem_heap_alloc(heap, (n_fields + 1) * 2); + buf = mem_heap_alloc(heap, page_zip->size - PAGE_DATA - PAGE_ZIP_DIR_SLOT_SIZE * n_dense); - page_zip_dir_encode(page, page_zip, recs); - /* Compress the data payload. */ c_stream.zalloc = (alloc_func) 0; c_stream.zfree = (free_func) 0; @@ -206,37 +375,221 @@ page_zip_compress( ut_a(err == Z_OK); c_stream.next_out = buf; - c_stream.avail_out = page_zip->size - (PAGE_DATA + 1) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; - - if (UNIV_LIKELY(n_dense > 0) - && *recs == page + (PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { - src = page + (PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES); - recs++; - n_dense--; + /* Subtract the space reserved for uncompressed data. */ + /* Page header, n_relocated, end marker of modification log */ + c_stream.avail_out = page_zip->size + - (PAGE_DATA + 2 * PAGE_ZIP_DIR_SLOT_SIZE); + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + trx_id_col = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + ut_ad(trx_id_col > 0); + if (trx_id_col == ULINT_UNDEFINED) { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + trx_id_col = 0; + c_stream.avail_out -= n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + } else { + c_stream.avail_out -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } } else { - src = page + PAGE_ZIP_START; + c_stream.avail_out -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + + REC_NODE_PTR_SIZE); + trx_id_col = ULINT_UNDEFINED; } - while (n_dense--) { - c_stream.next_in = (void*) src; - c_stream.avail_in = *recs - src - REC_N_NEW_EXTRA_BYTES; + c_stream.avail_in = page_zip_fields_encode( + n_fields, index, trx_id_col, fields); + c_stream.next_in = fields; + if (!trx_id_col) { + trx_id_col = ULINT_UNDEFINED; + } - err = deflate(&c_stream, Z_NO_FLUSH); - if (err != Z_OK) { - goto zlib_error; + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + /* TODO: do not write to page_zip->data until deflateEnd() */ + page_zip_set_n_relocated(page_zip, 0); + page_zip_dir_encode(page, page_zip, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + /* TODO: do not write to page_zip->data until deflateEnd() */ + storage = page_zip->data + page_zip->size + - (n_dense + 1) + * PAGE_ZIP_DIR_SLOT_SIZE; + + if (page_is_leaf(page)) { + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + while (n_dense--) { + ulint i; + rec_t* rec = (rec_t*) *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) == n_fields); + + /* Compress the extra bytes. */ + c_stream.avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream.next_in; + + if (c_stream.avail_in) { + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + } + + /* Compress the data bytes. */ + + c_stream.next_in = rec; + + /* Check if there are any externally stored columns. 
+ For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + + for (i = 0; i < n_fields; i++) { + ulint len; + byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, i)); + /* Store trx_id and roll_ptr + in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + i, &len); +#ifdef UNIV_DEBUG + ut_ad(len == DATA_TRX_ID_LEN); + rec_get_nth_field(rec, offsets, + i + 1, &len); + ut_ad(len == DATA_ROLL_PTR_LEN); +#endif /* UNIV_DEBUG */ + /* Compress any preceding bytes. */ + c_stream.avail_in = src - c_stream.next_in; + + if (c_stream.avail_in) { + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + } + + ut_ad(c_stream.next_in == src); + + memcpy(storage - (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream.next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream.next_in += + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + i++; + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len > BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + c_stream.avail_in = src - c_stream.next_in; + ut_ad(c_stream.avail_in); + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(c_stream.next_in == src); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the page + modification log. */ + + if (UNIV_UNLIKELY(c_stream.avail_out + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + goto zlib_error; + } + + c_stream.avail_out + -= BTR_EXTERN_FIELD_REF_SIZE; + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + ut_ad(externs > c_stream.next_in); + + /* Copy the BLOB pointer */ + memcpy(externs, c_stream.next_in, + BTR_EXTERN_FIELD_REF_SIZE); + c_stream.next_in += + BTR_EXTERN_FIELD_REF_SIZE; + /* Increment the BLOB counter */ + n_blobs++; + } + } + + /* Compress the last bytes of the record. */ + c_stream.avail_in = rec_get_end(rec, offsets) + - c_stream.next_in; + + if (c_stream.avail_in) { + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + } } + } else { + /* This is a node pointer page. */ + while (n_dense--) { + rec_t* rec = (rec_t*) *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) == n_fields + 1); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Compress the extra bytes. */ + c_stream.avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream.next_in; + + if (c_stream.avail_in) { + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + } + + /* Compress the data bytes, except node_ptr. */ + c_stream.next_in = rec; + c_stream.avail_in = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + ut_ad(c_stream.avail_in); - src = *recs++; + err = deflate(&c_stream, Z_NO_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream.next_in, REC_NODE_PTR_SIZE); + c_stream.next_in += REC_NODE_PTR_SIZE; + } } - /* Compress the last record.
*/ - c_stream.next_in = (void*) src; - c_stream.avail_in = - page_header_get_field((page_t*) page, PAGE_HEAP_TOP) - - ut_align_offset(src, UNIV_PAGE_SIZE); - ut_a(c_stream.avail_in < UNIV_PAGE_SIZE - - PAGE_ZIP_START - PAGE_DIR); + ut_ad(page + page_header_get_field((page_t*) page, PAGE_HEAP_TOP) + == c_stream.next_in); + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); err = deflate(&c_stream, Z_FINISH); @@ -251,66 +604,25 @@ page_zip_compress( ut_a(err == Z_OK); page_zip->m_end = page_zip->m_start = PAGE_DATA + c_stream.total_out; + page_zip->n_blobs = n_blobs; /* Copy the page header */ memcpy(page_zip->data, page, PAGE_DATA); /* Copy the compressed data */ memcpy(page_zip->data + PAGE_DATA, buf, c_stream.total_out); /* Zero out the area reserved for the modification log */ memset(page_zip->data + PAGE_DATA + c_stream.total_out, 0, - c_stream.avail_out + 1); + c_stream.avail_out + PAGE_ZIP_DIR_SLOT_SIZE); mem_heap_free(heap); #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ - return(TRUE); -} -/************************************************************************** -Read an integer from the modification log of the compressed page. */ -static -ulint -page_zip_ulint_read( -/*================*/ - /* out: length of the integer, in bytes; - zero on failure */ - const byte* src, /* in: where to read */ - ulint* dest) /* out: the decoded integer */ -{ - ulint num = (unsigned char) *src; - if (num < 128) { - *dest = num; /* 0xxxxxxx: 0..127 */ - return(1); - } - if (num < 192) { /* 10xxxxxx xxxxxxxx: 0..16383 */ - *dest = ((num << 8) & ~0x8000) | (unsigned char) src[1]; - return(2); + if (UNIV_LIKELY_NULL(mtr)) { + mlog_open_and_write_index(mtr, (page_t*) page, index, + MLOG_ZIP_COMPRESS, 0); } - *dest = ULINT_MAX; - return(0); /* 11xxxxxxx xxxxxxxx: reserved */ -} -/************************************************************************** -Write an integer to the modification log of the compressed page. */ -static -ulint -page_zip_ulint_write( -/*=================*/ - /* out: length of the integer, in bytes; - zero on failure */ - byte* dest, /* in: where to write */ - ulint num) /* out: integer to write */ -{ - if (num < 128) { - *dest = num; /* 0xxxxxxx: 0..127 */ - return(1); - } - if (num < 16384) { /* 10xxxxxx xxxxxxxx: 0..16383 */ - dest[0] = num >> 8 | 0x80; - dest[1] = num; - return(2); - } - ut_error; - return(0); /* 11xxxxxxx xxxxxxxx: reserved */ + return(TRUE); } /************************************************************************** @@ -338,7 +650,120 @@ page_zip_dir_sort( ulint high) /* in: upper bound of the sorting area, exclusive */ { UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high, - page_zip_dir_cmp); + page_zip_dir_cmp); +} + +/************************************************************************** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /* in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } +} + +/************************************************************************** +Read the index information for the compressed page. 
*/ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + /* out,own: dummy index describing the page, + or NULL on error */ + const byte* buf, /* in: index information */ + const byte* end, /* in: end of buf */ + ulint* trx_id_col)/* in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ +{ + const byte* b; + ulint n; + ulint i; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS) + || UNIV_UNLIKELY(b > end)) { + + return(NULL); + } + + if (trx_id_col) { + n--; + } + + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, TRUE); + index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + index->n_uniq = n; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. */ + for (b = buf, i = 0; i < n; i++) { + ulint val = *b++; + ulint mtype; + ulint len; + + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 7 | *b++; + } + + len = val >> 1; + + switch (len) { + case 0x7e: + len = 0x7fff; + /* fall through */ + case 0: + mtype = DATA_BINARY; + break; + default: + mtype = DATA_FIXBINARY; + } + dict_mem_table_add_col(table, "DUMMY", mtype, + val & 1 ? DATA_NOT_NULL : 0, len, 0); + dict_index_add_col(index, + dict_table_get_nth_col(table, i), 0, 0); + } + + /* Decode the position of the trx_id column. */ + if (trx_id_col) { + ulint val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 7 | *b++; + } + + if (UNIV_UNLIKELY(val >= n)) { + page_zip_fields_free(index); + index = NULL; + } + + if (!val) { + val = ULINT_UNDEFINED; + } + + *trx_id_col = val; + } + + ut_ad(b == end); + + return(index); } /************************************************************************** @@ -393,6 +818,9 @@ page_zip_dir_decode( UNIV_PREFETCH_RW(slot); } + ut_ad((offs & PAGE_ZIP_DIR_SLOT_MASK) + >= PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES); + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); } @@ -419,6 +847,8 @@ page_zip_dir_decode( return(TRUE); } +/************************************************************************** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. */ static ibool page_zip_set_extra_bytes( @@ -456,14 +886,14 @@ page_zip_set_extra_bytes( return(FALSE); } - rec_set_next_offs_new(rec, NULL, offs); + rec_set_next_offs_new(rec, offs); rec = page + offs; rec[-REC_N_NEW_EXTRA_BYTES] = info_bits; info_bits = 0; } /* Set the next pointer of the last user record. */ - rec_set_next_offs_new(rec, NULL, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); /* Set n_owned of the supremum record. */ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = n_owned; @@ -493,16 +923,65 @@ page_zip_set_extra_bytes( } offs = page_zip_dir_get(page_zip, i); - rec_set_next_offs_new(rec, NULL, offs); + rec_set_next_offs_new(rec, offs); } /* Terminate the free list. */ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ - rec_set_next_offs_new(rec, NULL, 0); + rec_set_next_offs_new(rec, 0); return(TRUE); } +/************************************************************************** +Find the heap number of a record by binary search in the sorted +dense page directory. 
*/ +static +ulint +page_zip_find_heap_no( +/*==================*/ + /* out: the heap number of the smallest record + in recs[] that is >= start; 0 if not found */ + const byte* start, /* in: start address of the record */ + rec_t** recs, /* in: dense page directory, + sorted by address (indexed by heap_no - 2) */ + ulint n_dense)/* in: number of entries in recs[] */ +{ + ulint low = 0; + ulint high = n_dense; + ulint mid; + + for (;;) { + mid = (low + high) / 2; + + /* 'start' should be at least REC_N_NEW_EXTRA_BYTES + smaller than the matching entry in recs[] */ + ut_ad(start != recs[mid]); + + if (UNIV_UNLIKELY(low == high)) { + if (UNIV_UNLIKELY(start > recs[high])) { + return(0); + } + break; + } + + if (start > recs[mid]) { + /* Too high */ + high = mid; + } else { + /* Either this is too low, or we found a match. */ + low = mid + 1; + if (start > recs[low]) { + /* The adjacent record does not match. + This is the closest match. */ + break; + } + } + } + + return(mid + 2); +} + /************************************************************************** Apply the modification log to an uncompressed page. */ static @@ -513,34 +992,129 @@ page_zip_apply_log( or NULL on failure */ const byte* data, /* in: modification log */ ulint size, /* in: maximum length of the log, in bytes */ - page_t* page) /* in/out: uncompressed page */ + page_t* page, /* out: uncompressed page */ + rec_t** recs, /* in: dense page directory, + sorted by address (indexed by heap_no - 2) */ + ulint n_dense,/* in: size of recs[] */ + ulint heap_status, + /* in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /* in: index of the page */ + ulint* offsets)/* in/out: work area for + rec_get_offsets_reverse() */ { const byte* const end = data + size; - /* Apply the modification log. */ - while (*data) { - ulint ulint_len; - ulint length, offset; - ulint_len = page_zip_ulint_read(data, &length); - data += ulint_len; - if (UNIV_UNLIKELY(!ulint_len) - || UNIV_UNLIKELY(data + length >= end)) { + for (;;) { + ulint start; + rec_t* rec; + ulint len; + ulint hs; + + start = mach_read_from_2((byte*) data); + if (UNIV_UNLIKELY(data + 2 >= end)) { + return(NULL); + } + if (UNIV_UNLIKELY(!start)) { + break; + } + if (UNIV_UNLIKELY(start < PAGE_ZIP_START)) { return(NULL); } - ut_a(length > 0 && length < UNIV_PAGE_SIZE - PAGE_DATA); - ulint_len = page_zip_ulint_read(data, &offset); - data += ulint_len; - if (UNIV_UNLIKELY(!ulint_len) - || UNIV_UNLIKELY(data + length >= end)) { + data += 2; + + /* Determine the heap number of the record. */ + hs = page_zip_find_heap_no(page + start, recs, n_dense) + << REC_HEAP_NO_SHIFT; + if (UNIV_UNLIKELY(!hs)) { return(NULL); } - /* TODO: determine offset from heap_no */ - offset += PAGE_DATA; - ut_a(offset + length < UNIV_PAGE_SIZE); + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. */ + if (UNIV_UNLIKELY(hs > heap_status)) { + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + heap_status += REC_HEAP_NO_SHIFT; + } + + rec_get_offsets_reverse(data, index, + heap_status & REC_STATUS_NODE_PTR, + offsets); + + rec = page + start + rec_offs_extra_size(offsets); + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + /* Copy the extra bytes (backwards). 
*/ + { + ulint n = rec_offs_extra_size(offsets) + - REC_N_NEW_EXTRA_BYTES; + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (n--) { + *b-- = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(heap_status & REC_STATUS_NODE_PTR)) { + /* Non-leaf nodes should not contain any + externally stored columns. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + return(NULL); + } - memcpy(page + offset, data, length); - data += length; + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else { + ulint i; + byte* next_out = rec; + + /* Check if there are any externally stored columns. + For each externally stored column, skip the + BTR_EXTERN_FIELD_REF._*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + byte* dst = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len > BTR_EXTERN_FIELD_REF_SIZE); + + len += dst - next_out + - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(data + len >= end)) { + return(NULL); + } + memcpy(next_out, data, len); + data += len; + next_out += len + + BTR_EXTERN_FIELD_REF_SIZE; + } + } + + /* Copy the last bytes of the record. + Skip roll_ptr and trx_id. */ + len = rec_get_end(rec, offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - next_out; + if (UNIV_UNLIKELY(data + len >= end)) { + return(NULL); + } + memcpy(next_out, data, len); + data += len; + } } return(data); @@ -555,19 +1129,29 @@ ibool page_zip_decompress( /*================*/ /* out: TRUE on success, FALSE on failure */ - page_zip_des_t* page_zip,/* in: data, size; out: m_start, m_end */ + page_zip_des_t* page_zip,/* in: data, size; + out: m_start, m_end, n_blobs */ page_t* page, /* out: uncompressed page, may be trashed */ mtr_t* mtr) /* in: mini-transaction handle, or NULL if no logging is needed */ { z_stream d_stream; int err; - byte** recs; /* dense page directory, sorted by address */ - byte* dst; + dict_index_t* index = NULL; + rec_t** recs; /* dense page directory, sorted by address */ + rec_t** recsc; /* cursor to dense page directory */ ulint heap_status;/* heap_no and status bits */ - ulint n_dense; + ulint n_dense;/* number of user records on the page */ + ulint reloc = 0;/* index to page_zip_get_relocated() */ + ulint orig = ULINT_UNDEFINED; + /* page_zip_get_relocated(reloc), + or ULINT_UNDEFINED */ + ulint trx_id_col = ULINT_UNDEFINED; mem_heap_t* heap; - ulint info_bits; + ulint* offsets = NULL; + ulint info_bits = 0; + const byte* storage; + const byte* externs; ut_ad(page_zip_simple_validate(page_zip)); @@ -575,8 +1159,8 @@ page_zip_decompress( n_dense = page_dir_get_n_heap(page_zip->data) - 2; ut_a(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip->size); - heap = mem_heap_create(n_dense * (2 * sizeof *recs)); - recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)); + heap = mem_heap_create(n_dense * (3 * sizeof *recs)); + recsc = recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)); /* Copy the page header. 
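/* Illustrative sketch, not part of the patch: page_zip_decompress() below
drives inflate() with the output buffer handed out a little at a time
(roughly one record's worth per call), treating Z_BUF_ERROR with an
exhausted output piece as "keep going".  A minimal standalone version of
that calling pattern; the 16-byte piece size is arbitrary. */

#include <string.h>
#include <zlib.h>

static int
example_inflate_in_pieces(
	const unsigned char*	in,	/* in: compressed data */
	size_t			in_len,	/* in: length of in[] */
	unsigned char*		out,	/* out: uncompressed data */
	size_t			out_size)/* in: size of out[] */
{
	z_stream	d_stream;
	int		err;

	memset(&d_stream, 0, sizeof d_stream);
	err = inflateInit(&d_stream);
	if (err != Z_OK) {
		return(err);
	}

	d_stream.next_in = (Bytef*) in;
	d_stream.avail_in = (uInt) in_len;
	d_stream.next_out = out;

	while (d_stream.next_out < out + out_size) {
		size_t	left = out_size - (size_t) (d_stream.next_out - out);

		/* Hand out at most 16 bytes of output space per call. */
		d_stream.avail_out = (uInt) (left < 16 ? left : 16);

		err = inflate(&d_stream, Z_NO_FLUSH);

		if (err == Z_STREAM_END) {
			err = Z_OK;
			break;
		}

		if (err == Z_BUF_ERROR && !d_stream.avail_out) {
			continue;	/* the piece filled up; carry on */
		}

		if (err != Z_OK) {
			break;		/* corrupt stream or out of input */
		}
	}

	inflateEnd(&d_stream);
	return(err);
}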
*/ memcpy(page, page_zip->data, PAGE_DATA); @@ -593,10 +1177,9 @@ page_zip_decompress( infimum_extra, sizeof infimum_extra); if (UNIV_UNLIKELY(!page_get_n_recs((page_t*) page))) { rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, - NULL, PAGE_NEW_SUPREMUM); + PAGE_NEW_SUPREMUM); } else { rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, - NULL, page_zip_dir_get(page_zip, 0) & PAGE_ZIP_DIR_SLOT_MASK); } @@ -604,7 +1187,6 @@ page_zip_decompress( memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), supremum_extra_data, sizeof supremum_extra_data); - /* Decompress the user records. */ d_stream.zalloc = (alloc_func) 0; d_stream.zfree = (free_func) 0; d_stream.opaque = (voidpf) 0; @@ -613,103 +1195,322 @@ page_zip_decompress( ut_a(err == Z_OK); d_stream.next_in = page_zip->data + PAGE_DATA; - d_stream.avail_in = page_zip->size - (PAGE_DATA + 1) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + d_stream.avail_in = page_zip->size - (PAGE_DATA + 1); - info_bits = 0; + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; - if (mach_read_from_2((page_t*) page + (PAGE_HEADER + PAGE_LEVEL))) { - heap_status = REC_STATUS_NODE_PTR | 2 << REC_HEAP_NO_SHIFT; - if (UNIV_UNLIKELY(mach_read_from_4((page_t*) page + /* Decode the zlib header. */ + err = inflate(&d_stream, Z_BLOCK); + if (err != Z_OK) { + + goto zlib_error; + } + + /* Decode the index information. */ + err = inflate(&d_stream, Z_BLOCK); + if (err != Z_OK) { + + goto zlib_error; + } + + index = page_zip_fields_decode(page + PAGE_ZIP_START, + d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets + for rec_get_offsets_reverse(). */ + ulint n; + + if (page_is_leaf(page)) { + n = dict_index_get_n_fields(index); + heap_status = REC_STATUS_ORDINARY + | 2 << REC_HEAP_NO_SHIFT; + + /* Subtract the space reserved + for uncompressed data. */ + if (trx_id_col != ULINT_UNDEFINED) { + d_stream.avail_in -= n_dense + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } else { + d_stream.avail_in -= n_dense + * PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + n = dict_index_get_n_unique_in_tree(index) + 1; + heap_status = REC_STATUS_NODE_PTR + | 2 << REC_HEAP_NO_SHIFT; + + if (UNIV_UNLIKELY(mach_read_from_4((page_t*) page + FIL_PAGE_PREV) == FIL_NULL)) { - info_bits = REC_INFO_MIN_REC_FLAG; + info_bits = REC_INFO_MIN_REC_FLAG; + } + + /* Subtract the space reserved + for uncompressed data. */ + d_stream.avail_in -= n_dense + * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE); } - } else { - heap_status = REC_STATUS_ORDINARY | 2 << REC_HEAP_NO_SHIFT; + + n += 1 + REC_OFFS_HEADER_SIZE; + offsets = mem_heap_alloc(heap, n * sizeof(ulint)); + *offsets = n; } - dst = page + PAGE_ZIP_START; + if (page_zip_get_n_relocated(page_zip)) { + orig = page_zip_get_relocated(page_zip, reloc); + reloc++; + } - if (UNIV_LIKELY(n_dense > 0)) { - n_dense--; + page_zip->n_blobs = 0; - if (*recs == page + (PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { - dst = page + (PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES); - recs++; - } else { - /* This is a special case: we are - decompressing the extra bytes of the first - user record. As dst will not be pointing to a - record, we do not set the heap_no and status - bits. On the next round of the loop, dst will - point to the first user record. 
*/ + if (UNIV_UNLIKELY(!n_dense)) { + d_stream.avail_out = 0; + err = inflate(&d_stream, Z_FINISH); - goto first_inflate; + if (err == Z_STREAM_END) { + goto zlib_error; } + + goto zlib_done; } while (n_dense--) { - /* set heap_no and the status bits */ - mach_write_to_2(dst - REC_NEW_HEAP_NO, heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; -first_inflate: - d_stream.next_out = dst; - d_stream.avail_out = *recs - dst - REC_N_NEW_EXTRA_BYTES; + byte* const last = d_stream.next_out; + rec_t* rec = *recsc++; + + /* Was the record relocated? */ + if (UNIV_UNLIKELY(orig + < ut_align_offset(rec, UNIV_PAGE_SIZE))) { + /* The record was relocated since the page was + compressed. Get the original offset. */ + rec = page + orig; + + /* Get the offset of the next relocated record. */ + if (reloc < page_zip_get_n_relocated(page_zip)) { + orig = page_zip_get_relocated(page_zip, reloc); + ut_ad(ut_align_offset(rec, UNIV_PAGE_SIZE) + < orig); + reloc++; + } else { + /* End of list */ + orig = ULINT_UNDEFINED; + } + } + + d_stream.avail_out = rec - REC_N_NEW_EXTRA_BYTES - last; ut_ad(d_stream.avail_out < UNIV_PAGE_SIZE - - PAGE_ZIP_START - PAGE_DIR); + - PAGE_ZIP_START - PAGE_DIR); err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { case Z_OK: break; + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + if (UNIV_UNLIKELY(d_stream.next_out != last)) { + /* Somehow, we got a partial record. */ + goto zlib_error; + } + goto zlib_done; case Z_BUF_ERROR: if (!d_stream.avail_out) { break; } - /* fall through */ default: goto zlib_error; } - dst = *recs++; - } + ut_ad(d_stream.next_out == rec - REC_N_NEW_EXTRA_BYTES); + /* Prepare to decompress the data bytes. */ + d_stream.next_out = rec; + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; - /* Decompress the last record. */ - d_stream.next_out = dst; - d_stream.avail_out = - page_header_get_field(page, PAGE_HEAP_TOP) - - ut_align_offset(dst, UNIV_PAGE_SIZE); - ut_a(d_stream.avail_out < UNIV_PAGE_SIZE - - PAGE_ZIP_START - PAGE_DIR); + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_leaf(page)) { + ulint i; + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately._*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field( + rec, offsets, i, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN) + || rec_offs_nth_extern( + offsets, i)) { + + goto zlib_error; + } + + d_stream.avail_out = dst + - d_stream.next_out; + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + break; + case Z_STREAM_END: + if (!n_dense) { + /* This was the last + record. 
*/ + goto zlib_done; + } + goto zlib_error; + case Z_BUF_ERROR: + if (!d_stream.avail_out) { + break; + } + /* fall through */ + default: + goto zlib_error; + } + + ut_ad(d_stream.next_out == dst); + + d_stream.avail_out -= DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + d_stream.next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len > BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + d_stream.avail_out = dst + - d_stream.next_out; + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + break; + case Z_STREAM_END: + if (!n_dense) { + /* This was the last + record. */ + goto zlib_done; + } + goto zlib_error; + case Z_BUF_ERROR: + if (!d_stream.avail_out) { + break; + } + /* fall through */ + default: + goto zlib_error; + } + + ut_ad(d_stream.next_out == dst); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the + page modification log. */ + + if (UNIV_UNLIKELY(d_stream.avail_in + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + goto zlib_error; + } + + d_stream.avail_in + -= BTR_EXTERN_FIELD_REF_SIZE; + d_stream.next_out + += BTR_EXTERN_FIELD_REF_SIZE; + page_zip->n_blobs++; + } + } - if (UNIV_LIKELY(d_stream.avail_out != 0)) { - /* set heap_no and the status bits */ - mach_write_to_2(dst - REC_NEW_HEAP_NO, heap_status); - } + /* Decompress the last bytes of the record. */ + d_stream.avail_out = rec_get_end(rec, offsets) + - d_stream.next_out; - err = inflate(&d_stream, Z_FINISH); + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + break; + case Z_STREAM_END: + if (!n_dense) { + /* This was the last record. */ + goto zlib_done; + } + goto zlib_error; + case Z_BUF_ERROR: + if (!d_stream.avail_out) { + break; + } + /* fall through */ + default: + goto zlib_error; + } + } else { + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); - if (err != Z_STREAM_END) { -zlib_error: - inflateEnd(&d_stream); - mem_heap_free(heap); - return(FALSE); - } + /* Decompress the data bytes, except node_ptr. */ + d_stream.avail_out = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; - err = inflateEnd(&d_stream); - ut_a(err == Z_OK); + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + break; + case Z_STREAM_END: + if (!n_dense) { + /* This was the last record. */ + goto zlib_done; + } + goto zlib_error; + case Z_BUF_ERROR: + if (!d_stream.avail_out) { + break; + } + /* fall through */ + default: + goto zlib_error; + } - mem_heap_free(heap); + d_stream.next_out += REC_NODE_PTR_SIZE; + } - if (UNIV_UNLIKELY(!page_zip_set_extra_bytes( - page_zip, page, info_bits))) { - return(FALSE); + ut_ad(d_stream.next_out == rec_get_end(rec, offsets)); } + /* We should have run out of data in the loop. */ +zlib_error: + inflateEnd(&d_stream); + goto err_exit; + +zlib_done: + err = inflateEnd(&d_stream); + ut_a(err == Z_OK); + /* Clear the unused heap space on the uncompressed page. */ - dst = page_header_get_ptr(page, PAGE_HEAP_TOP); - memset(dst, 0, page_dir_get_nth_slot(page, - page_dir_get_n_slots(page) - 1) - dst); + memset(d_stream.next_out, 0, page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) - d_stream.next_out); /* The dense directory excludes the infimum and supremum records. 
*/ n_dense = page_dir_get_n_heap(page) - 2; @@ -721,24 +1522,95 @@ page_zip_decompress( const byte* mod_log_ptr; mod_log_ptr = page_zip_apply_log( page_zip->data + page_zip->m_start, - d_stream.avail_in, page); + d_stream.avail_in, page, recs, n_dense, + heap_status, index, offsets); + if (UNIV_UNLIKELY(!mod_log_ptr)) { - return(FALSE); + goto err_exit; } page_zip->m_end = mod_log_ptr - page_zip->data; } + page_zip_fields_free(index); + mem_heap_free(heap); + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes( + page_zip, page, info_bits))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the uncompressed fields. */ + + storage = page_zip->data + page_zip->size + - (n_dense + 1 + page_zip_get_n_relocated(page_zip)) + * PAGE_ZIP_DIR_SLOT_SIZE; + externs = storage - n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + page_zip->n_blobs = 0; + recsc = recs; + + while (n_dense--) { + rec_t* rec = *recsc++; + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_leaf(page)) { + ulint i; + ulint len; + byte* dst; + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately._*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len > BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Copy the BLOB pointer */ + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + if (trx_id_col != ULINT_UNDEFINED) { + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + memcpy(dst, storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + } else { + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + REC_NODE_PTR_SIZE); + } + } + ut_a(page_is_comp(page)); ut_ad(page_simple_validate_new(page)); if (UNIV_LIKELY_NULL(mtr)) { - byte* log_ptr = mlog_open(mtr, 11); - if (log_ptr) { - log_ptr = mlog_write_initial_log_record_fast( - page, MLOG_COMP_DECOMPRESS, - log_ptr, mtr); - mlog_close(mtr, log_ptr); - } + mlog_open_and_write_index(mtr, page, index, + MLOG_ZIP_DECOMPRESS, 0); } return(TRUE); @@ -751,8 +1623,8 @@ Check that the compressed and decompressed pages match. */ ibool page_zip_validate( /*==============*/ - const page_zip_des_t* page_zip, /* in: compressed page */ - const page_t* page) /* in: uncompressed page */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page) /* in: uncompressed page */ { page_zip_des_t temp_page_zip = *page_zip; page_t* temp_page = buf_frame_alloc(); @@ -770,64 +1642,509 @@ page_zip_validate( #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ /************************************************************************** -Write data to the compressed portion of a page. The data must already +Write an entire record on the compressed page. The data must already have been written to the uncompressed page. 
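/* Illustrative sketch, not part of the patch: each modification log entry
written by page_zip_write_rec() below begins with the 2-byte offset of the
record start (a value of 0 terminates the log), followed by the record's
extra bytes copied in reverse order, followed by those data bytes that are
not kept in the uncompressed area.  A toy writer for that framing, with
made-up fixed sizes instead of real record offsets: */

#include <string.h>

enum {
	EX_EXTRA_SIZE	= 4,	/* hypothetical rec_offs_extra_size() */
	EX_DATA_SIZE	= 10	/* hypothetical logged data size */
};

/* Append one log entry to log[]; return the number of bytes written,
excluding the 2-byte terminator left in place after the entry. */
static unsigned
example_mod_log_write(
	unsigned char*		log,	/* out: modification log */
	unsigned		start_offs,/* in: offset of the record start */
	const unsigned char*	extra,	/* in: EX_EXTRA_SIZE extra bytes */
	const unsigned char*	data)	/* in: EX_DATA_SIZE data bytes */
{
	unsigned	i;
	unsigned char*	p = log;

	p[0] = (unsigned char) (start_offs >> 8);	/* big-endian, like */
	p[1] = (unsigned char) start_offs;		/* mach_write_to_2() */
	p += 2;

	for (i = 0; i < EX_EXTRA_SIZE; i++) {
		*p++ = extra[EX_EXTRA_SIZE - 1 - i];	/* extra bytes, reversed */
	}

	memcpy(p, data, EX_DATA_SIZE);			/* data bytes */
	p += EX_DATA_SIZE;

	p[0] = p[1] = 0;				/* log terminator */

	return((unsigned) (p - log));
}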
*/ void -page_zip_write( -/*===========*/ +page_zip_write_rec( +/*===============*/ page_zip_des_t* page_zip,/* in/out: compressed page */ - const byte* str, /* in: address on the uncompressed page */ - ulint length) /* in: length of the data */ + const byte* rec, /* in: record being written */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { - ulint pos = ut_align_offset(str, UNIV_PAGE_SIZE); -#ifdef UNIV_DEBUG - ulint trailer_len = page_zip_dir_size(page_zip); -#endif /* UNIV_DEBUG */ + page_t* page; + byte* data; + byte* storage; - ut_ad(buf_block_get_page_zip(buf_block_align((byte*)str)) == page_zip); + ut_ad(buf_block_get_page_zip(buf_block_align((byte*)rec)) == page_zip); ut_ad(page_zip_simple_validate(page_zip)); - ut_ad(page_zip->m_start >= PAGE_DATA); - ut_ad(!memcmp(ut_align_down((byte*) str, UNIV_PAGE_SIZE), - page_zip->data, PAGE_ZIP_START)); - ut_ad(!page_zip->data[page_zip->m_end]); + ut_ad(page_zip->size > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); - ut_ad(pos >= PAGE_DATA); - ut_ad(pos + length <= UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE - * page_dir_get_n_slots(buf_frame_align((byte*)str))); + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(!memcmp(ut_align_down((byte*) rec, UNIV_PAGE_SIZE), + page_zip->data, PAGE_DATA)); - pos -= PAGE_DATA; - /* TODO: encode heap_no instead of pos */ + page = ut_align_down((rec_t*) rec, UNIV_PAGE_SIZE); - ut_ad(page_zip_available(page_zip, page_zip_entry_size(pos, length))); + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); /* Append to the modification log. */ - page_zip->m_end += page_zip_ulint_write( - page_zip->data + page_zip->m_end, length); - page_zip->m_end += page_zip_ulint_write( - page_zip->data + page_zip->m_end, pos); - memcpy(&page_zip->data[page_zip->m_end], str, length); - page_zip->m_end += length; - ut_ad(!page_zip->data[page_zip->m_end]); - ut_ad(page_zip->m_end + trailer_len < page_zip->size); + data = page_zip->data + page_zip->m_end; + ut_ad(!mach_read_from_2(data)); + + { + /* Identify the record by writing its start address. 0 is + reserved to indicate the end of the modification log. */ + const byte* start = rec_get_start((rec_t*) rec, offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + mach_write_to_2(data, ut_align_offset(start, UNIV_PAGE_SIZE)); + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip->data + page_zip->size + - (page_dir_get_n_heap(page) - 1 + + page_zip_get_n_relocated(page_zip)) + * PAGE_ZIP_DIR_SLOT_SIZE; + + if (page_is_leaf(page)) { + ulint i; + ulint len; + const byte* start = rec; + + /* Check if there are any externally stored columns. 
+ For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately._*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* src = rec_get_nth_field( + (rec_t*) rec, offsets, i, &len); + ut_ad(len > BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + memcpy(data, start, src - start); + data += src - start; + start = src; + + /* TODO: copy the BLOB pointer to + the appropriate place in the + uncompressed BLOB pointer array */ + } + } + + /* Log the last bytes of the record. + Skip roll_ptr and trx_id. */ + len = rec_get_end((rec_t*) rec, offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) - start; + + memcpy(data, start, len); + data += len; + start += len; + + /* Copy roll_ptr and trx_id to the uncompressed area. */ + memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new((rec_t*) rec) - 2), + start, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + ut_a(data < storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - 2) + - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE + - 2 /* for the modification log terminator */); + } else { + /* This is a node pointer page. */ + ulint len; + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new((rec_t*) rec) - 2), + rec + len, + REC_NODE_PTR_SIZE); + ut_a(data < storage + - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - 2) + - 2 /* for the modification log terminator */); + } + + page_zip->m_end = data - page_zip->data; + ut_a(!mach_read_from_2(data)); + + /* TODO: write a redo log record (MLOG_ZIP_WRITE_REC), + or is it at all needed? */ +} + +/************************************************************************** +Write the BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
*/ + +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in/out: record whose data is being + written */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint n, /* in: column index */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ +{ + byte* field; + byte* storage; + page_t* page = buf_frame_align((byte*) rec); + ulint blob_no; + ulint next_offs; + ulint len; + + ut_ad(buf_block_get_page_zip(buf_block_align((byte*)rec)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->size > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(!memcmp(page, page_zip->data, PAGE_DATA)); + + ut_ad(page_is_leaf(page)); + + blob_no = 0; + next_offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE); + ut_a(next_offs > PAGE_NEW_SUPREMUM_END); + + do { + rec_t* r = page + next_offs; + + if (r == rec) { + goto found; + } + + blob_no += rec_get_n_extern_new(r, index, ULINT_UNDEFINED); + + next_offs = rec_get_next_offs(r, TRUE); + ut_a(next_offs > 0); + } while (next_offs != PAGE_NEW_SUPREMUM); + + ut_error; + +found: + blob_no += rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + /* The heap number of the first user record is 2. */ + storage = page_zip->data + page_zip->size + - (page_dir_get_n_heap(page) - 2) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new((rec_t*) rec) - 2) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - blob_no * BTR_EXTERN_FIELD_REF_SIZE; + field = rec_get_nth_field((rec_t*) rec, offsets, n, &len); + + memcpy(storage, field + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + if (mtr) { + mlog_write_initial_log_record( + (rec_t*) rec, MLOG_ZIP_WRITE_BLOB_PTR, mtr); + /* TODO: write n */ + } +} + +/************************************************************************** +Write the node pointer of a record on a non-leaf compressed page. */ + +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + ulint ptr, /* in: node pointer */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; + page_t* page = buf_frame_align(rec); + + ut_ad(buf_block_get_page_zip(buf_block_align(rec)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->size > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(!memcmp(page, page_zip->data, PAGE_DATA)); + + ut_ad(!page_is_leaf(page)); + + /* The heap number of the first user record is 2. 
*/ + storage = page_zip->data + page_zip->size + - (page_dir_get_n_heap(page) - 2) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 2) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG - ut_a(page_zip_validate(page_zip, - ut_align_down((byte*) str, UNIV_PAGE_SIZE))); + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if REC_NODE_PTR_SIZE != 4 +# error "REC_NODE_PTR_SIZE != 4" +#endif + mach_write_to_4(field, ptr); + memcpy(storage, field, REC_NODE_PTR_SIZE); + + if (mtr) { + mlog_write_initial_log_record( + rec, MLOG_ZIP_WRITE_NODE_PTR, mtr); + } } -#ifdef UNIV_DEBUG /************************************************************************** -Determine if enough space is available in the modification log. */ +Write the trx_id of a record on a B-tree leaf node page. */ -ibool -page_zip_available_noninline( -/*=========================*/ - /* out: TRUE if enough space - is available */ - const page_zip_des_t* page_zip,/* in: compressed page */ - ulint size) +void +page_zip_write_trx_id( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + dulint trx_id, /* in: transaction identifier */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ { - return(page_zip_available(page_zip, size)); + byte* field; + byte* storage; + page_t* page = ut_align_down(rec, UNIV_PAGE_SIZE); + + ut_ad(buf_block_get_page_zip(buf_block_align(rec)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->size > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(!memcmp(page, page_zip->data, PAGE_DATA)); + + ut_ad(page_is_leaf(page)); + + /* The heap number of the first user record is 2. */ + storage = page_zip->data + page_zip->size + - (page_dir_get_n_heap(page) - 2) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 2) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + field = rec + size + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(field, trx_id); + memcpy(storage, field, DATA_TRX_ID_LEN); + + if (mtr) { + mlog_write_initial_log_record( + rec, MLOG_ZIP_WRITE_TRX_ID, mtr); + } +} + +/************************************************************************** +Write the roll_ptr of a record on a B-tree leaf node page. */ + +void +page_zip_write_roll_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + dulint roll_ptr,/* in: roll_ptr */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; + page_t* page = ut_align_down(rec, UNIV_PAGE_SIZE); + + ut_ad(buf_block_get_page_zip(buf_block_align(rec)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->size > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(!memcmp(page, page_zip->data, PAGE_DATA)); + + ut_ad(page_is_leaf(page)); + + /* The heap number of the first user record is 2. 
*/ + storage = page_zip->data + page_zip->size + - (page_dir_get_n_heap(page) - 2) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 2) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + + DATA_TRX_ID_LEN; + field = rec + size + - DATA_ROLL_PTR_LEN; +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(field, roll_ptr); + memcpy(storage, field, DATA_ROLL_PTR_LEN); + + if (mtr) { + mlog_write_initial_log_record( + rec, MLOG_ZIP_WRITE_TRX_ID, mtr); + } +} + +/************************************************************************** +Clear an area on the uncompressed and compressed page, if possible. */ + +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to clear */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction */ +{ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, ut_align_down(rec, UNIV_PAGE_SIZE))); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (page_zip_available(page_zip, rec_offs_size(offsets), + page_is_leaf(page_zip->data), 0)) { + memset(rec - rec_offs_extra_size(offsets), 0, + rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES); + memset(rec, 0, rec_offs_data_size(offsets)); + + /* Log that the data was zeroed out. */ + page_zip_write_rec(page_zip, rec, offsets); + } else { + /* There is not enough space to log the clearing. + Try to clear the block and to recompress the page. */ + + byte* buf = mem_alloc(rec_offs_size(offsets)); + memcpy(buf, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + + memset(rec - rec_offs_extra_size(offsets), 0, + rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES); + memset(rec, 0, rec_offs_data_size(offsets)); + /* TODO: maybe log the memset()s? */ + + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, + ut_align_down(rec, UNIV_PAGE_SIZE), + index, mtr))) { + /* Compression failed. Restore the block. */ + memcpy(rec - rec_offs_extra_size(offsets), buf, + rec_offs_size(offsets)); + } + + mem_free(buf); + } +} + +/************************************************************************** +Populate the dense page directory on the compressed page +from the sparse directory on the uncompressed row_format=compact page. */ +void +page_zip_dir_rewrite( +/*=================*/ + page_zip_des_t* page_zip,/* out: dense directory on compressed page */ + const page_t* page) /* in: uncompressed page */ +{ + page_zip_dir_encode(page, page_zip, NULL); +} + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. 
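/* Illustrative sketch, not part of the patch: each dense directory slot
is a 2-byte big-endian value whose low-order bits hold the record offset
and whose top bits hold status flags, so page_zip_rec_set_deleted() and
page_zip_rec_set_owned() below can toggle a flag by touching only the
first byte of the slot.  The flag mask used here is an assumption made
for illustration (PAGE_ZIP_DIR_SLOT_DEL itself is defined elsewhere). */

#define EX_SLOT_DEL	0x8000U		/* hypothetical "deleted" flag bit */

static void
example_slot_set_deleted(
	unsigned char*	slot,	/* in/out: 2-byte big-endian directory slot */
	int		flag)	/* in: nonzero to set, zero to clear */
{
	if (flag) {
		slot[0] |= (unsigned char) (EX_SLOT_DEL >> 8);
	} else {
		slot[0] &= (unsigned char) ~(EX_SLOT_DEL >> 8);
	}
}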
*/ + +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, + ut_align_offset(rec, UNIV_PAGE_SIZE)); + ut_a(slot); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } +} + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ + +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, + ut_align_offset(rec, UNIV_PAGE_SIZE)); + ut_a(slot); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } +} + + +/************************************************************************** +Shift the dense page directory when a record is deleted. */ + +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: deleted record */ + const byte* free) /* in: previous start of the free list */ +{ + byte* slot_rec; + byte* slot_free; + + slot_rec = page_zip_dir_find(page_zip, + ut_align_offset(rec, UNIV_PAGE_SIZE)); + slot_free = page_zip_dir_find_free(page_zip, + ut_align_offset(free, UNIV_PAGE_SIZE)); + + ut_a(slot_rec); + + if (UNIV_UNLIKELY(!slot_free)) { + /* Make the last slot the start of the free list. */ + slot_free = page_zip->data + page_zip->size + - PAGE_ZIP_DIR_SLOT_SIZE + * page_dir_get_n_heap(page_zip->data); + } else { + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } + + if (UNIV_LIKELY(slot_free < slot_rec)) { + memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, + slot_rec - slot_free - PAGE_ZIP_DIR_SLOT_SIZE); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, ut_align_offset(rec, UNIV_PAGE_SIZE)); } -#endif /* UNIV_DEBUG */ diff --git a/rem/rem0rec.c b/rem/rem0rec.c index 9af3e80b488d37e43c86ee17e508925dcd37a594..a02b63ad21a4f1fa375b8f0bef22959bd6510644 100644 --- a/rem/rem0rec.c +++ b/rem/rem0rec.c @@ -136,6 +136,84 @@ rec_validate_old( /* out: TRUE if ok */ rec_t* rec); /* in: physical record */ +/********************************************************** +Determine how many of the first n columns in a compact +physical record are stored externally. 
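/* Illustrative sketch, not part of the patch: for a column that may be
longer than 255 bytes (or a BLOB), the compact record header scanned by
rec_get_n_extern_new() below stores the length in one byte when it is
below 128, and otherwise in two bytes of the form 1exxxxxx xxxxxxxx,
where bit 0x40 ('e') marks an externally stored column and the remaining
14 bits give the locally stored length.  A standalone decoder for one
such entry, reading forwards for simplicity: */

/* Decode one length entry from lens[]; return bytes consumed (1 or 2). */
static unsigned
example_decode_long_field_len(
	const unsigned char*	lens,	/* in: length bytes */
	unsigned*		len,	/* out: locally stored length */
	int*			is_extern)/* out: nonzero if stored externally */
{
	if (!(lens[0] & 0x80)) {
		*len = lens[0];
		*is_extern = 0;
		return(1);
	}

	*is_extern = (lens[0] & 0x40) != 0;
	*len = ((unsigned) (lens[0] & 0x3f) << 8) | lens[1];
	return(2);
}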
*/ + +ulint +rec_get_n_extern_new( +/*=================*/ + /* out: number of externally stored columns */ + const rec_t* rec, /* in: compact physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n) /* in: number of columns to scan */ +{ + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint n_extern; + ulint i; + + ut_ad(index->table->comp); + ut_ad(rec_get_status((rec_t*) rec) == REC_STATUS_ORDINARY); + ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index)); + + if (n == ULINT_UNDEFINED) { + n = dict_index_get_n_fields(index); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + null_mask = 1; + n_extern = 0; + i = 0; + + /* read the lengths of fields 0..n */ + do { + ulint len; + + field = dict_index_get_nth_field(index, i); + if (!(dtype_get_prtype(dict_col_get_type( + dict_field_get_col(field))) + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. */ + continue; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + dtype_t* type = dict_col_get_type( + dict_field_get_col(field)); + len = *lens--; + if (UNIV_UNLIKELY(dtype_get_len(type) > 255) + || UNIV_UNLIKELY(dtype_get_mtype(type) + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + if (len & 0x40) { + n_extern++; + } + lens--; + } + } + } + } while (++i < n); + + return(n_extern); +} + /********************************************************** The following function determines the offsets to each field in the record. The offsets are written to a previously allocated array of @@ -364,6 +442,118 @@ rec_get_offsets_func( return(offsets); } +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. 
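/* Illustrative sketch, not part of the patch: rec_get_offsets_reverse()
below and rec_get_n_extern_new() above both walk a per-record null bitmap
one bit per nullable column, moving to the next bitmap byte whenever the
mask wraps past bit 7; only the direction of that move differs, because
one reads the header in place (backwards) and the other reads a reversed
copy (forwards).  The walk in isolation, advancing forwards: */

/* Return nonzero if the next nullable column is SQL NULL; advance the
bitmap cursor. */
static int
example_read_null_bit(
	const unsigned char**	nulls,		/* in/out: bitmap position */
	unsigned*		null_mask)	/* in/out: current bit mask */
{
	int	is_null;

	if (!(unsigned char) *null_mask) {
		/* The previous byte is used up; move to the next one. */
		(*nulls)++;
		*null_mask = 1;
	}

	is_null = (**nulls & *null_mask) != 0;
	*null_mask <<= 1;

	return(is_null);
}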
*/ + +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /* in: the extra bytes of a compact record + in reverse order, excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + dict_index_t* index, /* in: record descriptor */ + ibool node_ptr,/* in: TRUE=node pointer, FALSE=leaf node */ + ulint* offsets)/* in/out: array consisting of offsets[0] + allocated elements */ +{ + ulint n; + ulint i; + ulint offs; + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint n_node_ptr_field; + + ut_ad(extra); + ut_ad(index); + ut_ad(offsets); + ut_ad(index->table->comp); + + if (UNIV_UNLIKELY(node_ptr)) { + n_node_ptr_field = dict_index_get_n_unique_in_tree(index); + n = n_node_ptr_field + 1; + } else { + n_node_ptr_field = ULINT_UNDEFINED; + n = dict_index_get_n_fields(index); + } + + ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE)); + rec_offs_set_n_fields(offsets, n); + + nulls = extra; + lens = nulls + (index->n_nullable + 7) / 8; + i = offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dtype_get_prtype(dict_col_get_type( + dict_field_get_col(field))) + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls++; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. */ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + dtype_t* type = dict_col_get_type( + dict_field_get_col(field)); + len = *lens++; + if (UNIV_UNLIKELY(dtype_get_len(type) > 255) + || UNIV_UNLIKELY(dtype_get_mtype(type) + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens++; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len & 0x4000)) { + len = offs | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } + resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) = + ((lens - 1) - extra) | REC_OFFS_COMPACT; +} + /**************************************************************** The following function is used to get a pointer to the nth data field in an old-style record. 
*/ @@ -632,6 +822,9 @@ rec_set_nth_field_extern_bit_new( /* toggle the extern bit */ len |= 0x40; if (mtr) { + /* TODO: page_zip: + log this differently, + or remove altogether */ mlog_write_ulint(lens + 1, len, MLOG_1BYTE, mtr); } else { @@ -904,8 +1097,7 @@ rec_convert_dtuple_to_rec_new( memset (lens + 1, 0, nulls - lens); /* Set the info bits of the record */ - rec_set_info_and_status_bits(rec, NULL, - dtuple_get_info_bits(dtuple)); + rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); /* Store the data and the offsets */ diff --git a/row/row0ins.c b/row/row0ins.c index c20ca3b4cd7d993bf1c88d37c0e4b14ce1be2268..7ba57ae3b787a453607690331b2a481520b910c1 100644 --- a/row/row0ins.c +++ b/row/row0ins.c @@ -273,7 +273,10 @@ row_ins_sec_index_entry_by_modify( err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, update, 0, thr, mtr); - if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: err = DB_FAIL; } } else { @@ -337,7 +340,10 @@ row_ins_clust_index_entry_by_modify( err = btr_cur_optimistic_update(0, cursor, update, 0, thr, mtr); - if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: err = DB_FAIL; } } else { @@ -1919,7 +1925,7 @@ existing record, and we must write an undo log record on the delete marked record. If the index is secondary, and a record with exactly the same fields is found, the other record is necessarily marked deleted. It is then unmarked. Otherwise, the entry is just inserted to the index. */ - +static ulint row_ins_index_entry_low( /*====================*/ @@ -2063,7 +2069,9 @@ row_ins_index_entry_low( } if (err == DB_SUCCESS) { + /* TODO: set these before insert */ if (ext_vec) { + /* TODO: page_zip, mtr=NULL */ rec_set_field_extern_bits(insert_rec, index, ext_vec, n_ext_vec, &mtr); } @@ -2083,7 +2091,8 @@ row_ins_index_entry_low( offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - err = btr_store_big_rec_extern_fields(index, rec, 0/*TODO*/, + /* TODO: set the extern bits outside this function */ + err = btr_store_big_rec_extern_fields(index, rec, offsets, big_rec, &mtr); if (modify) { @@ -2409,7 +2418,7 @@ row_ins_step( goto same_trx; } - trx_write_trx_id(node->trx_id_buf, NULL, trx->id); + trx_write_trx_id(node->trx_id_buf, trx->id); err = lock_table(0, node->table, LOCK_IX, thr); diff --git a/row/row0purge.c b/row/row0purge.c index 67a82c32ddbf29ad83bf5b8ffd236524cea71f42..7e49155db0a24a459a36a1022876c1de0db58ba0 100644 --- a/row/row0purge.c +++ b/row/row0purge.c @@ -370,16 +370,16 @@ row_purge_upd_exist_or_extern( ulint rseg_id; ulint page_no; ulint offset; - ulint internal_offset; - byte* data_field; - ulint data_field_len; ulint i; + ulint* offsets; mtr_t mtr; ut_ad(node); + offsets = NULL; if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + heap = NULL; goto skip_secondaries; } @@ -399,7 +399,7 @@ row_purge_upd_exist_or_extern( node->index = dict_table_get_next_index(node->index); } - mem_heap_free(heap); + mem_heap_empty(heap); skip_secondaries: /* Free possible externally stored fields */ @@ -407,10 +407,14 @@ row_purge_upd_exist_or_extern( ufield = upd_get_nth_field(node->update, i); - if (ufield->extern_storage) { + if (UNIV_UNLIKELY(ufield->extern_storage)) { + byte* rec; + ulint j; + ulint internal_offset; + /* We use the fact that new_val points to node->undo_rec and get thus the offset of - dfield data inside the unod record. 
Then we + dfield data inside the undo record. Then we can calculate from node->roll_ptr the file address of the new_val data */ @@ -445,23 +449,43 @@ row_purge_upd_exist_or_extern( /* We assume in purge of externally stored fields that the space id of the undo log record is 0! */ - data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr) - + offset + internal_offset; + rec = buf_page_get(0, page_no, RW_X_LATCH, &mtr) + + internal_offset; #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(buf_frame_align(data_field), SYNC_TRX_UNDO_PAGE); #endif /* UNIV_SYNC_DEBUG */ - - data_field_len = ufield->new_val.len; - btr_free_externally_stored_field(index, data_field, - data_field_len, - 0/*TODO*/, - FALSE, &mtr); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + for (j = 0; j < rec_offs_n_fields(offsets); j++) { + ulint len; + byte* field = rec_get_nth_field( + rec, offsets, j, &len); + + if (UNIV_UNLIKELY(rec + offset == field)) { + ut_a(len == ufield->new_val.len); + ut_a(rec_offs_nth_extern(offsets, j)); + goto found_field; + } + } + + /* field not found */ + ut_error; + +found_field: + btr_free_externally_stored_field(index, rec, offsets, + buf_block_get_page_zip( + buf_block_align(rec)), + j, FALSE, &mtr); mtr_commit(&mtr); } } + + if (heap) { + mem_heap_free(heap); + } } /*************************************************************** diff --git a/row/row0row.c b/row/row0row.c index 50bba7c06012d37c2c9aff2907bcf04580b04f5d..3f46b960158a6aedfd22ff8d30084c265e0ff9ad 100644 --- a/row/row0row.c +++ b/row/row0row.c @@ -28,52 +28,16 @@ Created 4/20/1996 Heikki Tuuri #include "read0read.h" /************************************************************************* -Reads the trx id or roll ptr field from a clustered index record: this function -is slower than the specialized inline functions. */ +Gets the offset of trx id field, in bytes relative to the origin of +a clustered index record. */ -dulint -row_get_rec_sys_field( +ulint +row_get_trx_id_offset( /*==================*/ - /* out: value of the field */ - ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + /* out: offset of DATA_TRX_ID */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ const ulint* offsets)/* in: rec_get_offsets(rec, index) */ -{ - ulint pos; - byte* field; - ulint len; - - ut_ad(index->type & DICT_CLUSTERED); - - pos = dict_index_get_sys_col_pos(index, type); - - field = rec_get_nth_field(rec, offsets, pos, &len); - - if (type == DATA_TRX_ID) { - - return(trx_read_trx_id(field)); - } else { - ut_ad(type == DATA_ROLL_PTR); - - return(trx_read_roll_ptr(field)); - } -} - -/************************************************************************* -Sets the trx id or roll ptr field in a clustered index record: this function -is slower than the specialized inline functions. 
*/ - -void -row_set_rec_sys_field( -/*==================*/ - ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ - rec_t* rec, /* in/out: record */ - page_zip_des_t* page_zip,/* in/out: compressed page with at least - 10 or 11 bytes available, or NULL */ - dict_index_t* index, /* in: clustered index */ - const ulint* offsets,/* in: rec_get_offsets(rec, index) */ - dulint val) /* in: value to set */ { ulint pos; byte* field; @@ -82,18 +46,13 @@ row_set_rec_sys_field( ut_ad(index->type & DICT_CLUSTERED); ut_ad(rec_offs_validate(rec, index, offsets)); - pos = dict_index_get_sys_col_pos(index, type); + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); field = rec_get_nth_field(rec, offsets, pos, &len); - if (type == DATA_TRX_ID) { + ut_ad(len == DATA_TRX_ID_LEN); - trx_write_trx_id(field, page_zip/* 10 bytes */, val); - } else { - ut_ad(type == DATA_ROLL_PTR); - - trx_write_roll_ptr(field, page_zip/* 11 bytes */, val); - } + return(field - rec); } /********************************************************************* diff --git a/row/row0umod.c b/row/row0umod.c index f906027033f52d20c392ced973a34cf1cbe9e631..d773efae13db3393ba8d78e50b5bd555ed6ffc2f 100644 --- a/row/row0umod.c +++ b/row/row0umod.c @@ -455,9 +455,12 @@ row_undo_mod_del_unmark_sec_and_undo_update( err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, btr_cur, update, 0, thr, &mtr); - if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { - err = DB_FAIL; - } + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } } else { ut_a(mode == BTR_MODIFY_TREE); err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG diff --git a/row/row0upd.c b/row/row0upd.c index 0158c55cf49a98664145ab360a293e286efdcec1..64a9dc0448a96c62f1a92ddac0622ac08c4ee3f0 100644 --- a/row/row0upd.c +++ b/row/row0upd.c @@ -308,16 +308,20 @@ row_upd_rec_sys_fields_in_recovery( dulint trx_id, /* in: transaction id */ dulint roll_ptr)/* in: roll ptr of the undo log record */ { - byte* field; - ulint len; - - field = rec_get_nth_field(rec, offsets, pos, &len); - ut_ad(len == DATA_TRX_ID_LEN); - trx_write_trx_id(field, page_zip, trx_id); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_trx_id(page_zip, rec, + rec_offs_size(offsets), trx_id, NULL); + page_zip_write_roll_ptr(page_zip, rec, + rec_offs_size(offsets), roll_ptr, NULL); + } else { + byte* field; + ulint len; - field = rec_get_nth_field(rec, offsets, pos + 1, &len); - ut_ad(len == DATA_ROLL_PTR_LEN); - trx_write_roll_ptr(field, page_zip, roll_ptr); + field = rec_get_nth_field(rec, offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_write_trx_id(field, trx_id); + trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr); + } } /************************************************************************* @@ -346,10 +350,10 @@ row_upd_index_entry_sys_field( field = dfield_get_data(dfield); if (type == DATA_TRX_ID) { - trx_write_trx_id(field, NULL, val); + trx_write_trx_id(field, val); } else { ut_ad(type == DATA_ROLL_PTR); - trx_write_roll_ptr(field, NULL, val); + trx_write_roll_ptr(field, val); } } @@ -437,7 +441,9 @@ row_upd_rec_in_place( /*=================*/ rec_t* rec, /* in/out: record where replaced */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ - upd_t* update) /* in: update vector */ + upd_t* update, /* in: update vector */ + page_zip_des_t* page_zip)/* in: compressed page with enough space + available, or NULL */ { upd_field_t* upd_field; dfield_t* new_val; @@ -447,7 +453,7 @@ row_upd_rec_in_place( 
ut_ad(rec_offs_validate(rec, NULL, offsets)); if (rec_offs_comp(offsets)) { - rec_set_info_bits_new(rec, NULL, update->info_bits); + rec_set_info_bits_new(rec, update->info_bits); } else { rec_set_info_bits_old(rec, update->info_bits); } @@ -462,6 +468,10 @@ row_upd_rec_in_place( dfield_get_data(new_val), dfield_get_len(new_val)); } + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_rec(page_zip, rec, offsets); + } } /************************************************************************* @@ -485,7 +495,7 @@ row_upd_write_sys_vals_to_log( log_ptr += mach_write_compressed(log_ptr, dict_index_get_sys_col_pos(index, DATA_TRX_ID)); - trx_write_roll_ptr(log_ptr, NULL, roll_ptr); + trx_write_roll_ptr(log_ptr, roll_ptr); log_ptr += DATA_ROLL_PTR_LEN; log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); @@ -1410,7 +1420,9 @@ row_upd_clust_rec_by_insert( btr_cur = btr_pcur_get_btr_cur(pcur); if (node->state != UPD_NODE_INSERT_CLUSTERED) { - ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec; + dict_index_t* index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; *offsets_ = (sizeof offsets_) / sizeof *offsets_; err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, @@ -1425,10 +1437,13 @@ row_upd_clust_rec_by_insert( free those externally stored fields even if the delete marked record is removed from the index tree, or updated. */ - btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur), - 0/*TODO*/, - rec_get_offsets(btr_cur_get_rec(btr_cur), - dict_table_get_first_index(table), offsets_, + rec = btr_cur_get_rec(btr_cur); + index = dict_table_get_first_index(table); + + btr_cur_mark_extern_inherited_fields( + buf_block_get_page_zip(buf_block_align(rec)), + rec, index, + rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), node->update, mtr); if (check_ref) { /* NOTE that the following call loses @@ -1524,9 +1539,9 @@ row_upd_clust_rec( mtr_commit(mtr); - if (err == DB_SUCCESS) { + if (UNIV_LIKELY(err == DB_SUCCESS)) { - return(err); + return(DB_SUCCESS); } /* We may have to modify the tree structure: do a pessimistic descent @@ -1560,7 +1575,7 @@ row_upd_clust_rec( ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); rec = btr_cur_get_rec(btr_cur); - err = btr_store_big_rec_extern_fields(index, rec, 0/*TODO*/, + err = btr_store_big_rec_extern_fields(index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), big_rec, mtr); @@ -2046,7 +2061,7 @@ row_upd_in_place_in_select( err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur, node->update, node->cmpl_info, thr, mtr); - /* TODO: the above can fail if page_zip != NULL. + /* TODO: the above can fail with DB_ZIP_OVERFLOW if page_zip != NULL. However, this function row_upd_in_place_in_select() is only invoked when executing UPDATE statements of the built-in InnoDB SQL parser. 
The built-in SQL is only used for InnoDB system tables, which diff --git a/trx/trx0rec.c b/trx/trx0rec.c index 20e6cfebfd65520c7b694e42c936c4e2e1743f6d..181e7d5a0e3e52c1eee00a36106f57915934fbd3 100644 --- a/trx/trx0rec.c +++ b/trx/trx0rec.c @@ -807,7 +807,7 @@ trx_undo_update_rec_get_update( upd_field = upd_get_nth_field(update, n_fields); buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); - trx_write_trx_id(buf, NULL, trx_id); + trx_write_trx_id(buf, trx_id); upd_field_set_field_no(upd_field, dict_index_get_sys_col_pos(index, DATA_TRX_ID), @@ -816,7 +816,7 @@ trx_undo_update_rec_get_update( upd_field = upd_get_nth_field(update, n_fields + 1); buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); - trx_write_roll_ptr(buf, NULL, roll_ptr); + trx_write_roll_ptr(buf, roll_ptr); upd_field_set_field_no(upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), @@ -1408,7 +1408,7 @@ trx_undo_prev_version_build( buf = mem_heap_alloc(heap, rec_offs_size(offsets)); *old_vers = rec_copy(buf, rec, offsets); rec_offs_make_valid(*old_vers, index, offsets); - row_upd_rec_in_place(*old_vers, offsets, update); + row_upd_rec_in_place(*old_vers, offsets, update, NULL); } return(DB_SUCCESS);
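/* Illustrative sketch, not part of the patch: the trx_write_trx_id() and
trx_write_roll_ptr() calls above store fixed-width integers into 6- and
7-byte columns.  Under the assumption that the value is stored most
significant byte first and fits in a 64-bit type (the real code uses
dulint and mach_write_to_6()), the 6-byte case looks roughly like this;
the column width matches the compile-time checks earlier in the patch. */

#include <stdint.h>

enum { EX_TRX_ID_LEN = 6 };	/* width of DATA_TRX_ID */

/* Store the low 48 bits of id at ptr[0..5], most significant byte first. */
static void
example_write_trx_id(unsigned char* ptr, uint64_t id)
{
	int	i;

	for (i = EX_TRX_ID_LEN - 1; i >= 0; i--) {
		ptr[i] = (unsigned char) id;
		id >>= 8;
	}
}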