Commit 1a8a63d0 authored by marko

branches/zip: Reimplement merge sort in fast index creation.

The creation of the primary key does not work.  We will have to flag
externally stored columns and copy the externally stored part from
the old table.

row_build_index_for_mysql(): Rename to row_merge_build_indexes().
Move from row0mysql.c to row0merge.c.

Remove private declarations from row0merge.h.  Make many functions static
in row0merge.c.

cmp_rec_rec_simple(): A new comparison function.

dict_index_get_min_size(): New function.

OS_FILE_FROM_FD(fd): A macro for converting from int to os_file_t.

rec_convert_dtuple_to_rec_comp(): Make the interface lower-level.

rec_get_converted_size_comp(): Return also extra_size.

UT_SORT_FUNCTION_BODY(): Remove reference to an obsolete test program.

row_rec_to_index_entry_low(): New function.

row0merge.c: Implement merge sort based on file streams instead of
fixed-size blocks.  Sort the small blocks as arrays of dfield_t*,
because it is faster than invoking rec_get_offsets() for every
comparison.
parent 673f836f
...@@ -8283,7 +8283,7 @@ ha_innobase::add_index( ...@@ -8283,7 +8283,7 @@ ha_innobase::add_index(
/* Read clustered index of the table and build indexes /* Read clustered index of the table and build indexes
based on this information using temporary files and merge based on this information using temporary files and merge
sort.*/ sort.*/
error = row_build_index_for_mysql( error = row_merge_build_indexes(
trx, innodb_table, indexed_table, index, trx, innodb_table, indexed_table, index,
num_of_idx); num_of_idx);
......
...@@ -656,6 +656,14 @@ dict_table_get_sys_col_no( ...@@ -656,6 +656,14 @@ dict_table_get_sys_col_no(
const dict_table_t* table, /* in: table */ const dict_table_t* table, /* in: table */
ulint sys); /* in: DATA_ROW_ID, ... */ ulint sys); /* in: DATA_ROW_ID, ... */
/************************************************************************ /************************************************************************
Returns the minimum data size of an index record. */
UNIV_INLINE
ulint
dict_index_get_min_size(
/*====================*/
/* out: minimum data size in bytes */
const dict_index_t* index); /* in: index */
/************************************************************************
Check whether the table uses the compact page format. */ Check whether the table uses the compact page format. */
UNIV_INLINE UNIV_INLINE
ibool ibool
......
...@@ -507,6 +507,26 @@ dict_index_get_nth_col_no( ...@@ -507,6 +507,26 @@ dict_index_get_nth_col_no(
return(dict_col_get_no(dict_index_get_nth_col(index, pos))); return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
} }
/************************************************************************
Computes the minimum data size of an index record by adding up the
minimum sizes of the columns of all fields of the index. */
UNIV_INLINE
ulint
dict_index_get_min_size(
/*====================*/
				/* out: minimum data size in bytes */
	const dict_index_t*	index)	/* in: index */
{
	const ulint	n_fields = dict_index_get_n_fields(index);
	ulint		min_size = 0;
	ulint		i;

	/* Accumulate the per-column minimum over every field. */
	for (i = 0; i < n_fields; i++) {
		min_size += dict_col_get_min_size(
			dict_index_get_nth_col(index, i));
	}

	return(min_size);
}
/************************************************************************* /*************************************************************************
Gets the space id of the root of the index tree. */ Gets the space id of the root of the index tree. */
UNIV_INLINE UNIV_INLINE
......
...@@ -43,8 +43,10 @@ extern ulint os_n_pending_writes; ...@@ -43,8 +43,10 @@ extern ulint os_n_pending_writes;
#ifdef __WIN__ #ifdef __WIN__
#define os_file_t HANDLE #define os_file_t HANDLE
/* Convert a C runtime file descriptor to the native file handle.
_get_osfhandle() returns an integer type (intptr_t), so cast it
explicitly to HANDLE, which is what os_file_t is on this branch. */
#define OS_FILE_FROM_FD(fd) ((HANDLE) _get_osfhandle(fd))
#else #else
typedef int os_file_t; typedef int os_file_t;
/* Convert a file descriptor to os_file_t. Here os_file_t is int, so
the descriptor is used as is; the argument is parenthesized so that
the macro expands safely inside larger expressions. */
#define OS_FILE_FROM_FD(fd) (fd)
#endif #endif
extern ulint os_innodb_umask; extern ulint os_innodb_umask;
......
...@@ -125,6 +125,22 @@ cmp_dtuple_is_prefix_of_rec( ...@@ -125,6 +125,22 @@ cmp_dtuple_is_prefix_of_rec(
const dtuple_t* dtuple, /* in: data tuple */ const dtuple_t* dtuple, /* in: data tuple */
const rec_t* rec, /* in: physical record */ const rec_t* rec, /* in: physical record */
const ulint* offsets);/* in: array returned by rec_get_offsets() */ const ulint* offsets);/* in: array returned by rec_get_offsets() */
#ifndef UNIV_HOTBACKUP
/*****************************************************************
Compare two physical records of the same index, field by field.
Both records must contain the same number of columns, and none of
the columns may be stored externally. */
int
cmp_rec_rec_simple(
/*===============*/
/* out: 1, 0, -1 if rec1 is greater, equal,
less, respectively, than rec2 */
const rec_t* rec1, /* in: physical record */
const rec_t* rec2, /* in: physical record */
const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */
dict_index_t* index); /* in: data dictionary index */
#endif /* !UNIV_HOTBACKUP */
/***************************************************************** /*****************************************************************
This function is used to compare two physical records. Only the common This function is used to compare two physical records. Only the common
first fields are compared, and if an externally stored field is first fields are compared, and if an externally stored field is
......
...@@ -607,16 +607,17 @@ rec_fold( ...@@ -607,16 +607,17 @@ rec_fold(
/************************************************************* /*************************************************************
Builds a ROW_FORMAT=COMPACT record out of a data tuple. */ Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
byte* void
rec_convert_dtuple_to_rec_comp( rec_convert_dtuple_to_rec_comp(
/*===========================*/ /*===========================*/
/* out: pointer to the start of data payload */ rec_t* rec, /* in: origin of record */
byte* buf, /* in: start address of the data area */
ulint extra, /* in: number of bytes to reserve between ulint extra, /* in: number of bytes to reserve between
the record header and the data payload the record header and the data payload
(usually REC_N_NEW_EXTRA_BYTES) */ (usually REC_N_NEW_EXTRA_BYTES) */
dict_index_t* index, /* in: record descriptor */ dict_index_t* index, /* in: record descriptor */
const dtuple_t* dtuple, /* in: data tuple */ ulint status, /* in: status bits of the record */
const dfield_t* fields, /* in: array of data fields */
ulint n_fields,/* in: number of data fields */
const ulint* ext, /* in: array of extern field numbers, const ulint* ext, /* in: array of extern field numbers,
in ascending order */ in ascending order */
ulint n_ext); /* in: number of elements in ext */ ulint n_ext); /* in: number of elements in ext */
...@@ -657,9 +658,12 @@ rec_get_converted_size_comp( ...@@ -657,9 +658,12 @@ rec_get_converted_size_comp(
/* out: size */ /* out: size */
dict_index_t* index, /* in: record descriptor; dict_index_t* index, /* in: record descriptor;
dict_table_is_comp() is assumed to hold */ dict_table_is_comp() is assumed to hold */
const dtuple_t* dtuple, /* in: data tuple */ ulint status, /* in: status bits of the record */
const dfield_t* fields, /* in: array of data fields */
ulint n_fields,/* in: number of data fields */
const ulint* ext, /* in: array of extern field numbers */ const ulint* ext, /* in: array of extern field numbers */
ulint n_ext); /* in: number of elements in ext */ ulint n_ext, /* in: number of elements in ext */
ulint* extra); /* out: extra size */
/************************************************************** /**************************************************************
The following function returns the size of a data tuple when converted to The following function returns the size of a data tuple when converted to
a physical record. */ a physical record. */
......
...@@ -1538,7 +1538,12 @@ rec_get_converted_size( ...@@ -1538,7 +1538,12 @@ rec_get_converted_size(
: dict_index_get_n_fields(index))); : dict_index_get_n_fields(index)));
if (dict_table_is_comp(index->table)) { if (dict_table_is_comp(index->table)) {
return(rec_get_converted_size_comp(index, dtuple, ext, n_ext)); return(rec_get_converted_size_comp(index,
dtuple_get_info_bits(dtuple)
& REC_NEW_STATUS_MASK,
dtuple->fields,
dtuple->n_fields,
ext, n_ext, NULL));
} }
data_size = dtuple_get_data_size(dtuple); data_size = dtuple_get_data_size(dtuple);
......
...@@ -21,17 +21,6 @@ Created 13/06/2005 Jan Lindstrom ...@@ -21,17 +21,6 @@ Created 13/06/2005 Jan Lindstrom
#include "btr0types.h" #include "btr0types.h"
#include "row0mysql.h" #include "row0mysql.h"
/* Information about temporary files used in merge sort are stored
to this structure */
struct merge_file_struct {
os_file_t file; /* File descriptor */
ulint offset; /* File offset */
ulint num_of_blocks; /* Number of blocks */
};
typedef struct merge_file_struct merge_file_t;
/* This structure holds index field definitions */ /* This structure holds index field definitions */
struct merge_index_field_struct { struct merge_index_field_struct {
...@@ -53,48 +42,6 @@ struct merge_index_def_struct { ...@@ -53,48 +42,6 @@ struct merge_index_def_struct {
typedef struct merge_index_def_struct merge_index_def_t; typedef struct merge_index_def_struct merge_index_def_t;
/************************************************************************
Reads clustered index of the table and create temporary files
containing index entries for indexes to be built. */
ulint
row_merge_read_clustered_index(
/*===========================*/
/* out: DB_SUCCESS if successfull,
or ERROR code */
trx_t* trx, /* in: transaction */
dict_table_t* table, /* in: table where index is created */
dict_index_t** index, /* in: indexes to be created */
merge_file_t* files, /* in: Files where to write index
entries */
ulint num_of_idx); /* in: number of indexes to be
created */
/************************************************************************
Read sorted file containing index data tuples and insert these data
data tuples to the index */
ulint
row_merge_insert_index_tuples(
/*==========================*/
/* out: 0 or error number */
trx_t* trx, /* in: transaction */
dict_index_t* index, /* in: index */
dict_table_t* table, /* in: table */
os_file_t file, /* in: file handle */
ulint offset); /* in: offset where to start
reading */
/*****************************************************************
Merge sort for linked list in the disk. */
ulint
row_merge_sort_linked_list_in_disk(
/*===============================*/
/* out: offset to first block in
the list or ULINT_UNDEFINED in
case of error */
dict_index_t* index, /* in: index to be created */
os_file_t file, /* in: File handle */
int* error); /* out: 0 or error */
/************************************************************************* /*************************************************************************
Drop an index from the InnoDB system tables. */ Drop an index from the InnoDB system tables. */
...@@ -116,13 +63,6 @@ row_merge_drop_indexes( ...@@ -116,13 +63,6 @@ row_merge_drop_indexes(
dict_table_t* table, /* in: table containing the indexes */ dict_table_t* table, /* in: table containing the indexes */
dict_index_t** index, /* in: indexes to drop */ dict_index_t** index, /* in: indexes to drop */
ulint num_created); /* in: number of elements in index[] */ ulint num_created); /* in: number of elements in index[] */
/*************************************************************************
Initialize memory for a merge file structure */
void
row_merge_file_create(
/*==================*/
merge_file_t* merge_file); /* out: merge file structure */
/************************************************************************* /*************************************************************************
Create a temporary table using a definition of the old table. You must Create a temporary table using a definition of the old table. You must
...@@ -136,16 +76,7 @@ row_merge_create_temporary_table( ...@@ -136,16 +76,7 @@ row_merge_create_temporary_table(
dict_table_t* table, /* in: old table definition */ dict_table_t* table, /* in: old table definition */
trx_t* trx); /* in/out: trx (sets error_state) */ trx_t* trx); /* in/out: trx (sets error_state) */
/************************************************************************* /*************************************************************************
Update all prebuilts for this table */ Rename the indexes in the dictionary. */
void
row_merge_prebuilts_update(
/*=======================*/
trx_t* trx, /* in: trx */
dict_table_t* old_table); /* in: old table */
/*************************************************************************
Rename the indexes in the dicitionary. */
ulint ulint
row_merge_rename_index( row_merge_rename_index(
...@@ -155,7 +86,7 @@ row_merge_rename_index( ...@@ -155,7 +86,7 @@ row_merge_rename_index(
dict_table_t* table, /* in: Table for index */ dict_table_t* table, /* in: Table for index */
dict_index_t* index); /* in: Index to rename */ dict_index_t* index); /* in: Index to rename */
/************************************************************************* /*************************************************************************
Create the index and load in to the dicitionary. */ Create the index and load in to the dictionary. */
dict_index_t* dict_index_t*
row_merge_create_index( row_merge_create_index(
...@@ -166,7 +97,7 @@ row_merge_create_index( ...@@ -166,7 +97,7 @@ row_merge_create_index(
const merge_index_def_t* /* in: the index definition */ const merge_index_def_t* /* in: the index definition */
index_def); index_def);
/************************************************************************* /*************************************************************************
Check if a transaction can use an index.*/ Check if a transaction can use an index. */
ibool ibool
row_merge_is_index_usable( row_merge_is_index_usable(
...@@ -177,13 +108,31 @@ row_merge_is_index_usable( ...@@ -177,13 +108,31 @@ row_merge_is_index_usable(
const dict_index_t* index); /* in: index to check */ const dict_index_t* index); /* in: index to check */
/************************************************************************* /*************************************************************************
If there are views that refer to the old table name then we "attach" to If there are views that refer to the old table name then we "attach" to
the new instance of the table else we drop it immediately.*/ the new instance of the table else we drop it immediately. */
ulint ulint
row_merge_drop_table( row_merge_drop_table(
/*=================*/ /*=================*/
/* out: DB_SUCCESS if all OK else /* out: DB_SUCCESS or error code */
error code.*/
trx_t* trx, /* in: transaction */ trx_t* trx, /* in: transaction */
dict_table_t* table); /* in: table instance to drop */ dict_table_t* table); /* in: table instance to drop */
/*************************************************************************
Build indexes on a table by reading a clustered index,
creating a temporary file containing index entries, merge sorting
these index entries and inserting sorted index entries to indexes. */
ulint
row_merge_build_indexes(
/*====================*/
/* out: DB_SUCCESS or error code */
trx_t* trx, /* in: transaction */
dict_table_t* old_table, /* in: table where rows are
read from */
dict_table_t* new_table, /* in: table where indexes are
created; note that old_table ==
new_table if we are creating
secondary keys */
dict_index_t** indexes, /* in: indexes to be created */
ulint n_indexes); /* in: size of indexes[] */
#endif /* row0merge.h */ #endif /* row0merge.h */
...@@ -503,25 +503,6 @@ row_check_table_for_mysql( ...@@ -503,25 +503,6 @@ row_check_table_for_mysql(
handle */ handle */
#endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_HOTBACKUP */
/************************************************************************* /*************************************************************************
Build new indexes to a table by reading a clustered index,
creating a temporary file containing index entries, merge sorting
these index entries and inserting sorted index entries to indexes. */
ulint
row_build_index_for_mysql(
/*======================*/
/* out: 0 or error code */
trx_t* trx, /* in: transaction */
dict_table_t* old_table, /* in: Table where rows are
read from */
dict_table_t* new_table, /* in: Table where indexes are
created. Note that old_table ==
new_table if we are creating a
secondary keys. */
dict_index_t** index, /* in: Indexes to be created */
ulint num_of_keys); /* in: Number of indexes to be
created */
/*************************************************************************
Create query graph for a index creation */ Create query graph for a index creation */
ulint ulint
......
...@@ -68,7 +68,7 @@ row_build_index_entry( ...@@ -68,7 +68,7 @@ row_build_index_entry(
mem_heap_t* heap); /* in: memory heap from which the memory for mem_heap_t* heap); /* in: memory heap from which the memory for
the index entry is allocated */ the index entry is allocated */
/*********************************************************************** /***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a An inverse function to row_build_index_entry. Builds a row from a
record in a clustered index. */ record in a clustered index. */
dtuple_t* dtuple_t*
...@@ -98,6 +98,21 @@ row_build( ...@@ -98,6 +98,21 @@ row_build(
/*********************************************************************** /***********************************************************************
Converts an index record to a typed data tuple. */ Converts an index record to a typed data tuple. */
dtuple_t*
row_rec_to_index_entry_low(
/*=======================*/
/* out: index entry built; does not
set info_bits, and the data fields in
the entry will point directly to rec */
const rec_t* rec, /* in: record in the index */
dict_index_t* index, /* in: index */
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
mem_heap_t* heap); /* in: memory heap from which the memory
needed is allocated */
/***********************************************************************
Converts an index record to a typed data tuple. NOTE that externally
stored (often big) fields are NOT copied to heap. */
dtuple_t* dtuple_t*
row_rec_to_index_entry( row_rec_to_index_entry(
/*===================*/ /*===================*/
......
...@@ -30,8 +30,7 @@ and the low (LOW), inclusive, and high (HIGH), noninclusive, ...@@ -30,8 +30,7 @@ and the low (LOW), inclusive, and high (HIGH), noninclusive,
limits for the sort interval as arguments. limits for the sort interval as arguments.
CMP_FUN is the comparison function name. It takes as arguments CMP_FUN is the comparison function name. It takes as arguments
two elements from the array and returns 1, if the first is bigger, two elements from the array and returns 1, if the first is bigger,
0 if equal, and -1 if the second bigger. For an eaxmaple of use 0 if equal, and -1 if the second bigger. */
see test program in tsut.c. */
#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ #define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
{\ {\
......
...@@ -481,7 +481,7 @@ engine = innodb default charset=utf8; ...@@ -481,7 +481,7 @@ engine = innodb default charset=utf8;
insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe'); insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
commit; commit;
alter table t1 add unique index (b); alter table t1 add unique index (b);
ERROR 23000: Duplicate entry '0' for key 'b' ERROR 23000: Duplicate entry '' for key 'b'
insert into t1 values(8,9,'fff','fff'); insert into t1 values(8,9,'fff','fff');
select * from t1; select * from t1;
a b c d a b c d
...@@ -650,7 +650,7 @@ engine = innodb default charset=ucs2; ...@@ -650,7 +650,7 @@ engine = innodb default charset=ucs2;
insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe'); insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
commit; commit;
alter table t1 add unique index (b); alter table t1 add unique index (b);
ERROR 23000: Duplicate entry '0' for key 'b' ERROR 23000: Duplicate entry '' for key 'b'
show create table t1; show create table t1;
Table Create Table Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
......
...@@ -132,6 +132,7 @@ create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a ...@@ -132,6 +132,7 @@ create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a
engine = innodb default charset=utf8; engine = innodb default charset=utf8;
insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe'); insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
commit; commit;
--replace_regex /Duplicate entry '[0-9]*'/Duplicate entry ''/
--error 1582 --error 1582
alter table t1 add unique index (b); alter table t1 add unique index (b);
insert into t1 values(8,9,'fff','fff'); insert into t1 values(8,9,'fff','fff');
...@@ -170,6 +171,7 @@ create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a ...@@ -170,6 +171,7 @@ create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a
engine = innodb default charset=ucs2; engine = innodb default charset=ucs2;
insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe'); insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
commit; commit;
--replace_regex /Duplicate entry '[0-9]*'/Duplicate entry ''/
--error 1582 --error 1582
alter table t1 add unique index (b); alter table t1 add unique index (b);
show create table t1; show create table t1;
......
...@@ -1995,7 +1995,7 @@ explain select count(*) from t1 where v between 'a' and 'a ' and v between 'a ' ...@@ -1995,7 +1995,7 @@ explain select count(*) from t1 where v between 'a' and 'a ' and v between 'a '
id select_type table type possible_keys key key_len ref rows Extra id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref v v 13 const # Using where; Using index 1 SIMPLE t1 ref v v 13 const # Using where; Using index
alter table t1 add unique(v); alter table t1 add unique(v);
ERROR 23000: Duplicate entry '{ ' for key 'v_2' ERROR 23000: Duplicate entry '' for key 'v_2'
alter table t1 add key(v); alter table t1 add key(v);
select concat('*',v,'*',c,'*',t,'*') as qq from t1 where v='a'; select concat('*',v,'*',c,'*',t,'*') as qq from t1 where v='a';
qq qq
......
...@@ -704,6 +704,154 @@ cmp_dtuple_is_prefix_of_rec( ...@@ -704,6 +704,154 @@ cmp_dtuple_is_prefix_of_rec(
return(FALSE); return(FALSE);
} }
#ifndef UNIV_HOTBACKUP
/*****************************************************************
Compare two physical records of the same index, field by field.
Both records must contain the same number of columns, and none of
the columns may be stored externally. The column types used for
the comparison are taken from the index. */
int
cmp_rec_rec_simple(
/*===============*/
/* out: 1, 0, -1 if rec1 is greater, equal,
less, respectively, than rec2 */
const rec_t* rec1, /* in: physical record */
const rec_t* rec2, /* in: physical record */
const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */
const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */
dict_index_t* index) /* in: data dictionary index */
{
ulint rec1_f_len; /* length of current field in rec1 */
const byte* rec1_b_ptr; /* pointer to the current byte
in rec1 field */
ulint rec1_byte; /* value of current byte to be
compared in rec1 */
ulint rec2_f_len; /* length of current field in rec2 */
const byte* rec2_b_ptr; /* pointer to the current byte
in rec2 field */
ulint rec2_byte; /* value of current byte to be
compared in rec2 */
ulint cur_field; /* current field number */
/* Debug checks of the preconditions: no externally stored
columns, same record format, same number of fields. */
ut_ad(!rec_offs_any_extern(offsets1));
ut_ad(!rec_offs_any_extern(offsets2));
ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
ut_ad(rec_offs_n_fields(offsets1) == rec_offs_n_fields(offsets2));
/* Compare the records one field at a time; the first field
that differs decides the result. */
for (cur_field = 0; cur_field < rec_offs_n_fields(offsets1);
cur_field++) {
ulint cur_bytes;
ulint mtype;
ulint prtype;
{
/* Fetch the main type and precise type of the
current field from the index column. */
const dict_col_t* col
= dict_index_get_nth_col(index, cur_field);
mtype = col->mtype;
prtype = col->prtype;
}
rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
cur_field, &rec1_f_len);
rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
cur_field, &rec2_f_len);
/* Handle SQL NULL before looking at any data bytes. */
if (rec1_f_len == UNIV_SQL_NULL
|| rec2_f_len == UNIV_SQL_NULL) {
if (rec1_f_len == rec2_f_len) {
/* Both fields are NULL: they compare equal. */
goto next_field;
} else if (rec2_f_len == UNIV_SQL_NULL) {
/* We define the SQL null to be the
smallest possible value of a field
in the alphabetical order */
return(1);
} else {
/* Only rec1 is NULL, so rec1 is smaller. */
return(-1);
}
}
/* Types that are not compared byte by byte (main type
DATA_FLOAT or above, or a non-binary BLOB whose collation
is not latin1_swedish_ci) are handed to cmp_whole_field(). */
if (mtype >= DATA_FLOAT
|| (mtype == DATA_BLOB
&& 0 == (prtype & DATA_BINARY_TYPE)
&& dtype_get_charset_coll(prtype)
!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
int ret = cmp_whole_field(mtype, prtype,
rec1_b_ptr,
(unsigned) rec1_f_len,
rec2_b_ptr,
(unsigned) rec2_f_len);
if (ret) {
return(ret);
}
goto next_field;
}
/* Compare the fields byte by byte. When one field is
shorter, it is extended with the pad character of the type;
if the type has no pad character (ULINT_UNDEFINED), the
longer field is the greater one. */
for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
if (rec2_f_len <= cur_bytes) {
if (rec1_f_len <= cur_bytes) {
/* Both fields are exhausted without a
difference: the fields are equal. */
goto next_field;
}
rec2_byte = dtype_get_pad_char(mtype, prtype);
if (rec2_byte == ULINT_UNDEFINED) {
return(1);
}
} else {
rec2_byte = *rec2_b_ptr;
}
if (rec1_f_len <= cur_bytes) {
rec1_byte = dtype_get_pad_char(mtype, prtype);
if (rec1_byte == ULINT_UNDEFINED) {
return(-1);
}
} else {
rec1_byte = *rec1_b_ptr;
}
if (rec1_byte == rec2_byte) {
/* If the bytes are equal, they will remain
such even after the collation transformation
below */
continue;
}
/* For main types up to DATA_CHAR and for non-binary
BLOBs, map the bytes through cmp_collate() before
comparing them. */
if (mtype <= DATA_CHAR
|| (mtype == DATA_BLOB
&& !(prtype & DATA_BINARY_TYPE))) {
rec1_byte = cmp_collate(rec1_byte);
rec2_byte = cmp_collate(rec2_byte);
}
/* If the mapped bytes are equal, neither branch
is taken and the loop proceeds to the next byte. */
if (rec1_byte < rec2_byte) {
return(-1);
} else if (rec1_byte > rec2_byte) {
return(1);
}
}
next_field:
continue;
}
/* If we ran out of fields, rec1 was equal to rec2. */
return(0);
}
#endif /* !UNIV_HOTBACKUP */
/***************************************************************** /*****************************************************************
This function is used to compare two physical records. Only the common This function is used to compare two physical records. Only the common
first fields are compared, and if an externally stored field is first fields are compared, and if an externally stored field is
......
...@@ -236,6 +236,14 @@ rec_init_offsets_comp_ordinary( ...@@ -236,6 +236,14 @@ rec_init_offsets_comp_ordinary(
dict_field_t* field; dict_field_t* field;
ulint null_mask = 1; ulint null_mask = 1;
#ifdef UNIV_DEBUG
/* We cannot invoke rec_offs_make_valid() here, because it can hold
that extra != REC_N_NEW_EXTRA_BYTES. Similarly, rec_offs_validate()
will fail in that case, because it invokes rec_get_status(). */
offsets[2] = (ulint) rec;
offsets[3] = (ulint) index;
#endif /* UNIV_DEBUG */
/* read the lengths of fields 0..n */ /* read the lengths of fields 0..n */
do { do {
ulint len; ulint len;
...@@ -713,41 +721,50 @@ Determines the size of a data tuple in ROW_FORMAT=COMPACT. */ ...@@ -713,41 +721,50 @@ Determines the size of a data tuple in ROW_FORMAT=COMPACT. */
ulint ulint
rec_get_converted_size_comp( rec_get_converted_size_comp(
/*========================*/ /*========================*/
/* out: size */ /* out: total size */
dict_index_t* index, /* in: record descriptor; dict_index_t* index, /* in: record descriptor;
dict_table_is_comp() is assumed to hold */ dict_table_is_comp() is assumed to hold */
const dtuple_t* dtuple, /* in: data tuple */ ulint status, /* in: status bits of the record */
const dfield_t* fields, /* in: array of data fields */
ulint n_fields,/* in: number of data fields */
const ulint* ext, /* in: array of extern field numbers */ const ulint* ext, /* in: array of extern field numbers */
ulint n_ext) /* in: number of elements in ext */ ulint n_ext, /* in: number of elements in ext */
ulint* extra) /* out: extra size */
{ {
ulint size = REC_N_NEW_EXTRA_BYTES ulint extra_size;
+ UT_BITS_IN_BYTES(index->n_nullable); ulint data_size;
ulint i; ulint i;
ulint j; ulint j;
ulint n_fields; ut_ad(index);
ut_ad(index && dtuple); ut_ad(fields);
ut_ad(dtuple_validate(dtuple)); ut_ad(n_fields > 0);
switch (dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) { switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
case REC_STATUS_ORDINARY: case REC_STATUS_ORDINARY:
n_fields = dict_index_get_n_fields(index); ut_ad(n_fields == dict_index_get_n_fields(index));
ut_ad(n_fields == dtuple_get_n_fields(dtuple)); data_size = 0;
break; break;
case REC_STATUS_NODE_PTR: case REC_STATUS_NODE_PTR:
n_fields = dict_index_get_n_unique_in_tree(index); n_fields--;
ut_ad(n_fields + 1 == dtuple_get_n_fields(dtuple)); ut_ad(n_fields == dict_index_get_n_unique_in_tree(index));
ut_ad(dtuple_get_nth_field(dtuple, n_fields)->len == 4); ut_ad(fields[n_fields].len == 4);
size += 4; /* child page number */ ut_ad(!n_ext);
data_size = 4; /* child page number */
break; break;
case REC_STATUS_INFIMUM: case REC_STATUS_INFIMUM:
case REC_STATUS_SUPREMUM: case REC_STATUS_SUPREMUM:
/* infimum or supremum record, 8 data bytes */ /* infimum or supremum record, 8 data bytes */
return(REC_N_NEW_EXTRA_BYTES + 8); extra_size = REC_N_NEW_EXTRA_BYTES;
data_size = 8;
goto func_exit;
default: default:
ut_error; ut_error;
return(ULINT_UNDEFINED); return(ULINT_UNDEFINED);
} }
extra_size = REC_N_NEW_EXTRA_BYTES
+ UT_BITS_IN_BYTES(index->n_nullable);
/* read the lengths of fields 0..n */ /* read the lengths of fields 0..n */
for (i = j = 0; i < n_fields; i++) { for (i = j = 0; i < n_fields; i++) {
dict_field_t* field; dict_field_t* field;
...@@ -755,12 +772,11 @@ rec_get_converted_size_comp( ...@@ -755,12 +772,11 @@ rec_get_converted_size_comp(
const dict_col_t* col; const dict_col_t* col;
field = dict_index_get_nth_field(index, i); field = dict_index_get_nth_field(index, i);
len = dtuple_get_nth_field(dtuple, i)->len; len = fields[i].len;
col = dict_field_get_col(field); col = dict_field_get_col(field);
ut_ad(dict_col_type_assert_equal( ut_ad(dict_col_type_assert_equal(col,
col, dfield_get_type(dtuple_get_nth_field( dfield_get_type(&fields[i])));
dtuple, i))));
if (len == UNIV_SQL_NULL) { if (len == UNIV_SQL_NULL) {
/* No length is stored for NULL fields. */ /* No length is stored for NULL fields. */
...@@ -777,23 +793,28 @@ rec_get_converted_size_comp( ...@@ -777,23 +793,28 @@ rec_get_converted_size_comp(
|| field->fixed_len == field->prefix_len); || field->fixed_len == field->prefix_len);
} else if (UNIV_UNLIKELY(j < n_ext) && i == ext[j]) { } else if (UNIV_UNLIKELY(j < n_ext) && i == ext[j]) {
j++; j++;
size += 2; extra_size += 2;
} else if (len < 128 } else if (len < 128
|| (col->len < 256 && col->mtype != DATA_BLOB)) { || (col->len < 256 && col->mtype != DATA_BLOB)) {
size++; extra_size++;
} else { } else {
/* For variable-length columns, we look up the /* For variable-length columns, we look up the
maximum length from the column itself. If this maximum length from the column itself. If this
is a prefix index column shorter than 256 bytes, is a prefix index column shorter than 256 bytes,
this will waste one byte. */ this will waste one byte. */
size += 2; extra_size += 2;
} }
size += len; data_size += len;
} }
ut_ad(j == n_ext); ut_ad(j == n_ext);
return(size); func_exit:
if (UNIV_LIKELY_NULL(extra)) {
*extra = extra_size;
}
return(extra_size + data_size);
} }
/*************************************************************** /***************************************************************
...@@ -980,23 +1001,23 @@ rec_convert_dtuple_to_rec_old( ...@@ -980,23 +1001,23 @@ rec_convert_dtuple_to_rec_old(
/************************************************************* /*************************************************************
Builds a ROW_FORMAT=COMPACT record out of a data tuple. */ Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
byte* void
rec_convert_dtuple_to_rec_comp( rec_convert_dtuple_to_rec_comp(
/*===========================*/ /*===========================*/
/* out: pointer to the start of data payload */ rec_t* rec, /* in: origin of record */
byte* buf, /* in: start address of the data area */
ulint extra, /* in: number of bytes to reserve between ulint extra, /* in: number of bytes to reserve between
the record header and the data payload the record header and the data payload
(usually REC_N_NEW_EXTRA_BYTES) */ (normally REC_N_NEW_EXTRA_BYTES) */
dict_index_t* index, /* in: record descriptor */ dict_index_t* index, /* in: record descriptor */
const dtuple_t* dtuple, /* in: data tuple */ ulint status, /* in: status bits of the record */
const dfield_t* fields, /* in: array of data fields */
ulint n_fields,/* in: number of data fields */
const ulint* ext, /* in: array of extern field numbers, const ulint* ext, /* in: array of extern field numbers,
in ascending order */ in ascending order */
ulint n_ext) /* in: number of elements in ext */ ulint n_ext) /* in: number of elements in ext */
{ {
const dfield_t* field; const dfield_t* field;
const dtype_t* type; const dtype_t* type;
rec_t* rec = buf + extra;
byte* end; byte* end;
byte* nulls; byte* nulls;
byte* lens; byte* lens;
...@@ -1006,18 +1027,10 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1006,18 +1027,10 @@ rec_convert_dtuple_to_rec_comp(
ulint n_node_ptr_field; ulint n_node_ptr_field;
ulint fixed_len; ulint fixed_len;
ulint null_mask = 1; ulint null_mask = 1;
const ulint n_fields = dtuple_get_n_fields(dtuple);
ut_ad(dict_table_is_comp(index->table)); ut_ad(dict_table_is_comp(index->table));
ut_ad(n_fields > 0); ut_ad(n_fields > 0);
/* Try to ensure that the memset() between the for() loops switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
completes fast. The address is not exact, but UNIV_PREFETCH
should never generate a memory fault. */
UNIV_PREFETCH_RW(buf - n_fields);
UNIV_PREFETCH_RW(rec);
switch (UNIV_EXPECT(dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK,
REC_STATUS_ORDINARY)) {
case REC_STATUS_ORDINARY: case REC_STATUS_ORDINARY:
ut_ad(n_fields <= dict_index_get_n_fields(index)); ut_ad(n_fields <= dict_index_get_n_fields(index));
n_node_ptr_field = ULINT_UNDEFINED; n_node_ptr_field = ULINT_UNDEFINED;
...@@ -1030,62 +1043,12 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1030,62 +1043,12 @@ rec_convert_dtuple_to_rec_comp(
case REC_STATUS_SUPREMUM: case REC_STATUS_SUPREMUM:
ut_ad(n_fields == 1); ut_ad(n_fields == 1);
n_node_ptr_field = ULINT_UNDEFINED; n_node_ptr_field = ULINT_UNDEFINED;
ut_d(j = 0); break;
goto init;
default: default:
ut_error; ut_error;
return(0); return;
}
/* Calculate the offset of the origin in the physical record.
We must loop over all fields to do this. */
rec += UT_BITS_IN_BYTES(index->n_nullable);
for (i = j = 0; i < n_fields; i++) {
if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
#ifdef UNIV_DEBUG
field = dtuple_get_nth_field(dtuple, i);
type = dfield_get_type(field);
ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL);
ut_ad(dfield_get_len(field) == 4);
#endif /* UNIV_DEBUG */
goto init;
}
field = dtuple_get_nth_field(dtuple, i);
type = dfield_get_type(field);
len = dfield_get_len(field);
fixed_len = dict_index_get_nth_field(index, i)->fixed_len;
ut_ad(dict_col_type_assert_equal(
dict_field_get_col(dict_index_get_nth_field(
index, i)),
dfield_get_type(field)));
if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
if (len == UNIV_SQL_NULL)
continue;
}
/* only nullable fields can be null */
ut_ad(len != UNIV_SQL_NULL);
if (fixed_len) {
ut_ad(len == fixed_len);
} else {
ut_ad(len <= dtype_get_len(type)
|| dtype_get_mtype(type) == DATA_BLOB);
rec++;
if (len >= 128
&& (dtype_get_len(type) >= 256
|| dtype_get_mtype(type) == DATA_BLOB)) {
rec++;
} else if (UNIV_UNLIKELY(j < n_ext) && i == ext[j]) {
j++;
rec++;
}
}
} }
init:
ut_ad(j == n_ext);
end = rec; end = rec;
nulls = rec - (extra + 1); nulls = rec - (extra + 1);
lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
...@@ -1094,8 +1057,7 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1094,8 +1057,7 @@ rec_convert_dtuple_to_rec_comp(
/* Store the data and the offsets */ /* Store the data and the offsets */
for (i = j = 0; i < n_fields; i++) { for (i = j = 0, field = fields; i < n_fields; i++, field++) {
field = dtuple_get_nth_field(dtuple, i);
type = dfield_get_type(field); type = dfield_get_type(field);
len = dfield_get_len(field); len = dfield_get_len(field);
...@@ -1106,7 +1068,6 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1106,7 +1068,6 @@ rec_convert_dtuple_to_rec_comp(
end += 4; end += 4;
break; break;
} }
fixed_len = dict_index_get_nth_field(index, i)->fixed_len;
if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
/* nullable field */ /* nullable field */
...@@ -1130,6 +1091,9 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1130,6 +1091,9 @@ rec_convert_dtuple_to_rec_comp(
} }
/* only nullable fields can be null */ /* only nullable fields can be null */
ut_ad(len != UNIV_SQL_NULL); ut_ad(len != UNIV_SQL_NULL);
fixed_len = dict_index_get_nth_field(index, i)->fixed_len;
if (fixed_len) { if (fixed_len) {
ut_ad(len == fixed_len); ut_ad(len == fixed_len);
} else { } else {
...@@ -1157,8 +1121,6 @@ rec_convert_dtuple_to_rec_comp( ...@@ -1157,8 +1121,6 @@ rec_convert_dtuple_to_rec_comp(
} }
ut_ad(j == n_ext); ut_ad(j == n_ext);
return(rec);
} }
/************************************************************* /*************************************************************
...@@ -1177,8 +1139,19 @@ rec_convert_dtuple_to_rec_new( ...@@ -1177,8 +1139,19 @@ rec_convert_dtuple_to_rec_new(
in ascending order */ in ascending order */
ulint n_ext) /* in: number of elements in ext */ ulint n_ext) /* in: number of elements in ext */
{ {
rec_t* rec = rec_convert_dtuple_to_rec_comp( ulint extra_size;
buf, REC_N_NEW_EXTRA_BYTES, index, dtuple, ext, n_ext); ulint status;
rec_t* rec;
status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK;
rec_get_converted_size_comp(index, status,
dtuple->fields, dtuple->n_fields,
ext, n_ext, &extra_size);
rec = buf + extra_size;
rec_convert_dtuple_to_rec_comp(
rec, REC_N_NEW_EXTRA_BYTES, index, status,
dtuple->fields, dtuple->n_fields, ext, n_ext);
/* Set the info bits of the record */ /* Set the info bits of the record */
rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple));
......
/****************************************************** /******************************************************
New index creation routines using a merge sort New index creation routines using a merge sort
(c) 2005 Innobase Oy (c) 2005,2007 Innobase Oy
Created 12/4/2005 Jan Lindstrom Created 12/4/2005 Jan Lindstrom
Completed by Sunny Bains and Marko Makela
*******************************************************/ *******************************************************/
/****************************************************** /******************************************************
...@@ -14,15 +15,7 @@ Created 12/4/2005 Jan Lindstrom ...@@ -14,15 +15,7 @@ Created 12/4/2005 Jan Lindstrom
2. Add more test cases and fix bugs found. 2. Add more test cases and fix bugs found.
3. If we are using variable length keys, then in 3. Run benchmarks.
some cases these keys do not fit into two empty blocks
in a different order. Therefore, some empty space is
left in every block. However, it has not been shown
that this empty space is enough for all cases. Therefore,
in the above case these overloaded records should be put
on another block.
4. Run benchmarks.
*******************************************************/ *******************************************************/
#include "row0merge.h" #include "row0merge.h"
...@@ -55,1406 +48,796 @@ Created 12/4/2005 Jan Lindstrom ...@@ -55,1406 +48,796 @@ Created 12/4/2005 Jan Lindstrom
#include "pars0pars.h" #include "pars0pars.h"
#include "mem0mem.h" #include "mem0mem.h"
#include "log0log.h" #include "log0log.h"
#include "ut0sort.h"
/* Records are stored in the memory for main memory linked list
to this structure */
struct merge_rec_struct {
struct merge_rec_struct *next; /* Pointer to next record
in the list */
rec_t* rec; /* Record */
};
typedef struct merge_rec_struct merge_rec_t;
/* This structure is head element for main memory linked list
used for main memory linked list merge sort */
struct merge_rec_list_struct {
merge_rec_t* head; /* Pointer to head of the
list */
merge_rec_t* tail; /* Pointer to tail of the
list */
#ifdef UNIV_DEBUG
ulint n_records; /* Number of records in
the list */
#endif /* UNIV_DEBUG */
ulint total_size; /* Total size of all records in
the list */
mem_heap_t* heap; /* Heap where memory for this
list is allocated */
};
typedef struct merge_rec_list_struct merge_rec_list_t;
/* Block size for I/O operations in merge sort */ /* Block size for I/O operations in merge sort */
#define MERGE_BLOCK_SIZE 1048576 /* 1M */ typedef byte row_merge_block_t[1048576];
/* Intentional free space on every block */ /* Secondary buffer for I/O operations of merge records */
#define MERGE_BLOCK_SAFETY_MARGIN 128
typedef byte mrec_buf_t[UNIV_PAGE_SIZE / 2];
/* Enable faster index creation debug code */
/* #define UNIV_DEBUG_INDEX_CREATE 1 */ /* Merge record in row_merge_block_t. The format is the same as a
record in ROW_FORMAT=COMPACT with the exception that the
/* This block header structure is used to create linked list of the REC_N_NEW_EXTRA_BYTES are omitted. */
blocks to the disk. Every block contains one header.*/ typedef byte mrec_t;
struct merge_block_header_struct { /* Buffer for sorting in main memory. */
ulint n_records; /* Number of records in the block. */ struct row_merge_buf_struct {
ulint offset; /* Offset of this block */ mem_heap_t* heap; /* memory heap where allocated */
ulint next; /* Offset of next block */ dict_index_t* index; /* the index the tuples belong to */
ulint total_size; /* total amount of data bytes */
ulint n_tuples; /* number of data tuples */
ulint max_tuples; /* maximum number of data tuples */
const dfield_t**tuples; /* array of pointers to
arrays of fields that form
the data tuples */
const dfield_t**tmp_tuples; /* temporary copy of tuples,
for sorting */
}; };
typedef struct merge_block_header_struct merge_block_header_t; typedef struct row_merge_buf_struct row_merge_buf_t;
/* This block structure is used to hold index records in the disk /* Information about temporary files used in merge sort is stored
and the memory */ in this structure */
struct merge_block_struct { struct merge_file_struct {
merge_block_header_t header; /* Block header information */ int fd; /* File descriptor */
char data[MERGE_BLOCK_SIZE - sizeof(merge_block_header_t)];/* Data area i.e. heap */ ulint offset; /* File offset */
}; };
typedef struct merge_block_struct merge_block_t; typedef struct merge_file_struct merge_file_t;
/**************************************************************************
Search an index object by name and column names. If several indexes match,
return the index with the max id. */
static
dict_index_t*
row_merge_dict_table_get_index(
/*===========================*/
/* out: matching index,
NULL if not found */
dict_table_t* table, /* in: table */
const merge_index_def_t*index_def) /* in: index definition */
{
ulint i;
dict_index_t* index;
const char** column_names;
column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
for (i = 0; i < index_def->n_fields; ++i) {
column_names[i] = index_def->fields[i].field_name;
}
index = dict_table_get_index_by_max_id(
table, index_def->name, column_names, index_def->n_fields);
mem_free(column_names);
return(index);
}
/************************************************************************
Creates and initializes a merge block */
static
merge_block_t*
row_merge_block_create(void)
/*========================*/
/* out: pointer to block */
{
merge_block_t* mblock;
mblock = mem_alloc(sizeof *mblock);
memset(&mblock->header, 0, sizeof mblock->header); /**********************************************************
Allocate a sort buffer. */
return(mblock);
}
/************************************************************************
Read a merge block from the file system. */
static static
ibool row_merge_buf_t*
row_merge_read( row_merge_buf_create_low(
/*===========*/ /*=====================*/
/* out: TRUE if request was /* out,own: sort buffer */
successful, FALSE if fail */ mem_heap_t* heap, /* in: heap where allocated */
os_file_t file, /* in: file handle */ dict_index_t* index, /* in: secondary index */
ulint offset, /* in: offset where to read */ ulint buf_size, /* in: size of the buffer, in bytes */
void* buf, /* out: data */ ulint max_tuples) /* in: maximum number of data tuples */
ulint size) /* in: number of bytes to read */
{ {
ib_uint64_t ofs = ((ib_uint64_t) offset) * MERGE_BLOCK_SIZE; row_merge_buf_t* buf;
ut_ad(size <= MERGE_BLOCK_SIZE); buf = mem_heap_alloc(heap, buf_size);
memset(buf, 0, buf_size);
return(UNIV_LIKELY(os_file_read(file, buf, buf->heap = heap;
(ulint) (ofs & 0xFFFFFFFF), buf->index = index;
(ulint) (ofs >> 32), buf->max_tuples = max_tuples;
size))); buf->tuples = mem_heap_alloc(heap,
2 * max_tuples * sizeof *buf->tuples);
buf->tmp_tuples = buf->tuples + max_tuples;
return(buf);
} }
/************************************************************************ /**********************************************************
Read a merge block from the file system. */ Allocate a sort buffer. */
static static
ibool row_merge_buf_t*
row_merge_block_read( row_merge_buf_create(
/*=================*/ /*=================*/
/* out: TRUE if request was /* out,own: sort buffer */
successful, FALSE if fail */ dict_index_t* index) /* in: secondary index */
os_file_t file, /* in: file handle */
ulint offset, /* in: offset where to read */
merge_block_t* block) /* out: merge block */
{
return(row_merge_read(file, offset, block, sizeof *block));
}
/************************************************************************
Read a merge block header from the disk */
static
ibool
row_merge_block_header_read(
/*========================*/
/* out: TRUE if request was
successful, FALSE if fail */
os_file_t file, /* in: handle to a file */
ulint offset, /* in: offset where to read */
merge_block_header_t* header) /* out: merge block header */
{
return(row_merge_read(file, offset, header, sizeof *header));
}
/************************************************************************
Write a merge block to the file system. */
static
ibool
row_merge_write(
/*============*/
/* out: TRUE if request was
successful, FALSE if fail */
os_file_t file, /* in: file handle */
ulint offset, /* in: offset where to write */
const void* buf, /* in: data */
ulint size) /* in: number of bytes to write */
{ {
ib_uint64_t ofs = ((ib_uint64_t) offset) * MERGE_BLOCK_SIZE; row_merge_buf_t* buf;
ulint max_tuples;
ulint buf_size;
mem_heap_t* heap;
ut_ad(size <= MERGE_BLOCK_SIZE); max_tuples = sizeof(row_merge_block_t)
/ ut_max(1, dict_index_get_min_size(index));
return(UNIV_LIKELY(os_file_write("(merge)", file, buf, buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
(ulint) (ofs & 0xFFFFFFFF),
(ulint) (ofs >> 32),
size)));
}
/************************************************************************ heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
Write a merge block header to the disk */
static
ibool
row_merge_block_header_write(
/*=========================*/
/* out: TRUE if request was
successful, FALSE if fail */
os_file_t file, /* in: handle to a file */
const merge_block_header_t* header) /* in: block header */
{
return(row_merge_write(file, header->offset, header, sizeof *header));
}
/************************************************************************ buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
Write a merge block to the disk */
static
ibool
row_merge_block_write(
/*==================*/
/* out: TRUE if request was
successful, FALSE if fail */
os_file_t file, /* in: handle to a file */
ulint offset, /* in: file offset */
const merge_block_t* block) /* in: block header */
{
ut_ad(offset == block->header.offset);
return(row_merge_write(file, offset, block, sizeof *block)); return(buf);
} }
/************************************************************** /**********************************************************
Create a merge record and copy a index data tuple to the merge Empty a sort buffer. */
record */
static static
merge_rec_t* void
row_merge_rec_create( row_merge_buf_empty(
/*=================*/ /*================*/
/* out: merge record */ row_merge_buf_t* buf) /* in/out: sort buffer */
const dtuple_t* dtuple, /* in: data tuple */
const ulint* ext, /* in: array of extern field numbers */
ulint n_ext, /* in: number of elements in ext */
dict_index_t* index, /* in: index record descriptor */
mem_heap_t* heap) /* in: heap where memory is allocated */
{ {
merge_rec_t* m_rec; ulint buf_size;
ulint rec_size; ulint max_tuples = buf->max_tuples;
byte* buf; mem_heap_t* heap = buf->heap;
dict_index_t* index = buf->index;
ut_ad(dtuple && index && heap); buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
ut_ad(dtuple_validate(dtuple));
m_rec = (merge_rec_t*) mem_heap_alloc(heap, sizeof(merge_rec_t)); mem_heap_empty(heap);
rec_size = rec_get_converted_size(index, dtuple, ext, n_ext); buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
buf = mem_heap_alloc(heap, rec_size);
m_rec->rec = rec_convert_dtuple_to_rec(buf, index, dtuple,
ext, n_ext);
m_rec->next = NULL;
return(m_rec);
} }
/************************************************************************ /**********************************************************
Checks that a record fits to a block */ Deallocate a sort buffer. */
static static
ibool void
row_merge_rec_fits_to_block( row_merge_buf_free(
/*========================*/ /*===============*/
/* out: TRUE if record fits to merge block, row_merge_buf_t* buf) /* in,own: sort buffer, to be freed */
FALSE if record does not fit to block */
const ulint* offsets,/* in: record offsets */
ulint offset) /* in: offset where to store in the block */
{ {
ulint rec_len; mem_heap_free(buf->heap);
ut_ad(offsets);
rec_len = mach_get_compressed_size(rec_offs_extra_size(offsets))
+ rec_offs_size(offsets);
/* Note that we intentionally leave free space on
every block. This free space might be later needed when two
blocks are merged and variable length keys are used. Variable
length keys on two blocks might be interleaved on such a manner
that they do not fit on two blocks if blocks are too full */
return((offset + rec_len) < (MERGE_BLOCK_SIZE
- MERGE_BLOCK_SAFETY_MARGIN
- sizeof(merge_block_header_t)));
} }
/************************************************************************ /**********************************************************
Store a record to a merge file block. Note that this function does Insert a data tuple into a sort buffer. */
not check that the record fits to the block. */
static static
ulint ibool
row_merge_store_rec_to_block( row_merge_buf_add(
/*=========================*/ /*==============*/
/* out: offset for next data tuple */ /* out: TRUE if added,
const rec_t* rec, /* in: record to be stored in the memory */ FALSE if out of space */
const ulint* offsets,/* in: record offsets */ row_merge_buf_t* buf, /* in/out: sort buffer */
merge_block_t* mblock, /* in: block where data tuple is stored */ const dtuple_t* row, /* in: row in clustered index */
ulint offset) /* in: offset where to store */ row_ext_t* ext) /* in/out: cache of externally stored
column prefixes, or NULL */
{ {
char* dest_data; ulint i;
ulint rec_len; ulint n_fields;
ulint extra_len; ulint data_size;
ulint storage_size; ulint extra_size;
dfield_t* entry;
ut_ad(rec && mblock && offsets); dfield_t* field;
ut_ad(rec_validate(rec, offsets));
/* Find the position in the block where this data tuple is stored.
If we are at the start of the block, remember to add size of header
to the offset */
if (offset == 0) { if (buf->n_tuples >= buf->max_tuples) {
dest_data = mblock->data; return(FALSE);
} else {
dest_data = ((char *)mblock + offset);
} }
ut_ad(dest_data < (char*) &mblock[1]); n_fields = dict_index_get_n_fields(buf->index);
extra_len = rec_offs_extra_size(offsets); entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
rec_len = rec_offs_size(offsets); buf->tuples[buf->n_tuples] = entry;
field = entry;
/* 1. Store the extra_len */
storage_size = mach_write_compressed((byte *)dest_data, extra_len); data_size = 0;
dest_data+=storage_size; extra_size = UT_BITS_IN_BYTES(buf->index->n_nullable);
ut_ad(dest_data < (char*) &mblock[1]);
for (i = 0; i < n_fields; i++, field++) {
/* 2. Store the record */ dict_field_t* ifield;
memcpy(dest_data, rec - extra_len, rec_len); const dict_col_t* col;
dest_data+=rec_len; ulint col_no;
ut_ad(dest_data < (char*) &mblock[1]); const dfield_t* row_field;
ifield = dict_index_get_nth_field(buf->index, i);
col = ifield->col;
col_no = dict_col_get_no(col);
row_field = dtuple_get_nth_field(row, col_no);
dfield_copy(field, row_field);
if (UNIV_LIKELY_NULL(ext)
&& dfield_get_len(row_field) != UNIV_SQL_NULL) {
/* See if the column is stored externally. */
byte* buf = row_ext_lookup(ext, col_no,
row_field->data,
row_field->len,
&field->len);
if (UNIV_LIKELY_NULL(buf)) {
field->data = buf;
}
}
mblock->header.n_records++; if (field->len == UNIV_SQL_NULL) {
ut_ad(!(col->prtype & DATA_NOT_NULL));
field->data = NULL;
continue;
}
/* Return next offset */ /* If a column prefix index, take only the prefix */
return((char *)dest_data - (char *)mblock);
}
/************************************************************************ if (ifield->prefix_len) {
Read a record from the block */ field->len = dtype_get_at_most_n_mbchars(
static col->prtype,
merge_rec_t* col->mbminlen, col->mbmaxlen,
row_merge_read_rec_from_block( ifield->prefix_len,
/*==========================*/ field->len, field->data);
/* out: record or NULL*/ }
merge_block_t* mblock, /* in: memory block where to read */
ulint* offset, /* in/out: offset where to read a record */
mem_heap_t* heap, /* in: heap where the memory for this record
is allocated */
dict_index_t* index) /* in: index record descriptor */
{
merge_rec_t* mrec;
char* from_data;
ulint extra_len;
ulint data_len;
ulint tmp_offset;
ulint storage_len;
rec_t* rec;
mem_heap_t* offset_heap = NULL;
ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
ulint* sec_offs = sec_offsets_;
*sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_; ut_ad(field->len <= col->len || col->mtype == DATA_BLOB);
ut_ad(mblock && offset && heap); if (ifield->fixed_len) {
ut_ad(field->len == ifield->fixed_len);
} else if (field->len < 128
|| (col->len < 256 && col->mtype != DATA_BLOB)) {
extra_size++;
} else {
extra_size += 2;
}
data_size += field->len;
}
tmp_offset = *offset; #ifdef UNIV_DEBUG
{
ulint size;
ulint extra;
/* Find the position in the block where this data tuple is stored. size = rec_get_converted_size_comp(buf->index,
If we are at the start of the block, remember to add size of header REC_STATUS_ORDINARY,
to the offset */ entry, n_fields, NULL, 0,
&extra);
if (tmp_offset == 0) { ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
from_data = mblock->data; ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
} else {
from_data = ((char *)mblock + tmp_offset);
} }
#endif /* UNIV_DEBUG */
ut_ad(from_data < (const char*) &mblock[1]); /* Add to the total size of the record in row_merge_block_t
the encoded length of extra_size and the extra bytes (extra_size).
mrec = mem_heap_alloc(heap, sizeof(merge_rec_t)); See row_merge_buf_write() for the variable-length encoding
of extra_size. */
/* 1. Read the extra len and calculate its storage length */ data_size += extra_size + (extra_size >= 127);
extra_len = mach_read_compressed((byte *)from_data);
storage_len = mach_get_compressed_size(extra_len);
from_data+=storage_len;
ut_ad(from_data < (const char*) &mblock[1]);
/* 2. Read the record */ /* Reserve one byte for the end marker of row_merge_block_t. */
rec = (rec_t*)(from_data + extra_len); if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
mrec->rec = rec; return(FALSE);
sec_offs = rec_get_offsets(mrec->rec, index, sec_offs, ULINT_UNDEFINED, }
&offset_heap);
data_len = rec_offs_size(sec_offs);
ut_ad(rec_validate(rec, sec_offs));
from_data+=data_len; buf->total_size += data_size;
ut_ad(from_data < (const char*) &mblock[1]); buf->n_tuples++;
/* Return also start offset of the next data tuple */ field = entry;
*offset = ((char *)from_data - (char *)mblock);
if (offset_heap) { /* Copy the data fields. */
mem_heap_free(offset_heap); for (i = 0; i < n_fields; i++, field++) {
if (field->len != UNIV_SQL_NULL) {
field->data = mem_heap_dup(buf->heap,
field->data, field->len);
}
} }
return(mrec); return(TRUE);
} }
/***************************************************************** /*****************************************************************
Compare two merge records. */ Compare two tuples. */
static static
int int
row_merge_cmp( row_merge_tuple_cmp(
/*==========*/ /*================*/
/* out: 1, 0, -1 if mrec1 is /* out: 1, 0, -1 if a is greater,
greater, equal, less, equal, less, respectively, than b */
respectively, than mrec2 */ ulint n_field,/* in: number of fields */
merge_rec_t* mrec1, /* in: first merge record to be ulint* n_dup, /* in/out: number of duplicates */
compared */ const dfield_t* a, /* in: first tuple to be compared */
merge_rec_t* mrec2, /* in: second merge record to be const dfield_t* b) /* in: second tuple to be compared */
compared */
const ulint* offsets1, /* in: first record offsets */
const ulint* offsets2, /* in: second record offsets */
dict_index_t* index) /* in: index */
{ {
ut_ad(mrec1 && mrec2 && offsets1 && offsets2 && index); int cmp;
ut_ad(rec_validate(mrec1->rec, offsets1));
ut_ad(rec_validate(mrec2->rec, offsets2));
return(cmp_rec_rec(mrec1->rec, mrec2->rec, offsets1, offsets2, index));
}
/*****************************************************************
Merge sort for linked list in memory.
Merge sort takes the input list and makes log N passes along
the list and in each pass it combines each adjacent pair of
small sorted lists into one larger sorted list. When only one
pass is needed the whole output list must have been sorted.
In each pass, two lists of size block_size are merged into lists of
size block_size*2. Initially block_size=1. Merge starts by pointing
a temporary pointer list1 at the head of the list and also preparing
an empty list list_tail where elements will be appended. Then:
1) If list1 is NULL we terminate this pass.
2) Otherwise, there is at least one element in the next
pair of block_size lists therefore, increase the number of
merges performed in this pass.
3) Point another temporary pointer list2 as the same do {
place as list1. Iterate list2 by block_size elements cmp = cmp_dfield_dfield(a++, b++);
or until the end of the list. Let the list_size1 be the } while (!cmp && --n_field);
number of elements in the list2.
4) Let list_size1=merge_size. Now we merge list starting at
list1 of length list_size2 with a list starting at list2 of
length at most list_size1.
5) So, as long as either the list1 is non-empty (list_size1)
or the list2 is non-empty (list_size2 and list2 pointing to
a element):
5.1) Select which list to take the next element from.
If either lists is empty, we choose from the other one.
If both lists are non-empty, compare the first element
of each and choose the lower one.
5.2) Remove that element, tmp, from the start of its
lists, by advancing list1 or list2 to next element
and decreasing list1_size or list2_size.
5.3) Append tmp to list_tail
6) At this point, we have advanced list1 until it is where
list2 started out and we have advanced list2 until it is
pointing at the next pair of block_size lists to merge.
Thus, set list1 to the value of list2 and go back to the
start of this loop.
As soon as a pass like this is performed with only one merge, the
algorithm terminates and output list list_head is sorted. Otherwise,
double the value of block_size and go back to the beginning. */
static
ibool
row_merge_sort_linked_list(
/*=======================*/
/* out: FALSE on error */
dict_index_t* index, /* in: index to be created */
merge_rec_list_t* list) /* in: Pointer to head element */
{
ibool success;
merge_rec_t* list1;
merge_rec_t* list2;
merge_rec_t* list_head;
merge_rec_t* list_tail;
ulint block_size;
ulint list1_size;
ulint list2_size;
ulint i;
mem_heap_t* heap = NULL;
ulint offsets1_[REC_OFFS_SMALL_SIZE];
ulint* offsets1 = offsets1_;
ulint offsets2_[REC_OFFS_SMALL_SIZE];
ulint* offsets2 = offsets2_;
ut_ad(list && list->head && index);
*offsets1_ = (sizeof offsets1_) / sizeof *offsets1_;
*offsets2_ = (sizeof offsets2_) / sizeof *offsets2_;
list_head = list->head;
for (block_size = 1;; block_size *= 2) {
ibool sorted = TRUE;
list1 = list_head;
list_head = NULL;
list_tail = NULL;
for (;;) {
list2 = list1;
list1_size = 0;
list2_size = block_size;
/* Step at most block_size elements along from
list2. */
for (i = 0; i < block_size; i++) {
list1_size++;
list2 = list2->next;
if (!list2) {
list2_size = 0;
break;
}
}
/* If list2 is not NULL, we have two lists to merge.
Otherwise, we have a sorted list. */
while (list1_size || list2_size) {
merge_rec_t* tmp;
/* Merge sort two lists by deciding whether
next element of merge comes from list1 or
list2. */
if (list1_size == 0) {
/* First list is empty, next element
must come from the second list. */
goto pick2;
}
if (list2_size == 0) {
/* Second list is empty, next element
must come from the first list. */
goto pick1;
}
offsets1 = rec_get_offsets(list1->rec, index,
offsets1,
ULINT_UNDEFINED,
&heap);
offsets2 = rec_get_offsets(list2->rec, index,
offsets2,
ULINT_UNDEFINED,
&heap);
switch (row_merge_cmp(list1, list2,
offsets1, offsets2,
index)) {
case 0:
if (UNIV_UNLIKELY
(dict_index_is_unique(index))) {
success = FALSE;
goto func_exit;
}
/* fall through */
case -1:
pick1:
tmp = list1;
list1 = list1->next;
list1_size--;
break;
case 1:
pick2:
tmp = list2;
list2 = list2->next;
if (list2) {
list2_size--;
} else {
list2_size = 0;
}
break;
default:
ut_error;
}
/* Append the element to the merged list */
if (list_tail) {
list_tail->next = tmp;
} else {
list_head = tmp;
}
list_tail = tmp;
}
if (!list2) {
if (!sorted) {
break;
}
list->head = list_head;
list_tail->next = NULL;
success = TRUE;
goto func_exit;
}
sorted = FALSE;
list1 = list2;
}
list_tail->next = NULL; if (!cmp) {
(*n_dup)++;
} }
func_exit: return(cmp);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
return(success);
} }
/***************************************************************** /**************************************************************************
Create and initialize record list used for in-memory merge sort */ Merge sort the tuple buffer in main memory. */
static static
merge_rec_list_t* void
row_merge_create_list(void) row_merge_tuple_sort(
/*=======================*/ /*=================*/
/* out: pointer to list */ ulint n_field,/* in: number of fields */
ulint* n_dup, /* in/out: number of duplicates */
const dfield_t** tuples, /* in/out: tuples */
const dfield_t** aux, /* in/out: work area */
ulint low, /* in: lower bound of the
sorting area, inclusive */
ulint high) /* in: upper bound of the
sorting area, exclusive */
{ {
merge_rec_list_t* list_header; #define row_merge_tuple_sort_ctx(a,b,c,d) \
mem_heap_t* heap = NULL; row_merge_tuple_sort(n_field, n_dup, a, b, c, d)
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, n_dup, a, b)
/* Create list header */ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
heap = mem_heap_create((MERGE_BLOCK_SIZE + sizeof(merge_rec_list_t))); tuples, aux, low, high, row_merge_tuple_cmp_ctx);
list_header = mem_heap_alloc(heap, sizeof(merge_rec_list_t));
list_header->head = NULL;
list_header->tail = NULL;
ut_d(list_header->n_records = 0);
list_header->total_size = sizeof(merge_rec_list_t);
list_header->heap = heap;
return(list_header);
} }
/***************************************************************** /**********************************************************
Add one record to the merge list */ Sort a buffer. */
static static
void ulint
row_merge_list_add( row_merge_buf_sort(
/*===============*/ /*===============*/
merge_rec_t* m_rec, /* in: record to be /* out: number of duplicates
inserted to the list */ encountered */
ulint rec_len, /* in: record length */ row_merge_buf_t* buf) /* in/out: sort buffer */
merge_rec_list_t* list_header) /* in/out: list header */
{ {
ut_ad(m_rec && list_header); ulint n_dup = 0;
m_rec->next = NULL;
list_header->total_size+=rec_len;
if (list_header->tail == NULL) {
list_header->tail = list_header->head = m_rec; row_merge_tuple_sort(dict_index_get_n_fields(buf->index), &n_dup,
} else { buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
list_header->tail->next = m_rec;
list_header->tail = m_rec;
}
ut_d(list_header->n_records++); return(n_dup);
} }
/***************************************************************** /**********************************************************
Write records from the list to the merge block */ Write a buffer to a block. */
static static
merge_rec_list_t* void
row_merge_write_list_to_block( row_merge_buf_write(
/*==========================*/ /*================*/
/* out: pointer to a new list const row_merge_buf_t* buf, /* in: sorted buffer */
where rest of the items are stored */ row_merge_block_t* block) /* out: buffer for writing to file */
merge_rec_list_t* list, /* in: Record list */
merge_block_t* output, /* in: Pointer to block */
dict_index_t* index) /* in: Record descriptor */
{ {
ulint offset = 0; dict_index_t* index = buf->index;
merge_rec_t* m_rec = NULL; ulint n_fields= dict_index_get_n_fields(index);
merge_rec_list_t* new_list = NULL; byte* b = &(*block)[0];
mem_heap_t* heap = NULL;
ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; ulint i;
ulint* sec_offs = sec_offsets_;
for (i = 0; i < buf->n_tuples; i++) {
ut_ad(list && output && index); ulint size;
ulint extra_size;
*sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_; const dfield_t* entry = buf->tuples[i];
output->header.n_records = 0;
size = rec_get_converted_size_comp(buf->index,
/* Write every record which fits to block to the block */ REC_STATUS_ORDINARY,
entry, n_fields, NULL, 0,
m_rec = list->head; &extra_size);
ut_ad(size > extra_size);
while (m_rec) { ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
extra_size -= REC_N_NEW_EXTRA_BYTES;
sec_offs = rec_get_offsets(m_rec->rec, index, sec_offs, size -= REC_N_NEW_EXTRA_BYTES;
ULINT_UNDEFINED, &heap);
/* Encode extra_size + 1 */
if (!row_merge_rec_fits_to_block(sec_offs, offset)) { if (extra_size + 1 < 0x80) {
break; *b++ = extra_size + 1;
} else {
ut_ad(extra_size < 0x8000);
*b++ = 0x80 | ((extra_size + 1) >> 8);
*b++ = (byte) (extra_size + 1);
} }
offset = row_merge_store_rec_to_block(m_rec->rec, ut_ad(b + size < block[1]);
sec_offs, output, offset);
m_rec = m_rec->next;
ut_d(list->n_records--);
}
/* Now create a new list and store rest of the records there.
Note that records must be copied because we deallocate memory
allocated for the original list. */
new_list = row_merge_create_list();
while (m_rec) {
rec_t* rec;
merge_rec_t* n_rec;
void* buff;
*sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_;
sec_offs = rec_get_offsets(m_rec->rec, index, sec_offs, rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
ULINT_UNDEFINED, &heap); REC_STATUS_ORDINARY,
entry, n_fields, NULL, 0);
buff = mem_heap_alloc(new_list->heap, b += size;
rec_offs_size(sec_offs));
n_rec = mem_heap_alloc(new_list->heap, sizeof(merge_rec_t));
rec = rec_copy(buff, m_rec->rec, sec_offs);
n_rec->rec = rec;
row_merge_list_add(n_rec, rec_offs_size(sec_offs), new_list);
m_rec = m_rec->next;
}
/* We can now free original list */
mem_heap_free(list->heap);
if (heap) {
mem_heap_free(heap);
} }
return(new_list); /* Write an "end-of-chunk" marker. */
ut_a(b < block[1]);
*b++ = 0;
#ifdef UNIV_DEBUG_VALGRIND
/* The rest of the block is uninitialized. Initialize it
to avoid bogus warnings. */
memset(b, 0, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */
} }
#ifdef UNIV_DEBUG /**********************************************************
/************************************************************************* Create a memory heap and allocate space for row_merge_rec_offsets(). */
Validate contents of the block */
static static
ibool mem_heap_t*
row_merge_block_validate( row_merge_heap_create(
/*=====================*/ /*==================*/
merge_block_t* block, /* in: block to be printed */ /* out: memory heap */
dict_index_t* index) /* in: record descriptor */ dict_index_t* index, /* in: record descriptor */
ulint** offsets1, /* out: offsets */
ulint** offsets2) /* out: offsets */
{ {
merge_rec_t* mrec; ulint i = REC_OFFS_HEADER_SIZE
ulint offset = 0; + dict_index_get_n_fields(index);
ulint n_recs = 0; mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
mem_heap_t* heap;
ulint sec_offsets1_[REC_OFFS_SMALL_SIZE];
ulint* sec_offs1 = sec_offsets1_;
*sec_offsets1_ = (sizeof sec_offsets1_) / sizeof *sec_offsets1_;
ut_a(block && index);
heap = mem_heap_create(1024);
fprintf(stderr, *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
"Block validate %lu records, " *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
"offset %lu, next %lu\n",
block->header.n_records,
block->header.offset, block->header.next);
ut_a(block->header.n_records > 0); (*offsets1)[0] = (*offsets2)[0] = i;
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
for (n_recs = 0; n_recs < block->header.n_records; n_recs++) { return(heap);
mrec = row_merge_read_rec_from_block(block, &offset, heap,
index);
sec_offs1 = rec_get_offsets(mrec->rec, index, sec_offs1,
ULINT_UNDEFINED, &heap);
ut_a(rec_validate(mrec->rec, sec_offs1));
mem_heap_empty(heap);
}
mem_heap_free(heap);
return(TRUE);
} }
#endif /* UNIV_DEBUG */
/************************************************************************* /**************************************************************************
Merge two blocks, resulting in two sorted blocks. */ Search an index object by name and column names. If several indexes match,
return the index with the max id. */
static static
merge_block_t* dict_index_t*
row_merge_block_merge( row_merge_dict_table_get_index(
/*==================*/ /*===========================*/
/* out: Pointer to first sorted block /* out: matching index,
or NULL in case of error */ NULL if not found */
merge_block_t* block1, /* in: First block to be merged */ dict_table_t* table, /* in: table */
merge_block_t** block2, /* in/out: Second block to be merged. const merge_index_def_t*index_def) /* in: index definition */
Note that contents of the second sorted
block is returned with this parameter.*/
dict_index_t* index) /* in: Index to be created */
{ {
merge_block_t* new_block1; ulint i;
merge_block_t* new_block2; dict_index_t* index;
merge_block_t* tmp; const char** column_names;
ulint nth_rec1 = 0;
ulint nth_rec2 = 0;
ulint offset1 = 0;
ulint offset2 = 0;
ulint offset3 = 0;
ulint offset4 = 0;
ibool fits_to_new = TRUE;
mem_heap_t* heap;
mem_heap_t* offset_heap = NULL;
ulint sec_offsets1_[REC_OFFS_SMALL_SIZE];
ulint* sec_offs1 = sec_offsets1_;
ulint sec_offsets2_[REC_OFFS_SMALL_SIZE];
ulint* sec_offs2 = sec_offsets2_;
ut_ad(block1 && block2 && *block2 && index);
ut_ad(row_merge_block_validate(block1, index));
ut_ad(row_merge_block_validate(*block2, index));
*sec_offsets1_ = (sizeof sec_offsets1_) / sizeof *sec_offsets1_;
*sec_offsets2_ = (sizeof sec_offsets2_) / sizeof *sec_offsets2_;
new_block1 = row_merge_block_create();
new_block2 = row_merge_block_create();
tmp = *block2;
heap = mem_heap_create(256);
/* Copy block offset and next block offset to new blocks */
new_block1->header = block1->header;
new_block2->header = tmp->header;
new_block1->header.n_records = 0;
new_block2->header.n_records = 0;
/* Merge all records from both blocks */
while (nth_rec1 < block1->header.n_records ||
nth_rec2 < tmp->header.n_records) {
merge_rec_t* mrec1 = NULL;
merge_rec_t* mrec2 = NULL;
const ulint* rec_offsets;
mem_heap_empty(heap);
if (nth_rec1 < block1->header.n_records &&
nth_rec2 >= tmp->header.n_records) {
/* If the second block is empty read record from
the first block */
mrec1 = row_merge_read_rec_from_block(
block1, &offset1, heap, index);
sec_offs1 = rec_get_offsets(
mrec1->rec, index, sec_offs1, ULINT_UNDEFINED,
&offset_heap);
rec_offsets = sec_offs1;
ut_ad(rec_validate(mrec1->rec, sec_offs1));
nth_rec1++;
} else if (nth_rec2 < tmp->header.n_records &&
nth_rec1 >= block1->header.n_records) {
/* If the first block is empty read data tuple from
the second block */
mrec1 = row_merge_read_rec_from_block(
tmp, &offset2, heap, index);
sec_offs1 = rec_get_offsets(
mrec1->rec, index, sec_offs1, ULINT_UNDEFINED,
&offset_heap);
rec_offsets = sec_offs1;
ut_ad(rec_validate(mrec1->rec, sec_offs1));
nth_rec2++;
} else {
ulint tmp_offset1 = offset1;
ulint tmp_offset2 = offset2;
/* Both blocks contain record and thus they must
be compared */
mrec1 = row_merge_read_rec_from_block(
block1, &offset1, heap, index);
sec_offs1 = rec_get_offsets(
mrec1->rec, index, sec_offs1, ULINT_UNDEFINED,
&offset_heap);
ut_ad(rec_validate(mrec1->rec, sec_offs1));
mrec2 = row_merge_read_rec_from_block(
tmp, &offset2, heap, index);
sec_offs2 = rec_get_offsets(
mrec2->rec, index, sec_offs2, ULINT_UNDEFINED,
&offset_heap);
ut_ad(rec_validate(mrec2->rec, sec_offs2));
switch (row_merge_cmp(mrec1, mrec2,
sec_offs1, sec_offs2, index)) {
case 0:
if (UNIV_UNLIKELY
(dict_index_is_unique(index))) {
goto error_handling;
}
/* fall through */
case -1:
rec_offsets = sec_offs1;
nth_rec1++;
offset2 = tmp_offset2;
break;
case 1:
mrec1 = mrec2;
rec_offsets = sec_offs2;
nth_rec2++;
offset1 = tmp_offset1;
break;
default:
ut_error;
}
}
ut_ad(mrec1);
ut_ad(rec_validate(mrec1->rec, rec_offsets));
/* If the first output block is not yet full test whether this
new data tuple fits to block. If not this new data tuple must
be inserted to second output block */
if (fits_to_new) {
fits_to_new = row_merge_rec_fits_to_block(
rec_offsets, offset3);
}
if (fits_to_new) {
offset3 = row_merge_store_rec_to_block(
mrec1->rec, rec_offsets, new_block1, offset3);
} else {
ut_a(row_merge_rec_fits_to_block(rec_offsets,
offset4));
offset4 = row_merge_store_rec_to_block(
mrec1->rec, rec_offsets, new_block2, offset4);
}
/* TODO: If we are using variable length keys, then in
some cases these keys do not fit to two empty blocks
in a different order. Therefore, some empty space is
left to every block. However, it has not been proven
that this empty space is enough in all cases. Therefore,
here these overloaded records should be put on another
block. */
}
/* Free memory from old blocks and return pointers to new blocks */ column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
if (offset_heap) { for (i = 0; i < index_def->n_fields; ++i) {
mem_heap_free(offset_heap); column_names[i] = index_def->fields[i].field_name;
} }
mem_heap_free(heap); index = dict_table_get_index_by_max_id(
mem_free(block1); table, index_def->name, column_names, index_def->n_fields);
mem_free(tmp);
ut_ad(row_merge_block_validate(new_block1, index));
ut_ad(row_merge_block_validate(new_block2, index));
*block2 = new_block2;
return(new_block1);
error_handling:
/* Duplicate key was found and unique key was requested. Free all
allocated memory and return NULL */
if (offset_heap) {
mem_heap_free(offset_heap);
}
mem_heap_free(heap); mem_free(column_names);
mem_free(block1);
mem_free(tmp);
mem_free(new_block1);
mem_free(new_block2);
return(NULL); return(index);
} }
/***************************************************************** /************************************************************************
Merge sort for linked list in the disk. Read a merge block from the file system. */
static
Merge sort takes the input list and makes log N passes along ibool
the list and in each pass it combines each adjacent pair of row_merge_read(
small sorted lists into one larger sorted list. When only one /*===========*/
pass is needed the whole output list must have been sorted. /* out: TRUE if request was
successful, FALSE if fail */
The linked list is stored in the file system. File blocks represent int fd, /* in: file descriptor */
items of linked list. The list is singly linked by the next offset ulint offset, /* in: offset where to read */
stored in block header. Offset is calculated from the start of the row_merge_block_t* buf) /* out: data */
file. Thus whenever next item in the list is requested this item is {
read from the disk. Similarly every item is written back to the disk ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
when we have sorted two blocks in the memory.
In each pass, two lists of size block_size are merged into lists of return(UNIV_LIKELY(os_file_read(OS_FILE_FROM_FD(fd), buf,
size block_size*2. Initially block_size=1. Merge starts by pointing (ulint) (ofs & 0xFFFFFFFF),
a temporary pointer list1 at the head of the list and also preparing (ulint) (ofs >> 32),
an empty list list_tail where elements will be appended. Then: sizeof *buf)));
}
1) If block1 is NULL we terminate this pass. /************************************************************************
Write a merge block to the file system. */
static
ibool
row_merge_write(
/*============*/
/* out: TRUE if request was
successful, FALSE if fail */
int fd, /* in: file descriptor */
ulint offset, /* in: offset where to write */
const void* buf) /* in: data */
{
ib_uint64_t ofs = ((ib_uint64_t) offset)
* sizeof(row_merge_block_t);
2) Otherwise, there is at least one element in the next return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
pair of block_size lists therefore, increase the number of (ulint) (ofs & 0xFFFFFFFF),
merges performed in this pass. (ulint) (ofs >> 32),
sizeof(row_merge_block_t))));
}
3) Point another temporary pointer list2 at the same /************************************************************************
place as list1. Iterate list2 by block_size elements Read a merge record. */
or until the end of the list. Let the list_size1 be the static
number of elements in the list2. const byte*
row_merge_read_rec(
/*===============*/
/* out: pointer to next record,
or NULL on I/O error
or end of list */
row_merge_block_t* block, /* in/out: file buffer */
mrec_buf_t* buf, /* in/out: secondary buffer */
const byte* b, /* in: pointer to record */
dict_index_t* index, /* in: index of the record */
int fd, /* in: file descriptor */
ulint* foffs, /* in/out: file offset */
const mrec_t** mrec, /* out: pointer to merge record,
or NULL on end of list
(non-NULL on I/O error) */
ulint* offsets)/* out: offsets of mrec */
{
ulint extra_size;
ulint data_size;
ulint avail_size;
ut_ad(block);
ut_ad(buf);
ut_ad(b >= block[0]);
ut_ad(b < block[1]);
ut_ad(index);
ut_ad(foffs);
ut_ad(mrec);
ut_ad(offsets);
4) Let list_size1=merge_size. Now we merge list starting at ut_ad(*offsets == REC_OFFS_HEADER_SIZE
list1 of length list_size2 with a list starting at list2 of + dict_index_get_n_fields(index));
length at most list_size1.
5) So, as long as either the list1 is non-empty (list_size1) extra_size = *b++;
or the list2 is non-empty (list_size2 and list2 pointing to
a element):
5.1) Select which list to take the next element from. if (UNIV_UNLIKELY(!extra_size)) {
If either list is empty, we choose from the other one. /* End of list */
If both lists are non-empty, compare the first element *mrec = NULL;
of each and choose the lower one. return(NULL);
}
5.2) Remove that element, tmp, from the start of its if (extra_size >= 0x80) {
lists, by advancing list1 or list2 to next element /* Read another byte of extra_size. */
and decreasing list1_size or list2_size.
5.3) Append tmp to list_tail if (UNIV_UNLIKELY(b >= block[1])) {
if (!row_merge_read(fd, ++(*foffs), block)) {
err_exit:
/* Signal I/O error. */
*mrec = b;
return(NULL);
}
6) At this point, we have advanced list1 until it is where /* Wrap around to the beginning of the buffer. */
list2 started out and we have advanced list2 until it is b = block[0];
pointing at the next pair of block_size lists to merge. }
Thus, set list1 to the value of list2 and go back to the
start of this loop.
As soon as a pass like this is performed with only one merge, the extra_size = (extra_size & 0x7f) << 8;
algorithm terminates. Otherwise, double the value of block_size extra_size |= *b++;
and go back to the beginning. */ }
ulint /* Normalize extra_size. Above, value 0 signals "end of list". */
row_merge_sort_linked_list_in_disk( extra_size--;
/*===============================*/
/* out: offset to first block in
the list or ULINT_UNDEFINED in
case of error */
dict_index_t* index, /* in: index to be created */
os_file_t file, /* in: File handle */
int* error) /* out: 0 or error */
{
merge_block_t* block1;
merge_block_t* block2;
merge_block_t* backup1;
merge_block_t* backup2;
merge_file_t output;
ulint block_size;
ulint list_head = 0;
ut_ad(index); /* Read the extra bytes. */
/* Allocate memory for blocks */ if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
backup1 = block1 = row_merge_block_create(); /* The record spans two blocks. Copy the entire record
backup2 = block2 = row_merge_block_create(); to the auxiliary buffer and handle this as a special
case. */
output.file = file; avail_size = block[1] - b;
for (block_size = 1;; block_size *= 2) { memcpy(*buf, b, avail_size);
ibool sorted = TRUE;
ibool list_is_empty = TRUE;
block1 = backup1; if (!row_merge_read(fd, ++(*foffs), block)) {
if (!row_merge_block_read(file, list_head, block1)) {
file_error:
*error = DB_CORRUPTION;
goto err_exit; goto err_exit;
} }
ut_ad(row_merge_block_validate(block1, index));
for (;;) { /* Wrap around to the beginning of the buffer. */
ulint offset = block1->header.offset; b = block[0];
ulint list1_size = 0;
ulint list2_size = block_size;
ulint i;
/* Count how many list elements we have in the list. */ /* Copy the record. */
memcpy(*buf + avail_size, b, extra_size - avail_size);
b += extra_size - avail_size;
for (i = 0; i < block_size; i++) { *mrec = *buf + extra_size;
merge_block_header_t header;
list1_size++; rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
/* Here read only the header to iterate the data_size = rec_offs_data_size(offsets);
list in the disk. */
if (!row_merge_block_header_read(file, offset, /* These overflows should be impossible given that
&header)) { records are much smaller than either buffer, and
goto file_error; the record starts near the beginning of each buffer. */
} ut_a(extra_size + data_size < sizeof *buf);
ut_a(b + data_size < block[1]);
offset = header.next; /* Copy the data bytes. */
memcpy(*buf + extra_size, b, data_size);
b += data_size;
/* If the offset is zero we have arrived to the return(b);
end of disk list */ }
if (!offset) { *mrec = b + extra_size;
break;
}
}
/* If offset is zero we have reached end of the list in rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
the disk. */
if (!offset) { data_size = rec_offs_data_size(offsets);
block2 = NULL; ut_ad(extra_size + data_size < sizeof *buf);
} else {
block2 = backup2;
if (!row_merge_block_read(
file, offset, block2)) {
goto file_error;
}
ut_ad(row_merge_block_validate(block2, index));
}
/* If list2 is not empty, we have two lists to merge. b += extra_size + data_size;
Otherwise, we have a sorted list. */
while (list1_size > 0 || (list2_size > 0 && block2)) {
/* Merge sort two lists by deciding whether
next element of merge comes from list1 or
list2. */
merge_block_t* tmp;
if (list1_size == 0) {
/* First list is empty, next element
must come from the second list. */
tmp = block2;
if (!block2->header.next) {
block2 = NULL;
list2_size = 0;
} else {
list2_size--;
}
} else if (list2_size == 0 || !block2) {
/* Second list is empty, next record
must come from the first list. */
tmp = block1;
list1_size--;
} else {
/* Both lists contain a block and we
need to merge records on these block */
tmp = row_merge_block_merge(
block1, &block2, index);
if (tmp == NULL) {
*error = DB_DUPLICATE_KEY;
goto err_exit;
}
block1 = backup1 = tmp;
backup2 = block2;
list1_size--;
}
/* Store the head offset of the disk if (UNIV_LIKELY(b < block[1])) {
list. Note that only records in the /* The record fits entirely in the block.
blocks are changed not the order of This is the normal case. */
the blocks in the disk. */ return(b);
}
if (list_is_empty) { /* The record spans two blocks. Copy it to buf. */
list_is_empty = FALSE;
list_head = tmp->header.offset;
}
ut_ad(row_merge_block_validate(tmp, index)); avail_size = block[1] - b;
memcpy(*buf, b, avail_size);
*mrec = *buf + extra_size;
rec_offs_make_valid(*mrec, index, offsets);
if (!row_merge_block_write( if (!row_merge_read(fd, ++(*foffs), block)) {
file, tmp->header.offset, tmp)) {
goto file_error;
}
/* Now we can read the next record from the goto err_exit;
selected list if it contains more records */ }
if (tmp->header.next /* Wrap around to the beginning of the buffer. */
&& !row_merge_block_read(file, b = block[0];
tmp->header.next,
tmp)) {
goto file_error;
}
}
/* Now we have processed block_size items from /* Copy the rest of the record. */
the disk. Swap blocks using pointers. */ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
b += extra_size + data_size - avail_size;
if (!block2) { return(b);
if (sorted) { }
goto func_exit;
}
break;
}
sorted = FALSE; /************************************************************************
block2 = backup1; Write a merge record. */
block1 = backup2; static
backup2 = block2; void
backup1 = block1; row_merge_write_rec_low(
} /*====================*/
byte* b, /* out: buffer */
ulint e, /* in: encoded extra_size */
const mrec_t* mrec, /* in: record to write */
const ulint* offsets)/* in: offsets of mrec */
{
if (e < 0x80) {
*b++ = e;
} else {
*b++ = 0x80 | (e >> 8);
*b++ = (byte) e;
} }
err_exit: memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
list_head = ULINT_UNDEFINED;
func_exit:
mem_free(backup1);
mem_free(backup2);
return(list_head);
} }
/************************************************************************ /************************************************************************
Merge sort linked list in the memory and store part of the linked Write a merge record. */
list into a block and write this block to the disk. */
static static
ibool byte*
row_merge_sort_and_store( row_merge_write_rec(
/*=====================*/ /*================*/
/* out: FALSE on error */ /* out: pointer to end of block,
dict_index_t* index, /* in: Index */ or NULL on error */
merge_file_t* file, /* in: File where to write index row_merge_block_t* block, /* in/out: file buffer */
entries */ mrec_buf_t* buf, /* in/out: secondary buffer */
merge_block_t* block, /* in/out: Block where to store byte* b, /* in: pointer to end of block */
the list */ int fd, /* in: file descriptor */
merge_rec_list_t** list) /* in/out: Pointer to the list */ ulint* foffs, /* in/out: file offset */
const mrec_t* mrec, /* in: record to write */
const ulint* offsets)/* in: offsets of mrec */
{ {
ut_ad(index && file && block && list); ulint extra_size;
ulint size;
ulint avail_size;
ut_ad(block);
ut_ad(buf);
ut_ad(b >= block[0]);
ut_ad(b < block[1]);
ut_ad(mrec);
ut_ad(foffs);
ut_ad(mrec < block[0] || mrec > block[1]);
ut_ad(mrec < buf[0] || mrec > buf[1]);
/* Normalize extra_size. Value 0 signals "end of list". */
extra_size = rec_offs_extra_size(offsets) + 1;
size = extra_size + (extra_size >= 0x80)
+ rec_offs_data_size(offsets);
if (UNIV_UNLIKELY(b + size >= block[1])) {
/* The record spans two blocks.
Copy it to the temporary buffer first. */
avail_size = block[1] - b;
row_merge_write_rec_low(buf[0], extra_size, mrec, offsets);
/* Copy the head of the temporary buffer, write
the completed block, and copy the tail of the
record to the head of the new block. */
memcpy(b, buf[0], avail_size);
if (!row_merge_write(fd, (*foffs)++, block)) {
return(NULL);
}
/* Firstly, merge sort linked list in the memory */ /* Copy the rest. */
if (!row_merge_sort_linked_list(index, *list)) { b = block[0];
return(FALSE); memcpy(b, buf[0] + avail_size, size - avail_size);
b += size - avail_size;
} else {
row_merge_write_rec_low(b, extra_size, mrec, offsets);
b += rec_offs_size(offsets);
} }
/* Secondly, write part of the linked list to the block */ return(b);
*list = row_merge_write_list_to_block(*list, block, index);
ut_ad(row_merge_block_validate(block, index));
/* Next block will be written directly behind this one. This will
create a 'linked list' of blocks to the disk. */
block->header.offset = file->offset;
block->header.next = ++file->offset;
/* Thirdly, write block to the disk */
return(row_merge_block_write(file->file, block->header.offset, block));
} }
#ifdef UNIV_DEBUG_INDEX_CREATE
/************************************************************************ /************************************************************************
Pretty print data tuple */ Write an end-of-list marker. */
static static
void byte*
row_merge_dtuple_print( row_merge_write_eof(
/*===================*/ /*================*/
FILE* f, /* in: output stream */ /* out: pointer to end of block,
dtuple_t* dtuple) /* in: data tuple */ or NULL on error */
row_merge_block_t* block, /* in/out: file buffer */
byte* b, /* in: pointer to end of block */
int fd, /* in: file descriptor */
ulint* foffs) /* in/out: file offset */
{ {
ulint n_fields; ut_ad(block);
ulint i; ut_ad(b >= block[0]);
ut_ad(b < block[1]);
ut_ad(f && dtuple); ut_ad(foffs);
n_fields = dtuple_get_n_fields(dtuple); *b++ = 0;
#ifdef UNIV_DEBUG_VALGRIND
fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); /* The rest of the block is uninitialized. Initialize it
to avoid bogus warnings. */
for (i = 0; i < n_fields; i++) { memset(b, 0, block[1] - b);
dfield_t* dfield; #endif /* UNIV_DEBUG_VALGRIND */
dfield = dtuple_get_nth_field(dtuple, i); if (!row_merge_write(fd, (*foffs)++, block)) {
return(NULL);
fprintf(f, "%lu: ", (ulong) i);
if (dfield->len != UNIV_SQL_NULL) {
dfield_print_also_hex(dfield);
} else {
fputs(" SQL NULL", f);
}
putc(';', f);
} }
putc('\n', f); return(block[0]);
ut_ad(dtuple_validate(dtuple)); }
/*****************************************************************
Compare two merge records. */
static
int
row_merge_cmp(
/*==========*/
/* out: 1, 0, -1 if mrec1 is
greater, equal, less,
respectively, than mrec2 */
const mrec_t* mrec1, /* in: first merge record to be
compared */
const mrec_t* mrec2, /* in: second merge record to be
compared */
const ulint* offsets1, /* in: first record offsets */
const ulint* offsets2, /* in: second record offsets */
dict_index_t* index) /* in: index */
{
return(cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index));
} }
#endif /* UNIV_DEBUG_INDEX_CREATE */
/************************************************************************ /************************************************************************
Reads the clustered index of the table and creates temporary files Reads the clustered index of the table and creates temporary files
containing index entries for indexes to be built. */ containing index entries for indexes to be built. */
static
ulint ulint
row_merge_read_clustered_index( row_merge_read_clustered_index(
/*===========================*/ /*===========================*/
/* out: DB_SUCCESS if successful, /* out: DB_SUCCESS or error */
or ERROR code */ trx_t* trx, /* in: transaction */
trx_t* trx, /* in: transaction */ dict_table_t* table, /* in: table where index is created */
dict_table_t* table, /* in: table where index is created */ dict_index_t** index, /* in: indexes to be created */
dict_index_t** index, /* in: indexes to be created */ merge_file_t* files, /* in: temporary files */
merge_file_t* files, /* in: Files where to write index ulint n_index,/* in: number of indexes to create */
entries */ row_merge_block_t* block) /* in/out: file buffer */
ulint num_of_idx) /* in: number of indexes to be
created */
{ {
dict_index_t* clust_index; /* Clustered index */ dict_index_t* clust_index; /* Clustered index */
merge_rec_t* new_mrec; /* New merge record */ mem_heap_t* row_heap; /* Heap memory to create
mem_heap_t* row_heap; /* Heap memory to create
clustered index records */ clustered index records */
mem_heap_t* heap; /* Memory heap for row_merge_buf_t** merge_buf; /* Temporary list for records*/
record lists and offsets */ btr_pcur_t pcur; /* Persistent cursor on the
merge_block_t* block; /* Merge block where records
are stored for memory sort and
then written to the disk */
merge_rec_list_t** merge_list; /* Temporary list for records*/
btr_pcur_t pcur; /* Persistent cursor on the
clustered index */ clustered index */
mtr_t mtr; /* Mini transaction */ mtr_t mtr; /* Mini transaction */
ulint err = DB_SUCCESS; /* Return code */ ulint err = DB_SUCCESS;/* Return code */
ulint idx_num = 0; /* Index number */ ulint i;
ulint n_blocks = 0; /* Number of blocks written
to disk */
ulint sec_offsets_[REC_OFFS_NORMAL_SIZE];
ulint* sec_offs = sec_offsets_;
*sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_;
trx->op_info="reading clustered index"; trx->op_info = "reading clustered index";
ut_ad(trx); ut_ad(trx);
ut_ad(table); ut_ad(table);
ut_ad(index); ut_ad(index);
ut_ad(files); ut_ad(files);
/* Create block where index entries are stored */ /* Create and initialize memory for record buffers */
block = row_merge_block_create();
/* Create and initialize memory for record lists */ merge_buf = mem_alloc(n_index * sizeof *merge_buf);
heap = mem_heap_create(256); for (i = 0; i < n_index; i++) {
merge_list = mem_heap_alloc(heap, num_of_idx * sizeof *merge_list); merge_buf[i] = row_merge_buf_create(index[i]);
for (idx_num = 0; idx_num < num_of_idx; idx_num++) {
merge_list[idx_num] = row_merge_create_list();
} }
mtr_start(&mtr); mtr_start(&mtr);
...@@ -1467,13 +850,14 @@ row_merge_read_clustered_index( ...@@ -1467,13 +850,14 @@ row_merge_read_clustered_index(
btr_pcur_open_at_index_side( btr_pcur_open_at_index_side(
TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
row_heap = mem_heap_create(512); row_heap = mem_heap_create(UNIV_PAGE_SIZE);
/* Iterate all records in the clustered index */ /* Scan the clustered index. */
for (;;) { for (;;) {
const rec_t* rec; const rec_t* rec;
dtuple_t* row; dtuple_t* row;
row_ext_t* ext; row_ext_t* ext;
ibool has_next = TRUE;
btr_pcur_move_to_next_on_page(&pcur, &mtr); btr_pcur_move_to_next_on_page(&pcur, &mtr);
...@@ -1486,174 +870,328 @@ row_merge_read_clustered_index( ...@@ -1486,174 +870,328 @@ row_merge_read_clustered_index(
mtr_start(&mtr); mtr_start(&mtr);
btr_pcur_restore_position(BTR_SEARCH_LEAF, btr_pcur_restore_position(BTR_SEARCH_LEAF,
&pcur, &mtr); &pcur, &mtr);
if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
break;
}
} }
rec = btr_pcur_get_rec(&pcur); if (UNIV_LIKELY(has_next)) {
rec = btr_pcur_get_rec(&pcur);
/* We don't count the delete marked records as "Inserted" */ /* Skip delete marked records. */
if (!rec_get_deleted_flag(rec, dict_table_is_comp(table))) { if (rec_get_deleted_flag(rec,
dict_table_is_comp(table))) {
continue;
}
srv_n_rows_inserted++; srv_n_rows_inserted++;
/* Build row based on clustered index */
row = row_build(ROW_COPY_POINTERS, clust_index,
rec, NULL, &ext, row_heap);
/* Build all entries for all the indexes to be created
in a single scan of the clustered index. */
} }
/* Build row based on clustered index */ for (i = 0; i < n_index; i++) {
mem_heap_empty(row_heap); row_merge_buf_t* buf = merge_buf[i];
merge_file_t* file = &files[i];
row = row_build(ROW_COPY_POINTERS, if (UNIV_LIKELY
clust_index, rec, NULL, &ext, row_heap); (has_next && row_merge_buf_add(buf, row, ext))) {
continue;
}
ut_ad(buf->n_tuples || !has_next);
/* If the user has requested the creation of several indexes /* We have enough data tuples to form a block.
for the same table, we build all index entries in a single Sort them and write to disk. */
pass over the clustered index. */
for (idx_num = 0; idx_num < num_of_idx; idx_num++) { if (buf->n_tuples
&& row_merge_buf_sort(buf)
&& dict_index_is_unique(buf->index)) {
err = DB_DUPLICATE_KEY;
goto func_exit;
}
dtuple_t* index_tuple; row_merge_buf_write(buf, block);
index_tuple = row_build_index_entry( if (!row_merge_write(file->fd, file->offset++,
row, ext, block)) {
index[idx_num], merge_list[idx_num]->heap); trx->error_key_num = i;
err = DB_OUT_OF_FILE_SPACE;
goto func_exit;
}
#ifdef UNIV_DEBUG_INDEX_CREATE row_merge_buf_empty(buf);
row_merge_dtuple_print(stderr, index_tuple); }
#endif
new_mrec = row_merge_rec_create( mem_heap_empty(row_heap);
index_tuple,
ext ? ext->ext : NULL, ext ? ext->n_ext : 0,
index[idx_num], merge_list[idx_num]->heap);
sec_offs = rec_get_offsets( if (UNIV_UNLIKELY(!has_next)) {
new_mrec->rec, index[idx_num], sec_offs, goto func_exit;
ULINT_UNDEFINED, &heap); }
}
/* Add data tuple to linked list of data tuples */ func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
mem_heap_free(row_heap);
row_merge_list_add( for (i = 0; i < n_index; i++) {
new_mrec, rec_offs_size(sec_offs), row_merge_buf_free(merge_buf[i]);
merge_list[idx_num]); }
/* If we have enough data tuples to form a block mem_free(merge_buf);
sort linked list and store it to the block and
write this block to the disk. Note that not all
data tuples in the list fit to the block.*/
if (merge_list[idx_num]->total_size >= trx->op_info = "";
MERGE_BLOCK_SIZE) {
if (!row_merge_sort_and_store( return(err);
index[idx_num], }
&files[idx_num],
block,
&(merge_list[idx_num]))) {
trx->error_key_num = idx_num; /*****************************************************************
err = DB_DUPLICATE_KEY; Merge two blocks of linked lists on disk and write a bigger block. */
goto error_handling; static
} ulint
row_merge_blocks(
/*=============*/
/* out: DB_SUCCESS or error code */
dict_index_t* index, /* in: index being created */
merge_file_t* file, /* in/out: file containing
index entries */
row_merge_block_t* block1, /* in/out: input buffer */
row_merge_block_t* block2, /* in/out: input buffer */
row_merge_block_t* block3, /* in/out: output buffer */
ulint* foffs1, /* in/out: offset of first
source list in the file */
ulint* foffs2, /* in/out: offset of second
source list in the file */
merge_file_t* of) /* in/out: output file */
{
mem_heap_t* heap; /* memory heap for offsets1, offsets2 */
mrec_buf_t buf1; /* buffer for handling split mrec1 in block1 */
mrec_buf_t buf2; /* buffer for handling split mrec2 in block2 */
mrec_buf_t buf3; /* buffer for handling split mrec in block3 */
const byte* b1; /* pointer to block1 */
const byte* b2; /* pointer to block2 */
byte* b3; /* pointer to block3 */
const mrec_t* mrec1; /* merge record, points to block1 or buf1 */
const mrec_t* mrec2; /* merge record, points to block2 or buf2 */
ulint* offsets1;/* offsets of mrec1 */
ulint* offsets2;/* offsets of mrec2 */
heap = row_merge_heap_create(index, &offsets1, &offsets2);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \
do { \
b3 = row_merge_write_rec(block3, &buf3, b3, \
of->fd, &of->offset, \
mrec##N, offsets##N); \
if (UNIV_UNLIKELY(!b3)) { \
goto corrupt; \
} \
b##N = row_merge_read_rec(block##N, &buf##N, \
b##N, index, \
file->fd, foffs##N, \
&mrec##N, offsets##N); \
if (UNIV_UNLIKELY(!b##N)) { \
if (mrec##N) { \
goto corrupt; \
} \
AT_END; \
} \
} while (0)
if (!row_merge_read(file->fd, *foffs1, block1)
|| !row_merge_read(file->fd, *foffs2, block2)) {
corrupt:
mem_heap_free(heap);
return(DB_CORRUPTION);
}
b1 = *block1;
b2 = *block2;
b3 = *block3;
b1 = row_merge_read_rec(block1, &buf1, b1, index, file->fd,
foffs1, &mrec1, offsets1);
b2 = row_merge_read_rec(block2, &buf2, b2, index, file->fd,
foffs2, &mrec2, offsets2);
if (UNIV_UNLIKELY(!b1 && mrec1)
|| UNIV_UNLIKELY(!b2 && mrec2)) {
n_blocks++; goto corrupt;
files[idx_num].num_of_blocks++; }
while (mrec1 && mrec2) {
switch (row_merge_cmp(mrec1, mrec2,
offsets1, offsets2, index)) {
case 0:
if (UNIV_UNLIKELY
(dict_index_is_unique(index))) {
mem_heap_free(heap);
return(DB_DUPLICATE_KEY);
} }
/* fall through */
case -1:
ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
break;
case 1:
ROW_MERGE_WRITE_GET_NEXT(2, goto merged);
break;
default:
ut_error;
} }
} }
/* Now we have to write all remaining items in the list to merged:
blocks and write these blocks to the disk */ if (mrec1) {
/* append all mrec1 to output */
for (;;) {
ROW_MERGE_WRITE_GET_NEXT(1, break);
}
}
for (idx_num = 0; idx_num < num_of_idx; idx_num++) { if (mrec2) {
/* append all mrec2 to output */
for (;;) {
ROW_MERGE_WRITE_GET_NEXT(2, break);
}
}
/* While we have items in the list write them mem_heap_free(heap);
to the block */ b3 = row_merge_write_eof(block3, b3, of->fd, &of->offset);
ut_ad(!merge_list[idx_num]->head return(b3 ? DB_SUCCESS : DB_CORRUPTION);
== !merge_list[idx_num]->tail); }
ut_ad(!merge_list[idx_num]->n_records
== !merge_list[idx_num]->head);
if (merge_list[idx_num]->head) { /*****************************************************************
Merge disk files. */
static
ulint
row_merge(
/*======*/
/* out: DB_SUCCESS
or error code */
dict_index_t* index, /* in: index being created */
merge_file_t* file, /* in/out: file containing
index entries */
row_merge_block_t* block1, /* in/out: input buffer */
row_merge_block_t* block2, /* in/out: input buffer */
row_merge_block_t* block3, /* in/out: output buffer */
int* tmpfd) /* in/out: temporary file
handle */
{
ulint foffs1; /* first input offset */
ulint foffs2; /* second input offset */
ulint half; /* upper limit of foffs1 */
ulint error; /* error code */
merge_file_t of; /* output file */
/* Next block will be written directly of.fd = *tmpfd;
behind this one. This will create a of.offset = 0;
'linked list' of blocks to the disk. */
block->header.offset = files[idx_num].offset; /* Split the input file in two halves. */
block->header.next = files[idx_num].offset + 1; half = file->offset / 2;
if (!row_merge_sort_and_store( /* Merge blocks to the output file. */
index[idx_num], foffs1 = 0;
&files[idx_num], foffs2 = half;
block,
&(merge_list[idx_num]))) {
trx->error_key_num = idx_num; for (; foffs1 < half; foffs1++, foffs2++) {
err = DB_DUPLICATE_KEY; error = row_merge_blocks(index, file, block1, block2, block3,
goto error_handling; &foffs1, &foffs2, &of);
}
files[idx_num].num_of_blocks++; if (error != DB_SUCCESS) {
n_blocks++; return(error);
} }
}
/* Write the last block. */ /* Copy the last block, if there is one. */
block->header.next = 0; /* end-of-list marker */ while (foffs2 < file->offset) {
if (!row_merge_read(file->fd, foffs2++, block2)
if (!row_merge_block_header_write( || !row_merge_write(of.fd, of.offset++, block2)) {
files[idx_num].file, &block->header)) { return(DB_CORRUPTION);
err = DB_CORRUPTION;
goto error_handling;
} }
} }
#ifdef UNIV_DEBUG_INDEX_CREATE /* Swap file descriptors for the next pass. */
fprintf(stderr, "Stored %lu blocks\n", n_blocks); *tmpfd = file->fd;
#endif *file = of;
error_handling: return(DB_SUCCESS);
}
/* Cleanup resources */ /*****************************************************************
Merge disk files. */
static
ulint
row_merge_sort(
/*===========*/
/* out: DB_SUCCESS
or error code */
dict_index_t* index, /* in: index being created */
merge_file_t* file, /* in/out: file containing
index entries */
row_merge_block_t* block1, /* in/out: input buffer */
row_merge_block_t* block2, /* in/out: input buffer */
row_merge_block_t* block3, /* in/out: output buffer */
int* tmpfd) /* in/out: temporary file
handle */
{
ulint blksz; /* block size */
btr_pcur_close(&pcur); blksz = 1;
mtr_commit(&mtr);
mem_heap_free(row_heap);
mem_free(block);
for (idx_num = 0; idx_num < num_of_idx; idx_num++) { for (;; blksz *= 2) {
mem_heap_free(merge_list[idx_num]->heap); ulint error = row_merge(index, file,
} block1, block2, block3, tmpfd);
if (error != DB_SUCCESS) {
return(error);
}
mem_heap_free(heap); if (blksz >= file->offset) {
/* everything is in a single block */
break;
}
trx->op_info=""; /* Round up the file size to a multiple of blksz. */
file->offset = ut_2pow_round(file->offset - 1, blksz) + blksz;
}
return(err); return(DB_SUCCESS);
} }
/************************************************************************ /************************************************************************
Read sorted file containing index data tuples and insert these data Read sorted file containing index data tuples and insert these data
tuples to the index */ tuples to the index */
static
ulint ulint
row_merge_insert_index_tuples( row_merge_insert_index_tuples(
/*==========================*/ /*==========================*/
/* out: 0 or error number */ /* out: DB_SUCCESS or error number */
trx_t* trx, /* in: transaction */ trx_t* trx, /* in: transaction */
dict_index_t* index, /* in: index */ dict_index_t* index, /* in: index */
dict_table_t* table, /* in: table */ dict_table_t* table, /* in: table */
os_file_t file, /* in: file handle */ int fd, /* in: file descriptor */
ulint offset) /* in: offset where to start row_merge_block_t* block) /* in/out: file buffer */
reading */
{ {
merge_block_t* block; mrec_buf_t buf;
que_thr_t* thr; const byte* b;
ins_node_t* node; que_thr_t* thr;
mem_heap_t* heap; ins_node_t* node;
mem_heap_t* graph_heap; mem_heap_t* tuple_heap;
ulint error = DB_SUCCESS; mem_heap_t* graph_heap;
ulint error = DB_SUCCESS;
ulint foffs = 0;
ulint* offsets;
ut_ad(trx && index && table); ut_ad(trx);
ut_ad(index);
ut_ad(table);
/* We use the insert query graph as the dummy graph /* We use the insert query graph as the dummy graph
needed in the row module call */ needed in the row module call */
...@@ -1667,70 +1205,67 @@ row_merge_insert_index_tuples( ...@@ -1667,70 +1205,67 @@ row_merge_insert_index_tuples(
que_thr_move_to_run_state_for_mysql(thr, trx); que_thr_move_to_run_state_for_mysql(thr, trx);
block = row_merge_block_create(); tuple_heap = mem_heap_create(1000);
heap = mem_heap_create(1000);
do {
ulint n_rec;
ulint tuple_offset = 0;
if (!row_merge_block_read(file, offset, block)) {
error = DB_CORRUPTION;
break;
}
ut_ad(row_merge_block_validate(block, index));
for (n_rec = 0; n_rec < block->header.n_records; n_rec++) {
merge_rec_t* mrec = row_merge_read_rec_from_block(
block, &tuple_offset, heap, index);
if (!rec_get_deleted_flag(mrec->rec, 0)) { {
ulint i = REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
offsets[0] = i;
offsets[1] = dict_index_get_n_fields(index);
}
dtuple_t* dtuple = row_rec_to_index_entry( b = *block;
ROW_COPY_POINTERS,
index, mrec->rec, heap);
node->row = dtuple; if (!row_merge_read(fd, foffs, block)) {
node->table = table; error = DB_CORRUPTION;
node->trx_id = trx->id; } else {
for (;;) {
const mrec_t* mrec;
dtuple_t* dtuple;
b = row_merge_read_rec(block, &buf, b, index,
fd, &foffs, &mrec, offsets);
if (UNIV_UNLIKELY(!b)) {
/* End of list, or I/O error */
if (mrec) {
error = DB_CORRUPTION;
}
break;
}
ut_ad(dtuple_validate(dtuple)); dtuple = row_rec_to_index_entry_low(
mrec, index, offsets, tuple_heap);
#ifdef UNIV_DEBUG_INDEX_CREATE node->row = dtuple;
row_merge_dtuple_print(stderr, dtuple); node->table = table;
#endif node->trx_id = trx->id;
do { ut_ad(dtuple_validate(dtuple));
thr->run_node = thr;
thr->prev_node = thr->common.parent;
error = row_ins_index_entry( do {
index, dtuple, NULL, 0, thr); thr->run_node = thr;
thr->prev_node = thr->common.parent;
if (error == DB_SUCCESS) { error = row_ins_index_entry(
goto next_rec; index, dtuple, NULL, 0, thr);
}
thr->lock_state = QUE_THR_LOCK_ROW; if (UNIV_LIKELY(error == DB_SUCCESS)) {
trx->error_state = error; goto next_rec;
que_thr_stop_for_mysql(thr); }
thr->lock_state = QUE_THR_LOCK_NOLOCK;
} while (row_mysql_handle_errors(&error, trx,
thr, NULL));
goto err_exit; thr->lock_state = QUE_THR_LOCK_ROW;
} trx->error_state = error;
que_thr_stop_for_mysql(thr);
thr->lock_state = QUE_THR_LOCK_NOLOCK;
} while (row_mysql_handle_errors(&error, trx,
thr, NULL));
goto err_exit;
next_rec: next_rec:
mem_heap_empty(heap); mem_heap_empty(tuple_heap);
} }
}
offset = block->header.next;
/* If we have reached the end of the disk list we have
inserted all of the index entries to the index. */
} while (offset);
que_thr_stop_for_mysql_no_error(thr, trx); que_thr_stop_for_mysql_no_error(thr, trx);
err_exit: err_exit:
...@@ -1738,8 +1273,7 @@ row_merge_insert_index_tuples( ...@@ -1738,8 +1273,7 @@ row_merge_insert_index_tuples(
trx->op_info = ""; trx->op_info = "";
mem_free(block); mem_heap_free(tuple_heap);
mem_heap_free(heap);
return(error); return(error);
} }
...@@ -1827,17 +1361,29 @@ row_merge_drop_indexes( ...@@ -1827,17 +1361,29 @@ row_merge_drop_indexes(
} }
/************************************************************************* /*************************************************************************
Allocate and initialize memory for a merge file structure */ Create a merge file. */
static
void void
row_merge_file_create( row_merge_file_create(
/*==================*/ /*==================*/
merge_file_t* merge_file) /* out: merge file structure */ merge_file_t* merge_file) /* out: merge file structure */
{ {
merge_file->file = innobase_mysql_tmpfile(); merge_file->fd = innobase_mysql_tmpfile();
merge_file->offset = 0; merge_file->offset = 0;
merge_file->num_of_blocks = 0; }
/*************************************************************************
Destroy a merge file. */
static
void
row_merge_file_destroy(
/*===================*/
merge_file_t* merge_file) /* out: merge file structure */
{
if (merge_file->fd != -1) {
close(merge_file->fd);
merge_file->fd = -1;
}
} }
/************************************************************************* /*************************************************************************
...@@ -1858,10 +1404,7 @@ row_merge_create_temporary_table( ...@@ -1858,10 +1404,7 @@ row_merge_create_temporary_table(
ulint error; ulint error;
ut_ad(table_name && table && error); ut_ad(table_name && table && error);
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(mutex_own(&dict_sys->mutex));
#endif /* UNIV_SYNC_DEBUG */
error = row_undo_report_create_table_dict_operation(trx, table_name); error = row_undo_report_create_table_dict_operation(trx, table_name);
...@@ -1901,11 +1444,12 @@ row_merge_create_temporary_table( ...@@ -1901,11 +1444,12 @@ row_merge_create_temporary_table(
} }
/************************************************************************* /*************************************************************************
Rename the indexes in the dicitionary. */ Rename the indexes in the dictionary. */
ulint ulint
row_merge_rename_index( row_merge_rename_index(
/*===================*/ /*===================*/
/* out: DB_SUCCESS if all OK */
trx_t* trx, /* in: Transaction */ trx_t* trx, /* in: Transaction */
dict_table_t* table, /* in: Table for index */ dict_table_t* table, /* in: Table for index */
dict_index_t* index) /* in: Index to rename */ dict_index_t* index) /* in: Index to rename */
...@@ -1976,7 +1520,7 @@ row_merge_create_index( ...@@ -1976,7 +1520,7 @@ row_merge_create_index(
/* Create the index prototype, using the passed in def, this is not /* Create the index prototype, using the passed in def, this is not
a persistent operation. We pass 0 as the space id, and determine at a persistent operation. We pass 0 as the space id, and determine at
a lower level the space id where to store the table.*/ a lower level the space id where to store the table. */
index = dict_mem_index_create(table->name, index_def->name, index = dict_mem_index_create(table->name, index_def->name,
0, index_def->ind_type, n_fields); 0, index_def->ind_type, n_fields);
...@@ -2045,7 +1589,7 @@ row_merge_create_index( ...@@ -2045,7 +1589,7 @@ row_merge_create_index(
} }
/************************************************************************* /*************************************************************************
Check if a transaction can use an index.*/ Check if a transaction can use an index. */
ibool ibool
row_merge_is_index_usable( row_merge_is_index_usable(
...@@ -2061,13 +1605,12 @@ row_merge_is_index_usable( ...@@ -2061,13 +1605,12 @@ row_merge_is_index_usable(
} }
/************************************************************************* /*************************************************************************
Drop the old table.*/ Drop the old table. */
ulint ulint
row_merge_drop_table( row_merge_drop_table(
/*=================*/ /*=================*/
/* out: DB_SUCCESS if all OK else /* out: DB_SUCCESS or error code */
error code.*/
trx_t* trx, /* in: transaction */ trx_t* trx, /* in: transaction */
dict_table_t* table) /* in: table to drop */ dict_table_t* table) /* in: table to drop */
{ {
...@@ -2084,7 +1627,7 @@ row_merge_drop_table( ...@@ -2084,7 +1627,7 @@ row_merge_drop_table(
/* Drop the table immediately iff it is not referenced by MySQL */ /* Drop the table immediately iff it is not referenced by MySQL */
if (table->n_mysql_handles_opened == 0) { if (table->n_mysql_handles_opened == 0) {
/* Set the commit flag to FALSE.*/ /* Set the commit flag to FALSE. */
err = row_drop_table_for_mysql(table->name, trx, FALSE); err = row_drop_table_for_mysql(table->name, trx, FALSE);
} }
...@@ -2094,3 +1637,103 @@ row_merge_drop_table( ...@@ -2094,3 +1637,103 @@ row_merge_drop_table(
return(err); return(err);
} }
/*************************************************************************
Build indexes on a table by reading a clustered index,
creating a temporary file containing index entries, merge sorting
these index entries and inserting sorted index entries to indexes.
One temporary merge file is created per index, plus one scratch file
that is handed to row_merge_sort() as the temporary file handle.
If any of these temporary files cannot be created, the function returns
DB_OUT_OF_FILE_SPACE without reading the clustered index. */

ulint
row_merge_build_indexes(
/*====================*/
					/* out: DB_SUCCESS or error code */
	trx_t*		trx,		/* in: transaction */
	dict_table_t*	old_table,	/* in: Table where rows are
					read from */
	dict_table_t*	new_table,	/* in: Table where indexes are
					created. Note that old_table ==
					new_table if we are creating
					secondary keys. */
	dict_index_t**	indexes,	/* in: indexes to be created */
	ulint		n_indexes)	/* in: size of indexes[] */
{
	merge_file_t*		merge_files;
	row_merge_block_t*	block1;
	row_merge_block_t*	block2;
	row_merge_block_t*	block3;
	ulint			i;
	ulint			error = DB_SUCCESS;
	int			tmpfd;

	ut_ad(trx);
	ut_ad(old_table);
	ut_ad(new_table);
	ut_ad(indexes);
	ut_ad(n_indexes);

	trx_start_if_not_started(trx);

	/* Allocate memory for merge file data structure and initialize
	fields */

	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
	block1 = mem_alloc(sizeof *block1);
	block2 = mem_alloc(sizeof *block2);
	block3 = mem_alloc(sizeof *block3);

	/* Create every merge file even if an earlier one failed, so that
	each merge_files[i].fd is initialized before the cleanup loop at
	func_exit inspects it. */
	for (i = 0; i < n_indexes; i++) {

		row_merge_file_create(&merge_files[i]);

		if (merge_files[i].fd < 0) {
			error = DB_OUT_OF_FILE_SPACE;
		}
	}

	tmpfd = innobase_mysql_tmpfile();

	if (tmpfd < 0) {
		error = DB_OUT_OF_FILE_SPACE;
	}

	if (error != DB_SUCCESS) {
		/* A temporary file could not be created; do not start
		scanning or sorting with an invalid file descriptor. */
		goto func_exit;
	}

	/* Read clustered index of the table and create files for
	secondary index entries for merge sort */

	error = row_merge_read_clustered_index(
		trx, old_table, indexes, merge_files, n_indexes, block1);

	if (error != DB_SUCCESS) {

		goto func_exit;
	}

	trx_start_if_not_started(trx);

	/* Now we have files containing index entries ready for
	sorting and inserting. */

	for (i = 0; i < n_indexes; i++) {
		error = row_merge_sort(indexes[i], &merge_files[i],
				       block1, block2, block3, &tmpfd);

		if (error == DB_SUCCESS) {
			error = row_merge_insert_index_tuples(
				trx, indexes[i], new_table,
				merge_files[i].fd, block1);
		}

		/* Close the temporary file to free up space. */
		row_merge_file_destroy(&merge_files[i]);

		if (error != DB_SUCCESS) {
			trx->error_key_num = i;
			goto func_exit;
		}
	}

func_exit:
	/* tmpfd is negative only when its creation failed. */
	if (tmpfd >= 0) {
		close(tmpfd);
	}

	/* row_merge_file_destroy() skips files whose fd is already -1,
	i.e. those closed in the loop above or never created. */
	for (i = 0; i < n_indexes; i++) {
		row_merge_file_destroy(&merge_files[i]);
	}

	mem_free(merge_files);
	mem_free(block1);
	mem_free(block2);
	mem_free(block3);

	return(error);
}
...@@ -33,7 +33,6 @@ Created 9/17/2000 Heikki Tuuri ...@@ -33,7 +33,6 @@ Created 9/17/2000 Heikki Tuuri
#include "btr0sea.h" #include "btr0sea.h"
#include "fil0fil.h" #include "fil0fil.h"
#include "ibuf0ibuf.h" #include "ibuf0ibuf.h"
#include "row0merge.h"
/* A dummy variable used to fool the compiler */ /* A dummy variable used to fool the compiler */
ibool row_mysql_identically_false = FALSE; ibool row_mysql_identically_false = FALSE;
...@@ -4492,93 +4491,6 @@ row_create_index_graph_for_mysql( ...@@ -4492,93 +4491,6 @@ row_create_index_graph_for_mysql(
return(err); return(err);
} }
/*************************************************************************
Build new indexes to a table: scan the clustered index once to fill one
temporary merge file per index, then, for every index whose file received
at least one block, merge sort the file and insert the sorted entries
into the index. */

ulint
row_build_index_for_mysql(
/*======================*/
					/* out: 0 or error code */
	trx_t*		trx,		/* in: transaction */
	dict_table_t*	old_table,	/* in: Table where rows are
					read from */
	dict_table_t*	new_table,	/* in: Table where indexes are
					created. Note that old_table ==
					new_table if we are creating a
					secondary keys. */
	dict_index_t**	index,		/* in: Indexes to be created */
	ulint		num_of_keys)	/* in: Number of indexes to be
					created */
{
	merge_file_t*	files;
	ulint		k;
	ulint		error;

	ut_ad(trx && old_table && new_table && index && num_of_keys);

	trx_start_if_not_started(trx);

	/* One merge file descriptor per index to be built. */
	files = mem_alloc(num_of_keys * sizeof *files);

	k = 0;
	while (k < num_of_keys) {
		row_merge_file_create(&files[k]);
		k++;
	}

	/* Single pass over the clustered index fills all merge files. */
	error = row_merge_read_clustered_index(
		trx, old_table, index, files, num_of_keys);

	if (error != DB_SUCCESS) {

		goto func_exit;
	}

	trx_start_if_not_started(trx);

	/* Sort and load each index in turn; stop at the first failure. */
	for (k = 0; k < num_of_keys; k++) {
		merge_file_t*	mf = &files[k];

		if (mf->num_of_blocks > 0) {

			/* The sort reports its status through the
			last argument. */
			row_merge_sort_linked_list_in_disk(
				index[k], mf->file, (int *)&error);

			if (error == DB_SUCCESS) {
				error = row_merge_insert_index_tuples(
					trx, index[k], new_table,
					mf->file, 0);
			}

			if (error != DB_SUCCESS) {
				/* Remember which key failed. */
				trx->error_key_num = k;
				break;
			}
		}
	}

func_exit:
	mem_free(files);

	return(error);
}
#endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_HOTBACKUP */
/************************************************************************* /*************************************************************************
......
...@@ -141,7 +141,7 @@ row_build_index_entry( ...@@ -141,7 +141,7 @@ row_build_index_entry(
} }
/*********************************************************************** /***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a An inverse function to row_build_index_entry. Builds a row from a
record in a clustered index. */ record in a clustered index. */
dtuple_t* dtuple_t*
...@@ -256,6 +256,53 @@ row_build( ...@@ -256,6 +256,53 @@ row_build(
return(row); return(row);
} }
/***********************************************************************
Builds a typed data tuple from an index record whose offsets have
already been computed by the caller. The tuple is allocated from heap;
its fields reference the bytes of rec directly (no copy is made) and
info_bits are left for the caller to set. */

dtuple_t*
row_rec_to_index_entry_low(
/*=======================*/
				/* out, index entry built; does not
				set info_bits, and the data fields in
				the entry will point directly to rec */
	const rec_t*	rec,	/* in: record in the index */
	dict_index_t*	index,	/* in: index */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	mem_heap_t*	heap)	/* in: memory heap from which the memory
				needed is allocated */
{
	dtuple_t*	tuple;
	ulint		n_fields;
	ulint		field_no;

	ut_ad(rec && heap && index);

	/* The tuple gets exactly as many fields as the record has. */
	n_fields = rec_offs_n_fields(offsets);
	tuple = dtuple_create(heap, n_fields);

	dtuple_set_n_fields_cmp(tuple,
				dict_index_get_n_unique_in_tree(index));
	ut_ad(n_fields == dict_index_get_n_fields(index));

	dict_index_copy_types(tuple, index, n_fields);

	/* Point every tuple field at the corresponding record field. */
	field_no = 0;
	while (field_no < n_fields) {
		ulint		field_len;
		const byte*	data = rec_get_nth_field(rec, offsets,
							 field_no,
							 &field_len);

		dfield_set_data(dtuple_get_nth_field(tuple, field_no),
				data, field_len);
		field_no++;
	}

	ut_ad(dtuple_check_typed(tuple));

	return(tuple);
}
/*********************************************************************** /***********************************************************************
Converts an index record to a typed data tuple. NOTE that externally Converts an index record to a typed data tuple. NOTE that externally
stored (often big) fields are NOT copied to heap. */ stored (often big) fields are NOT copied to heap. */
...@@ -281,11 +328,6 @@ row_rec_to_index_entry( ...@@ -281,11 +328,6 @@ row_rec_to_index_entry(
needed is allocated */ needed is allocated */
{ {
dtuple_t* entry; dtuple_t* entry;
dfield_t* dfield;
ulint i;
const byte* field;
ulint len;
ulint rec_len;
byte* buf; byte* buf;
mem_heap_t* tmp_heap = NULL; mem_heap_t* tmp_heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint offsets_[REC_OFFS_NORMAL_SIZE];
...@@ -305,29 +347,12 @@ row_rec_to_index_entry( ...@@ -305,29 +347,12 @@ row_rec_to_index_entry(
rec_offs_make_valid(rec, index, offsets); rec_offs_make_valid(rec, index, offsets);
} }
rec_len = rec_offs_n_fields(offsets); entry = row_rec_to_index_entry_low(rec, index, offsets, heap);
entry = dtuple_create(heap, rec_len);
dtuple_set_n_fields_cmp(entry,
dict_index_get_n_unique_in_tree(index));
ut_ad(rec_len == dict_index_get_n_fields(index));
dict_index_copy_types(entry, index, rec_len);
dtuple_set_info_bits(entry, dtuple_set_info_bits(entry,
rec_get_info_bits(rec, rec_offs_comp(offsets))); rec_get_info_bits(rec, rec_offs_comp(offsets)));
for (i = 0; i < rec_len; i++) { if (UNIV_LIKELY_NULL(tmp_heap)) {
dfield = dtuple_get_nth_field(entry, i);
field = rec_get_nth_field(rec, offsets, i, &len);
dfield_set_data(dfield, field, len);
}
ut_ad(dtuple_check_typed(entry));
if (tmp_heap) {
mem_heap_free(tmp_heap); mem_heap_free(tmp_heap);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment