branches/zip: Fix a critical bug in fast index creation that could

corrupt the created indexes. row_merge(): Make "half" an in/out parameter. Determine the offset of half the output file. Copy the last blocks record-by-record instead of block-by-block, so that the records can be counted. Check that the input and output have matching n_rec. row_merge_sort(): Do not assume that two blocks of size N are merged into a block of size 2*N. The output block can be shorter than the input if the last page of each input block is almost empty. Use an accurate termination condition, based on the "half" computed by row_merge(). row_merge_read(), row_merge_write(), row_merge_blocks(): Add debug output. merge_file_t, row_merge_file_create(): Add n_rec, the number of records in the merge file. row_merge_read_clustered_index(): Update n_rec. row_merge_blocks(): Update and check n_rec. row_merge_blocks_copy(): New function, for copying the last blocks in row_merge(). Update and check n_rec. This bug was discovered with a user-supplied test case that creates an index where the initial temporary file is 249 one-megabyte blocks and the merged files become smaller. In the test, possible merge record sizes are 10, 18, and 26 bytes. rb://150 approved by Sunny Bains. This addresses Issue #320.

branches/zip: Fix a critical bug in fast index creation that could
corrupt the created indexes. row_merge(): Make "half" an in/out parameter. Determine the offset of half the output file. Copy the last blocks record-by-record instead of block-by-block, so that the records can be counted. Check that the input and output have matching n_rec. row_merge_sort(): Do not assume that two blocks of size N are merged into a block of size 2*N. The output block can be shorter than the input if the last page of each input block is almost empty. Use an accurate termination condition, based on the "half" computed by row_merge(). row_merge_read(), row_merge_write(), row_merge_blocks(): Add debug output. merge_file_t, row_merge_file_create(): Add n_rec, the number of records in the merge file. row_merge_read_clustered_index(): Update n_rec. row_merge_blocks(): Update and check n_rec. row_merge_blocks_copy(): New function, for copying the last blocks in row_merge(). Update and check n_rec. This bug was discovered with a user-supplied test case that creates an index where the initial temporary file is 249 one-megabyte blocks and the merged files become smaller. In the test, possible merge record sizes are 10, 18, and 26 bytes. rb://150 approved by Sunny Bains. This addresses Issue #320.
4e0ad399 · marko · 0f7895d4 · 4e0ad399 · 4e0ad399
Commit 4e0ad399 authored Aug 27, 2009 by marko
Hide whitespace changes
Inline Side-by-side

Showing with 183 additions and 32 deletions

ChangeLog ChangeLog +7 -0

row/row0merge.c row/row0merge.c +176 -32

No files found.
--- a/ChangeLog
+++ b/ChangeLog
+2009-08-27	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix a bug in the merge sort that can corrupt indexes in fast index
+	creation.  Add some consistency checks.  Check that the number of
+	records remains constant in every merge sort pass.
+
 2009-08-27	The InnoDB Team

 	* buf/buf0buf.c, buf/buf0lru.c, buf/buf0rea.c,

--- a/row/row0merge.c
+++ b/row/row0merge.c
@@ -60,9 +60,19 @@ Completed by Sunny Bains and Marko Makela
 #ifdef UNIV_DEBUG
 /** Set these in order ot enable debug printout. */
 /* @{ */
+/** Log the outcome of each row_merge_cmp() call, comparing records. */
 static ibool	row_merge_print_cmp;
+/** Log each record read from temporary file. */
 static ibool	row_merge_print_read;
+/** Log each record write to temporary file. */
 static ibool	row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool	row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool	row_merge_print_block_read;
+/** Log each block read from temporary file. */
+static ibool	row_merge_print_block_write;
 /* @} */
 #endif /* UNIV_DEBUG */

@@ -109,8 +119,9 @@ typedef struct row_merge_buf_struct row_merge_buf_t;

 /** Information about temporary files used in merge sort */
 struct merge_file_struct {
-	int	fd;		/*!< file descriptor */
-	ulint	offset;		/*!< file offset */
+	int		fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
 };

 /** Information about temporary files used in merge sort */
@@ -682,6 +693,13 @@ row_merge_read(
 	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
 	ibool		success;

+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_read) {
+		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
 	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
 						 (ulint) (ofs & 0xFFFFFFFF),
 						 (ulint) (ofs >> 32),
@@ -709,6 +727,13 @@ row_merge_write(
 	ib_uint64_t	ofs = ((ib_uint64_t) offset)
 		* sizeof(row_merge_block_t);

+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_write) {
+		fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
 	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
 					 (ulint) (ofs & 0xFFFFFFFF),
 					 (ulint) (ofs >> 32),
@@ -718,7 +743,7 @@ row_merge_write(
 /********************************************************************//**
 Read a merge record.
 @return	pointer to next record, or NULL on I/O error or end of list */
-static
+static __attribute__((nonnull))
 const byte*
 row_merge_read_rec(
 /*===============*/
@@ -1070,7 +1095,7 @@ row_merge_cmp(
 Reads clustered index of the table and create temporary files
 containing the index entries for the indexes to be built.
 @return	DB_SUCCESS or error */
-static
+static __attribute__((nonnull))
 ulint
 row_merge_read_clustered_index(
 /*===========================*/
@@ -1233,6 +1258,7 @@ row_merge_read_clustered_index(

 			if (UNIV_LIKELY
 			    (row && row_merge_buf_add(buf, row, ext))) {
+				file->n_rec++;
 				continue;
 			}

@@ -1274,14 +1300,19 @@ err_exit:
 			UNIV_MEM_INVALID(block[0], sizeof block[0]);
 			merge_buf[i] = row_merge_buf_empty(buf);

-			/* Try writing the record again, now that
-			the buffer has been written out and emptied. */
+			if (UNIV_LIKELY(row != NULL)) {
+				/* Try writing the record again, now
+				that the buffer has been written out
+				and emptied. */

-			if (UNIV_UNLIKELY
-			    (row && !row_merge_buf_add(buf, row, ext))) {
-				/* An empty buffer should have enough
-				room for at least one record. */
-				ut_error;
+				if (UNIV_UNLIKELY
+				    (!row_merge_buf_add(buf, row, ext))) {
+					/* An empty buffer should have enough
+					room for at least one record. */
+					ut_error;
+				}
+
+				file->n_rec++;
 			}
 		}

@@ -1320,7 +1351,7 @@ func_exit:
 		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
 					 of->fd, &of->offset,		\
 					 mrec##N, offsets##N);		\
-		if (UNIV_UNLIKELY(!b2)) {				\
+		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
 			goto corrupt;					\
 		}							\
 		b##N = row_merge_read_rec(&block[N], &buf[N],		\
@@ -1336,14 +1367,14 @@ func_exit:
 	} while (0)

 /*************************************************************//**
-Merge two blocks of linked lists on disk and write a bigger block.
+Merge two blocks of records on disk and write a bigger block.
 @return	DB_SUCCESS or error code */
 static
 ulint
 row_merge_blocks(
 /*=============*/
 	const dict_index_t*	index,	/*!< in: index being created */
-	merge_file_t*		file,	/*!< in/out: file containing
+	const merge_file_t*	file,	/*!< in: file containing
 					index entries */
 	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
 	ulint*			foffs0,	/*!< in/out: offset of first
@@ -1366,6 +1397,17 @@ row_merge_blocks(
 	ulint*		offsets0;/* offsets of mrec0 */
 	ulint*		offsets1;/* offsets of mrec1 */

+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		fprintf(stderr,
+			"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) *foffs0,
+			file->fd, (ulong) *foffs1,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
 	heap = row_merge_heap_create(index, &offsets0, &offsets1);

 	/* Write a record and read the next record.  Split the output
@@ -1437,17 +1479,88 @@ done1:
 	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
 }

+/*************************************************************//**
+Copy a block of index entries.
+@return	TRUE on success, FALSE on failure */
+static __attribute__((nonnull))
+ibool
+row_merge_blocks_copy(
+/*==================*/
+	const dict_index_t*	index,	/*!< in: index being created */
+	const merge_file_t*	file,	/*!< in: input file */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	ulint*			foffs0,	/*!< in/out: input file offset */
+	merge_file_t*		of)	/*!< in/out: output file */
+{
+	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
+
+	mrec_buf_t	buf[3];	/*!< buffer for handling
+				split mrec in block[] */
+	const byte*	b0;	/*!< pointer to block[0] */
+	byte*		b2;	/*!< pointer to block[2] */
+	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] */
+	ulint*		offsets0;/* offsets of mrec0 */
+	ulint*		offsets1;/* dummy offsets */
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		fprintf(stderr,
+			"row_merge_blocks_copy fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) foffs0,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	heap = row_merge_heap_create(index, &offsets0, &offsets1);
+
+	/* Write a record and read the next record.  Split the output
+	file in two halves, which can be merged on the following pass. */
+
+	if (!row_merge_read(file->fd, *foffs0, &block[0])) {
+corrupt:
+		mem_heap_free(heap);
+		return(FALSE);
+	}
+
+	b0 = block[0];
+	b2 = block[2];
+
+	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+				foffs0, &mrec0, offsets0);
+	if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+		goto corrupt;
+	}
+
+	if (mrec0) {
+		/* append all mrec0 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+		}
+	}
+done0:
+
+	/* The file offset points to the beginning of the last page
+	that has been read.  Update it to point to the next block. */
+	(*foffs0)++;
+
+	mem_heap_free(heap);
+	return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset)
+	       != NULL);
+}
+
 /*************************************************************//**
 Merge disk files.
 @return	DB_SUCCESS or error code */
-static
+static __attribute__((nonnull))
 ulint
 row_merge(
 /*======*/
 	const dict_index_t*	index,	/*!< in: index being created */
 	merge_file_t*		file,	/*!< in/out: file containing
 					index entries */
-	ulint			half,	/*!< in: half the file */
+	ulint*			half,	/*!< in/out: half the file */
 	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
 	int*			tmpfd,	/*!< in/out: temporary file handle */
 	TABLE*			table)	/*!< in/out: MySQL table, for
@@ -1458,43 +1571,76 @@ row_merge(
 	ulint		foffs1;	/*!< second input offset */
 	ulint		error;	/*!< error code */
 	merge_file_t	of;	/*!< output file */
+	const ulint	ihalf	= *half;
+				/*!< half the input file */
+	ulint		ohalf;	/*!< half the output file */

 	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
-	ut_ad(half > 0);
+	ut_ad(ihalf > 0);
+	ut_ad(ihalf < file->offset);

 	of.fd = *tmpfd;
 	of.offset = 0;
+	of.n_rec = 0;

 	/* Merge blocks to the output file. */
+	ohalf = 0;
 	foffs0 = 0;
-	foffs1 = half;
+	foffs1 = ihalf;
+
+	for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+		ulint	ahalf;	/*!< arithmetic half the input file */

-	for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
 		error = row_merge_blocks(index, file, block,
 					 &foffs0, &foffs1, &of, table);

 		if (error != DB_SUCCESS) {
 			return(error);
 		}
+
+		/* Record the offset of the output file when
+		approximately half the output has been generated.  In
+		this way, the next invocation of row_merge() will
+		spend most of the time in this loop.  The initial
+		estimate is ohalf==0. */
+		ahalf = file->offset / 2;
+		ut_ad(ohalf <= of.offset);
+
+		/* Improve the estimate until reaching half the input
+		file size, or we can not get any closer to it.  All
+		comparands should be non-negative when !(ohalf < ahalf)
+		because ohalf <= of.offset. */
+		if (ohalf < ahalf || of.offset - ahalf < ohalf - ahalf) {
+			ohalf = of.offset;
+		}
 	}

-	/* Copy the last block, if there is one. */
-	while (foffs0 < half) {
-		if (!row_merge_read(file->fd, foffs0++, block)
-		    || !row_merge_write(of.fd, of.offset++, block)) {
+	/* Copy the last blocks, if there are any. */
+
+	while (foffs0 < ihalf) {
+		if (!row_merge_blocks_copy(index, file, block, &foffs0, &of)) {
 			return(DB_CORRUPTION);
 		}
 	}
+
+	ut_ad(foffs0 == ihalf);
+
 	while (foffs1 < file->offset) {
-		if (!row_merge_read(file->fd, foffs1++, block)
-		    || !row_merge_write(of.fd, of.offset++, block)) {
+		if (!row_merge_blocks_copy(index, file, block, &foffs1, &of)) {
 			return(DB_CORRUPTION);
 		}
 	}

+	ut_ad(foffs1 == file->offset);
+
+	if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+		return(DB_CORRUPTION);
+	}
+
 	/* Swap file descriptors for the next pass. */
 	*tmpfd = file->fd;
 	*file = of;
+	*half = ohalf;

 	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);

@@ -1517,20 +1663,17 @@ row_merge_sort(
 					reporting erroneous key value
 					if applicable */
 {
-	ulint	blksz;	/*!< block size */
+	ulint	half = file->offset / 2;

-	for (blksz = 1; blksz < file->offset; blksz *= 2) {
-		ulint	half;
+	do {
 		ulint	error;

-		ut_ad(ut_is_2pow(blksz));
-		half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
-		error = row_merge(index, file, half, block, tmpfd, table);
+		error = row_merge(index, file, &half, block, tmpfd, table);

 		if (error != DB_SUCCESS) {
 			return(error);
 		}
-	}
+	} while (half < file->offset && half > 0);

 	return(DB_SUCCESS);
 }
@@ -1909,6 +2052,7 @@ row_merge_file_create(
 {
 	merge_file->fd = innobase_mysql_tmpfile();
 	merge_file->offset = 0;
+	merge_file->n_rec = 0;
 }

 /*********************************************************************//**