/******************************************************
New index creation routines using a merge sort

(c) 2005,2007 Innobase Oy

Created 12/4/2005 Jan Lindstrom
Completed by Sunny Bains and Marko Makela
*******************************************************/

#include "row0merge.h"
#include "row0ext.h"
#include "row0row.h"
#include "row0upd.h"
#include "row0ins.h"
#include "row0sel.h"
#include "dict0dict.h"
#include "dict0mem.h"
#include "dict0boot.h"
#include "dict0crea.h"
#include "dict0load.h"
#include "btr0btr.h"
#include "mach0data.h"
#include "trx0rseg.h"
#include "trx0trx.h"
#include "trx0roll.h"
#include "trx0undo.h"
#include "trx0purge.h"
#include "trx0rec.h"
#include "que0que.h"
#include "rem0cmp.h"
#include "read0read.h"
#include "os0file.h"
#include "lock0lock.h"
#include "data0data.h"
#include "data0type.h"
#include "que0que.h"
#include "pars0pars.h"
#include "mem0mem.h"
#include "log0log.h"
#include "ut0sort.h"
#include "handler0alter.h"

#ifdef UNIV_DEBUG
/* Set these in order to enable debug printout. */
static ibool	row_merge_print_cmp;	/* log record comparisons */
static ibool	row_merge_print_read;	/* log merge file reads */
static ibool	row_merge_print_write;	/* log merge file writes */
#endif /* UNIV_DEBUG */

50 51 52 53 54 55
/* Block size for I/O operations in merge sort.  The minimum is
UNIV_PAGE_SIZE, or page_get_free_space_of_empty() rounded to a power of 2.

When not creating a PRIMARY KEY that contains column prefixes, this
can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
ut_ad(data_size < sizeof(row_merge_block_t)). */

typedef byte	row_merge_block_t[1048576];

/* Secondary buffer for I/O operations of merge records.  This buffer
is used for writing or reading a record that spans two row_merge_block_t.
Thus, it must be able to hold one merge record, whose maximum size is
the same as the minimum size of row_merge_block_t. */

typedef byte	mrec_buf_t[UNIV_PAGE_SIZE];

/* Merge record in row_merge_block_t.  The format is the same as a
record in ROW_FORMAT=COMPACT with the exception that the
REC_N_NEW_EXTRA_BYTES are omitted. */
typedef byte	mrec_t;

/* Buffer for sorting in main memory.  Tuples are collected here until
the buffer would overflow one row_merge_block_t, then sorted and
flushed to a merge file; see row_merge_buf_add() and
row_merge_buf_write(). */
struct row_merge_buf_struct {
	mem_heap_t*	heap;		/* memory heap where allocated */
	dict_index_t*	index;		/* the index the tuples belong to */
	ulint		total_size;	/* total amount of data bytes */
	ulint		n_tuples;	/* number of data tuples */
	ulint		max_tuples;	/* maximum number of data tuples */
	const dfield_t**tuples;		/* array of pointers to
					arrays of fields that form
					the data tuples */
	const dfield_t**tmp_tuples;	/* temporary copy of tuples,
					for sorting */
};

typedef struct row_merge_buf_struct row_merge_buf_t;
86

87 88
/* Information about a temporary file used in merge sort is stored
in this structure. */

struct merge_file_struct {
	int	fd;		/* File descriptor */
	ulint	offset;		/* File offset, counted in units of
				sizeof(row_merge_block_t); see
				row_merge_read() and row_merge_write() */
};

typedef struct merge_file_struct merge_file_t;
96

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
#ifdef UNIV_DEBUG
/**********************************************************
Display a merge tuple (UNIV_DEBUG only).  Each field is printed
on its own line, truncated to at most 20 bytes of data; an
externally stored field is marked with a leading 'E'. */
static
void
row_merge_tuple_print(
/*==================*/
	FILE*		f,	/* in: output stream */
	const dfield_t*	entry,	/* in: tuple to print */
	ulint		n_fields)/* in: number of fields in the tuple */
{
	ulint	j;

	for (j = 0; j < n_fields; j++) {
		const dfield_t*	field = &entry[j];

		if (dfield_is_null(field)) {
			fputs("\n NULL;", f);
		} else {
			ulint	field_len	= dfield_get_len(field);
			ulint	len		= ut_min(field_len, 20);
			if (dfield_is_ext(field)) {
				fputs("\nE", f);
			} else {
				fputs("\n ", f);
			}
			ut_print_buf(f, dfield_get_data(field), len);
			if (len != field_len) {
				/* Cast to ulong for the %lu conversion,
				as done elsewhere in this file; ulint is
				not necessarily unsigned long on all
				platforms. */
				fprintf(f, " (total %lu bytes)",
					(ulong) field_len);
			}
		}
	}
	putc('\n', f);
}
#endif /* UNIV_DEBUG */

133 134
/**********************************************************
Allocate a sort buffer. */
135
static
136 137 138 139 140 141
row_merge_buf_t*
row_merge_buf_create_low(
/*=====================*/
					/* out,own: sort buffer */
	mem_heap_t*	heap,		/* in: heap where allocated */
	dict_index_t*	index,		/* in: secondary index */
142 143
	ulint		max_tuples,	/* in: maximum number of data tuples */
	ulint		buf_size)	/* in: size of the buffer, in bytes */
144
{
145 146
	row_merge_buf_t*	buf;

147 148 149 150
	ut_ad(max_tuples > 0);
	ut_ad(max_tuples <= sizeof(row_merge_block_t));
	ut_ad(max_tuples < buf_size);

151
	buf = mem_heap_zalloc(heap, buf_size);
152 153 154 155 156 157 158 159
	buf->heap = heap;
	buf->index = index;
	buf->max_tuples = max_tuples;
	buf->tuples = mem_heap_alloc(heap,
				     2 * max_tuples * sizeof *buf->tuples);
	buf->tmp_tuples = buf->tuples + max_tuples;

	return(buf);
160 161
}

162 163
/**********************************************************
Allocate a sort buffer. */
164
static
165 166
row_merge_buf_t*
row_merge_buf_create(
167
/*=================*/
168 169
				/* out,own: sort buffer */
	dict_index_t*	index)	/* in: secondary index */
170
{
171 172 173 174
	row_merge_buf_t*	buf;
	ulint			max_tuples;
	ulint			buf_size;
	mem_heap_t*		heap;
175

176 177
	max_tuples = sizeof(row_merge_block_t)
		/ ut_max(1, dict_index_get_min_size(index));
178

179
	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
180

181
	heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
182

183
	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
184

185
	return(buf);
186 187
}

188 189
/**********************************************************
Empty a sort buffer. */
190
static
191
row_merge_buf_t*
192 193
row_merge_buf_empty(
/*================*/
194 195
					/* out: sort buffer */
	row_merge_buf_t*	buf)	/* in,own: sort buffer */
196
{
197 198 199 200
	ulint		buf_size;
	ulint		max_tuples	= buf->max_tuples;
	mem_heap_t*	heap		= buf->heap;
	dict_index_t*	index		= buf->index;
201

202
	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
203

204
	mem_heap_empty(heap);
205

206
	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
207 208
}

209 210
/**********************************************************
Deallocate a sort buffer. */
static
void
row_merge_buf_free(
/*===============*/
	row_merge_buf_t*	buf)	/* in,own: sort buffer, to be freed */
{
	/* The buffer object, its tuple arrays and all copied field
	data live in buf->heap (see row_merge_buf_create_low()), so
	freeing the heap releases everything. */
	mem_heap_free(buf->heap);
}

220 221
/**********************************************************
Insert a data tuple into a sort buffer.  Builds an index entry from
the clustered index row, accounts for its on-disk size in the merge
block format, and deep-copies the field data into the buffer heap. */
static
ibool
row_merge_buf_add(
/*==============*/
					/* out: TRUE if added,
					FALSE if out of space */
	row_merge_buf_t*	buf,	/* in/out: sort buffer */
	const dtuple_t*		row,	/* in: row in clustered index */
	const row_ext_t*	ext)	/* in: cache of externally stored
					column prefixes, or NULL */
{
	ulint			i;
	ulint			n_fields;
	ulint			data_size;
	ulint			extra_size;
	const dict_index_t*	index;
	dfield_t*		entry;
	dfield_t*		field;

	if (buf->n_tuples >= buf->max_tuples) {
		return(FALSE);
	}

	UNIV_PREFETCH_R(row->fields);

	index = buf->index;

	n_fields = dict_index_get_n_fields(index);

	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
	buf->tuples[buf->n_tuples] = entry;
	field = entry;

	/* Track the merge record size: data_size counts the data
	bytes, extra_size the record header (NULL bitmap plus
	variable-length field length bytes). */
	data_size = 0;
	extra_size = UT_BITS_IN_BYTES(index->n_nullable);

	for (i = 0; i < n_fields; i++, field++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			col_no;
		const dfield_t*		row_field;
		ulint			len;

		ifield = dict_index_get_nth_field(index, i);
		col = ifield->col;
		col_no = dict_col_get_no(col);
		row_field = dtuple_get_nth_field(row, col_no);
		dfield_copy(field, row_field);
		len = dfield_get_len(field);

		if (dfield_is_null(field)) {
			ut_ad(!(col->prtype & DATA_NOT_NULL));
			continue;
		} else if (UNIV_LIKELY(!ext)) {
			/* No externally stored columns to consider. */
		} else if (dict_index_is_clust(index)) {
			/* Flag externally stored fields. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				if (i < dict_index_get_n_unique(index)) {
					/* Unique columns: use the cached
					prefix so the key can be compared. */
					dfield_set_data(field, buf, len);
				} else {
					dfield_set_ext(field);
					len = dfield_get_len(field);
				}
			}
		} else {
			/* Secondary index: substitute the cached
			prefix of the externally stored column. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				dfield_set_data(field, buf, len);
			}
		}

		/* If a column prefix index, take only the prefix */

		if (ifield->prefix_len) {
			len = dtype_get_at_most_n_mbchars(
				col->prtype,
				col->mbminlen, col->mbmaxlen,
				ifield->prefix_len,
				len, dfield_get_data(field));
			dfield_set_len(field, len);
		}

		ut_ad(len <= col->len || col->mtype == DATA_BLOB);

		if (ifield->fixed_len) {
			ut_ad(len == ifield->fixed_len);
			ut_ad(!dfield_is_ext(field));
		} else if (dfield_is_ext(field)) {
			extra_size += 2;
		} else if (len < 128
			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
			/* One length byte suffices. */
			extra_size++;
		} else {
			/* For variable-length columns, we look up the
			maximum length from the column itself.  If this
			is a prefix index column shorter than 256 bytes,
			this will waste one byte. */
			extra_size += 2;
		}
		data_size += len;
	}

#ifdef UNIV_DEBUG
	{
		/* Cross-check our size accounting against the
		canonical COMPACT-format size computation. */
		ulint	size;
		ulint	extra;

		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields, &extra);

		ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
		ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
	}
#endif /* UNIV_DEBUG */

	/* Add to the total size of the record in row_merge_block_t
	the encoded length of extra_size and the extra bytes (extra_size).
	See row_merge_buf_write() for the variable-length encoding
	of extra_size. */
	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);

	/* The following assertion may fail if row_merge_block_t is
	declared very small and a PRIMARY KEY is being created with
	many prefix columns.  In that case, the record may exceed the
	page_zip_rec_needs_ext() limit.  However, no further columns
	will be moved to external storage until the record is inserted
	to the clustered index B-tree. */
	ut_ad(data_size < sizeof(row_merge_block_t));

	/* Reserve one byte for the end marker of row_merge_block_t. */
	if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
		return(FALSE);
	}

	buf->total_size += data_size;
	buf->n_tuples++;

	field = entry;

	/* Copy the data fields.  Until now the fields pointed into
	the caller's row; dfield_dup() copies them into buf->heap so
	the entry survives after the cursor moves on. */

	do {
		dfield_dup(field++, buf->heap);
	} while (--n_fields);

	return(TRUE);
}

376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
/* Structure for reporting duplicate records.  Only the first
duplicate is converted and reported to MySQL; n_dup counts them
all (see row_merge_dup_report()). */
struct row_merge_dup_struct {
	const dict_index_t*	index;		/* index being sorted */
	TABLE*			table;		/* MySQL table object */
	ulint			n_dup;		/* number of duplicates */
};

typedef struct row_merge_dup_struct row_merge_dup_t;

/*****************************************************************
Report a duplicate key.  Converts the duplicate index entry to a
physical record and then to MySQL row format for error reporting. */
static
void
row_merge_dup_report(
/*=================*/
	row_merge_dup_t*	dup,	/* in/out: for reporting duplicates */
	const dfield_t*		entry)	/* in: duplicate index entry */
{
	mrec_buf_t		buf;	/* scratch space for the record */
	const dtuple_t*		tuple;
	dtuple_t		tuple_store;
	const rec_t*		rec;
	const dict_index_t*	index	= dup->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	mem_heap_t*		heap	= NULL;
	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*			offsets;
	ulint			n_ext;

	if (dup->n_dup++) {
		/* Only report the first duplicate record,
		but count all duplicate records. */
		return;
	}

	rec_offs_init(offsets_);

	/* Convert the tuple to a record and then to MySQL format. */

	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;

	rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
	offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
				  &heap);

	innobase_rec_to_mysql(dup->table, rec, index, offsets);

	/* rec_get_offsets() allocates a heap only if offsets_ was
	too small. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}

429
/*****************************************************************
430
Compare two tuples. */
431
static
432
int
433 434 435 436 437 438
row_merge_tuple_cmp(
/*================*/
					/* out: 1, 0, -1 if a is greater,
					equal, less, respectively, than b */
	ulint			n_field,/* in: number of fields */
	const dfield_t*		a,	/* in: first tuple to be compared */
439 440
	const dfield_t*		b,	/* in: second tuple to be compared */
	row_merge_dup_t*	dup)	/* in/out: for reporting duplicates */
441
{
442 443
	int		cmp;
	const dfield_t*	field	= a;
444

445 446 447
	do {
		cmp = cmp_dfield_dfield(a++, b++);
	} while (!cmp && --n_field);
448

449 450
	if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
		row_merge_dup_report(dup, field);
451
	}
452

453
	return(cmp);
454 455
}

456 457
/**************************************************************************
Merge sort the tuple buffer in main memory. */
static
void
row_merge_tuple_sort(
/*=================*/
	ulint			n_field,/* in: number of fields */
	row_merge_dup_t*	dup,	/* in/out: for reporting duplicates */
	const dfield_t**	tuples,	/* in/out: tuples */
	const dfield_t**	aux,	/* in/out: work area */
	ulint			low,	/* in: lower bound of the
					sorting area, inclusive */
	ulint			high)	/* in: upper bound of the
					sorting area, exclusive */
{
	/* Bind the extra context (n_field, dup) into the recursion
	and comparison callbacks expected by UT_SORT_FUNCTION_BODY. */
#define row_merge_tuple_sort_ctx(a,b,c,d) \
	row_merge_tuple_sort(n_field, dup, a, b, c, d)
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)

	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
}

479 480
/**********************************************************
Sort a buffer. */
static
void
row_merge_buf_sort(
/*===============*/
	row_merge_buf_t*	buf,	/* in/out: sort buffer */
	row_merge_dup_t*	dup)	/* in/out: for reporting duplicates */
{
	/* Sort on the unique columns only; ties on those columns are
	duplicates and are reported through dup (when non-NULL). */
	row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
			     buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
}

492 493
/**********************************************************
Write a buffer to a block.  Each record is preceded by its
extra_size + 1, encoded in one byte (< 0x80) or two bytes;
the chunk is terminated by a 0 byte (see row_merge_read_rec()). */
static
void
row_merge_buf_write(
/*================*/
	const row_merge_buf_t*	buf,	/* in: sorted buffer */
#ifdef UNIV_DEBUG
	const merge_file_t*	of,	/* in: output file */
#endif /* UNIV_DEBUG */
	row_merge_block_t*	block)	/* out: buffer for writing to file */
#ifndef UNIV_DEBUG
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
#endif /* !UNIV_DEBUG */
{
	const dict_index_t*	index	= buf->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	byte*			b	= &(*block)[0];

	ulint		i;

	for (i = 0; i < buf->n_tuples; i++) {
		ulint		size;
		ulint		extra_size;
		const dfield_t*	entry		= buf->tuples[i];

		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields,
						   &extra_size);
		ut_ad(size > extra_size);
		ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
		/* Merge records omit REC_N_NEW_EXTRA_BYTES. */
		extra_size -= REC_N_NEW_EXTRA_BYTES;
		size -= REC_N_NEW_EXTRA_BYTES;

		/* Encode extra_size + 1 */
		if (extra_size + 1 < 0x80) {
			*b++ = (byte) (extra_size + 1);
		} else {
			ut_ad((extra_size + 1) < 0x8000);
			*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
			*b++ = (byte) (extra_size + 1);
		}

		ut_ad(b + size < block[1]);

		rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
					       REC_STATUS_ORDINARY,
					       entry, n_fields);

		b += size;

#ifdef UNIV_DEBUG
		if (row_merge_print_write) {
			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
				(void*) b, of->fd, (ulong) of->offset,
				(ulong) i);
			row_merge_tuple_print(stderr, entry, n_fields);
		}
#endif /* UNIV_DEBUG */
	}

	/* Write an "end-of-chunk" marker. */
	ut_a(b < block[1]);
	ut_a(b == block[0] + buf->total_size);
	*b++ = 0;
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
			(void*) b, of->fd, (ulong) of->offset);
	}
#endif /* UNIV_DEBUG */
}

571 572
/**********************************************************
Create a memory heap and allocate space for row_merge_rec_offsets(). */
573
static
574 575 576
mem_heap_t*
row_merge_heap_create(
/*==================*/
577 578 579 580
						/* out: memory heap */
	const dict_index_t*	index,		/* in: record descriptor */
	ulint**			offsets1,	/* out: offsets */
	ulint**			offsets2)	/* out: offsets */
581
{
582
	ulint		i	= 1 + REC_OFFS_HEADER_SIZE
583 584
		+ dict_index_get_n_fields(index);
	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);
585

586 587
	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
588

589 590
	(*offsets1)[0] = (*offsets2)[0] = i;
	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
591

592
	return(heap);
593 594
}

595 596 597
/**************************************************************************
Search an index object by name and column names.  If several indexes match,
return the index with the max id. */
598
static
599 600 601 602 603 604 605
dict_index_t*
row_merge_dict_table_get_index(
/*===========================*/
						/* out: matching index,
						NULL if not found */
	dict_table_t*		table,		/* in: table */
	const merge_index_def_t*index_def)	/* in: index definition */
606
{
607 608 609
	ulint		i;
	dict_index_t*	index;
	const char**	column_names;
610

611
	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
612

613 614
	for (i = 0; i < index_def->n_fields; ++i) {
		column_names[i] = index_def->fields[i].field_name;
615 616
	}

617 618
	index = dict_table_get_index_by_max_id(
		table, index_def->name, column_names, index_def->n_fields);
619

620
	mem_free((void*) column_names);
621

622
	return(index);
623 624
}

625 626 627 628 629 630 631 632 633 634 635 636 637
/************************************************************************
Read a merge block from the file system. */
static
ibool
row_merge_read(
/*===========*/
					/* out: TRUE if request was
					successful, FALSE if fail */
	int			fd,	/* in: file descriptor */
	ulint			offset,	/* in: offset where to read */
	row_merge_block_t*	buf)	/* out: data */
{
	/* The offset is counted in whole blocks; convert to bytes
	in 64 bits to support files larger than 4 GiB. */
	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
	ibool		ok;

	ok = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
					    (ulint) (ofs & 0xFFFFFFFF),
					    (ulint) (ofs >> 32),
					    sizeof *buf);

	if (UNIV_UNLIKELY(!ok)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: failed to read merge block at %llu\n", ofs);
	}

	return(UNIV_LIKELY(ok));
}
652

653 654 655 656 657 658 659 660 661 662 663 664 665 666
/************************************************************************
Write a merge block to the file system. */
static
ibool
row_merge_write(
/*============*/
				/* out: TRUE if request was
				successful, FALSE if fail */
	int		fd,	/* in: file descriptor */
	ulint		offset,	/* in: offset where to write */
	const void*	buf)	/* in: data */
{
	/* The offset is counted in whole blocks; convert to bytes
	in 64 bits to support files larger than 4 GiB. */
	ib_uint64_t	ofs = ((ib_uint64_t) offset)
		* sizeof(row_merge_block_t);
	ibool		ok;

	ok = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
			   (ulint) (ofs & 0xFFFFFFFF),
			   (ulint) (ofs >> 32),
			   sizeof(row_merge_block_t));

	return(UNIV_LIKELY(ok));
}
673

674 675 676 677 678 679 680 681 682 683 684 685
/************************************************************************
Read a merge record.  The record starts with its encoded
extra_size + 1 (one byte if < 0x80, else two bytes); a 0 byte marks
the end of the list.  A record that spans two blocks is assembled
into *buf.  See row_merge_buf_write() for the on-disk format. */
static
const byte*
row_merge_read_rec(
/*===============*/
					/* out: pointer to next record,
					or NULL on I/O error
					or end of list */
	row_merge_block_t*	block,	/* in/out: file buffer */
	mrec_buf_t*		buf,	/* in/out: secondary buffer */
	const byte*		b,	/* in: pointer to record */
	const dict_index_t*	index,	/* in: index of the record */
	int			fd,	/* in: file descriptor */
	ulint*			foffs,	/* in/out: file offset */
	const mrec_t**		mrec,	/* out: pointer to merge record,
					or NULL on end of list
					(non-NULL on I/O error) */
	ulint*			offsets)/* out: offsets of mrec */
{
	ulint	extra_size;
	ulint	data_size;
	ulint	avail_size;

	ut_ad(block);
	ut_ad(buf);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(index);
	ut_ad(foffs);
	ut_ad(mrec);
	ut_ad(offsets);

	/* The offsets array must have been sized by
	row_merge_heap_create() for this index. */
	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
	      + dict_index_get_n_fields(index));

	extra_size = *b++;

	if (UNIV_UNLIKELY(!extra_size)) {
		/* End of list */
		*mrec = NULL;
#ifdef UNIV_DEBUG
		if (row_merge_print_read) {
			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
				(const void*) b, (const void*) block,
				fd, (ulong) *foffs);
		}
#endif /* UNIV_DEBUG */
		return(NULL);
	}

	if (extra_size >= 0x80) {
		/* Read another byte of extra_size. */

		if (UNIV_UNLIKELY(b >= block[1])) {
			if (!row_merge_read(fd, ++(*foffs), block)) {
err_exit:
				/* Signal I/O error. */
				*mrec = b;
				return(NULL);
			}

			/* Wrap around to the beginning of the buffer. */
			b = block[0];
		}

		extra_size = (extra_size & 0x7f) << 8;
		extra_size |= *b++;
	}

	/* Normalize extra_size.  Above, value 0 signals "end of list". */
	extra_size--;

	/* Read the extra bytes. */

	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
		/* The record spans two blocks.  Copy the entire record
		to the auxiliary buffer and handle this as a special
		case. */

		avail_size = block[1] - b;

		memcpy(*buf, b, avail_size);

		if (!row_merge_read(fd, ++(*foffs), block)) {

			goto err_exit;
		}

		/* Wrap around to the beginning of the buffer. */
		b = block[0];

		/* Copy the record. */
		memcpy(*buf + avail_size, b, extra_size - avail_size);
		b += extra_size - avail_size;

		*mrec = *buf + extra_size;

		rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

		data_size = rec_offs_data_size(offsets);

		/* These overflows should be impossible given that
		records are much smaller than either buffer, and
		the record starts near the beginning of each buffer. */
		ut_a(extra_size + data_size < sizeof *buf);
		ut_a(b + data_size < block[1]);

		/* Copy the data bytes. */
		memcpy(*buf + extra_size, b, data_size);
		b += data_size;

		goto func_exit;
	}

	*mrec = b + extra_size;

	rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

	data_size = rec_offs_data_size(offsets);
	ut_ad(extra_size + data_size < sizeof *buf);

	b += extra_size + data_size;

	if (UNIV_LIKELY(b < block[1])) {
		/* The record fits entirely in the block.
		This is the normal case. */
		goto func_exit;
	}

	/* The record spans two blocks.  Copy it to buf. */

	b -= extra_size + data_size;
	avail_size = block[1] - b;
	memcpy(*buf, b, avail_size);
	*mrec = *buf + extra_size;
	/* The offsets were computed on the in-block copy; revalidate
	them for the relocated record in *buf. */
	rec_offs_make_valid(*mrec, index, offsets);

	if (!row_merge_read(fd, ++(*foffs), block)) {

		goto err_exit;
	}

	/* Wrap around to the beginning of the buffer. */
	b = block[0];

	/* Copy the rest of the record. */
	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
	b += extra_size + data_size - avail_size;

func_exit:
#ifdef UNIV_DEBUG
	if (row_merge_print_read) {
		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
			(const void*) b, (const void*) block,
			fd, (ulong) *foffs);
		rec_print_comp(stderr, *mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	return(b);
}
837

838 839 840 841 842 843 844 845
/************************************************************************
Write a merge record into a buffer known to be large enough:
the encoded extra_size (e), then the record bytes.  In non-debug
builds the size/fd/foffs parameters are compiled out via the
macro below. */
static
void
row_merge_write_rec_low(
/*====================*/
	byte*		b,	/* out: buffer */
	ulint		e,	/* in: encoded extra_size */
#ifdef UNIV_DEBUG
	ulint		size,	/* in: total size to write */
	int		fd,	/* in: file descriptor */
	ulint		foffs,	/* in: file offset */
#endif /* UNIV_DEBUG */
	const mrec_t*	mrec,	/* in: record to write */
	const ulint*	offsets)/* in: offsets of mrec */
#ifndef UNIV_DEBUG
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
	row_merge_write_rec_low(b, e, mrec, offsets)
#endif /* !UNIV_DEBUG */
{
#ifdef UNIV_DEBUG
	const byte* const end = b + size;
	ut_ad(e == rec_offs_extra_size(offsets) + 1);

	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%d,%lu ",
			(void*) b, fd, (ulong) foffs);
		rec_print_comp(stderr, mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	/* Variable-length encoding of e: one byte if < 0x80,
	otherwise two bytes with the high bit set in the first. */
	if (e < 0x80) {
		*b++ = (byte) e;
	} else {
		*b++ = (byte) (0x80 | (e >> 8));
		*b++ = (byte) e;
	}

	/* The record bytes start rec_offs_extra_size() before mrec. */
	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
	ut_ad(b + rec_offs_size(offsets) == end);
}

/************************************************************************
Write a merge record, flushing the block to disk and wrapping around
when the record does not fit in the remainder of the block. */
static
byte*
row_merge_write_rec(
/*================*/
					/* out: pointer to end of block,
					or NULL on error */
	row_merge_block_t*	block,	/* in/out: file buffer */
	mrec_buf_t*		buf,	/* in/out: secondary buffer */
	byte*			b,	/* in: pointer to end of block */
	int			fd,	/* in: file descriptor */
	ulint*			foffs,	/* in/out: file offset */
	const mrec_t*		mrec,	/* in: record to write */
	const ulint*		offsets)/* in: offsets of mrec */
{
	ulint	extra_size;
	ulint	size;
	ulint	avail_size;

	ut_ad(block);
	ut_ad(buf);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(mrec);
	ut_ad(foffs);
	/* The record must not reside in either buffer being written. */
	ut_ad(mrec < block[0] || mrec > block[1]);
	ut_ad(mrec < buf[0] || mrec > buf[1]);

	/* Normalize extra_size.  Value 0 signals "end of list". */
	extra_size = rec_offs_extra_size(offsets) + 1;

	size = extra_size + (extra_size >= 0x80)
		+ rec_offs_data_size(offsets);

	if (UNIV_UNLIKELY(b + size >= block[1])) {
		/* The record spans two blocks.
		Copy it to the temporary buffer first. */
		avail_size = block[1] - b;

		row_merge_write_rec_low(buf[0],
					extra_size, size, fd, *foffs,
					mrec, offsets);

		/* Copy the head of the temporary buffer, write
		the completed block, and copy the tail of the
		record to the head of the new block. */
		memcpy(b, buf[0], avail_size);

		if (!row_merge_write(fd, (*foffs)++, block)) {
			return(NULL);
		}

		UNIV_MEM_INVALID(block[0], sizeof block[0]);

		/* Copy the rest. */
		b = block[0];
		memcpy(b, buf[0] + avail_size, size - avail_size);
		b += size - avail_size;
	} else {
		row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
					mrec, offsets);
		b += size;
	}

	return(b);
}

/************************************************************************
Write an end-of-list marker (a single 0 byte) and flush the block. */
static
byte*
row_merge_write_eof(
/*================*/
					/* out: pointer to end of block,
					or NULL on error */
	row_merge_block_t*	block,	/* in/out: file buffer */
	byte*			b,	/* in: pointer to end of block */
	int			fd,	/* in: file descriptor */
	ulint*			foffs)	/* in/out: file offset */
{
	ut_ad(block);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(foffs);
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
			(void*) b, (void*) block, fd, (ulong) *foffs);
	}
#endif /* UNIV_DEBUG */

	/* 0 is the "end of list" marker; see row_merge_read_rec(). */
	*b++ = 0;
	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */

	if (!row_merge_write(fd, (*foffs)++, block)) {
		return(NULL);
	}

	UNIV_MEM_INVALID(block[0], sizeof block[0]);
	return(block[0]);
}

/*****************************************************************
Compare two merge records. */
static
int
row_merge_cmp(
/*==========*/
						/* out: 1, 0, -1 if
						mrec1 is greater, equal, less,
						respectively, than mrec2 */
	const mrec_t*		mrec1,		/* in: first merge
						record to be compared */
	const mrec_t*		mrec2,		/* in: second merge
						record to be compared */
	const ulint*		offsets1,	/* in: first record offsets */
	const ulint*		offsets2,	/* in: second record offsets */
	const dict_index_t*	index)		/* in: index */
{
	/* Delegate the comparison to the generic COMPACT-record
	comparator. */
	int	result = cmp_rec_rec_simple(mrec1, mrec2,
					    offsets1, offsets2, index);

#ifdef UNIV_DEBUG
	if (row_merge_print_cmp) {
		fputs("row_merge_cmp1 ", stderr);
		rec_print_comp(stderr, mrec1, offsets1);
		fputs("\nrow_merge_cmp2 ", stderr);
		rec_print_comp(stderr, mrec2, offsets2);
		fprintf(stderr, "\nrow_merge_cmp=%d\n", result);
	}
#endif /* UNIV_DEBUG */

	return(result);
}

/************************************************************************
Reads clustered index of the table and create temporary files
containing the index entries for the indexes to be built.
Performs a single scan of the clustered index, buffering entries for
all n_index indexes at once, sorting and flushing each buffer to its
temporary file whenever the buffer fills up. */
static
ulint
row_merge_read_clustered_index(
/*===========================*/
					/* out: DB_SUCCESS or error */
	trx_t*			trx,	/* in: transaction */
	TABLE*			table,	/* in/out: MySQL table object,
					for reporting erroneous records */
	const dict_table_t*	old_table,/* in: table where rows are
					read from */
	const dict_table_t*	new_table,/* in: table where indexes are
					created; identical to old_table
					unless creating a PRIMARY KEY */
	dict_index_t**		index,	/* in: indexes to be created */
	merge_file_t*		files,	/* in: temporary files */
	ulint			n_index,/* in: number of indexes to create */
	row_merge_block_t*	block)	/* in/out: file buffer */
{
	dict_index_t*		clust_index;	/* Clustered index */
	mem_heap_t*		row_heap;	/* Heap memory to create
						clustered index records */
	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
	btr_pcur_t		pcur;		/* Persistent cursor on the
						clustered index */
	mtr_t			mtr;		/* Mini transaction */
	ulint			err = DB_SUCCESS;/* Return code */
	ulint			i;
	ulint			n_nonnull = 0;	/* number of columns
						changed to NOT NULL */
	ulint*			nonnull = NULL;	/* NOT NULL columns */

	trx->op_info = "reading clustered index";

	ut_ad(trx);
	ut_ad(old_table);
	ut_ad(new_table);
	ut_ad(index);
	ut_ad(files);

	/* Create and initialize memory for record buffers:
	one sort buffer per index being built. */

	merge_buf = mem_alloc(n_index * sizeof *merge_buf);

	for (i = 0; i < n_index; i++) {
		merge_buf[i] = row_merge_buf_create(index[i]);
	}

	mtr_start(&mtr);

	/* Find the clustered index and create a persistent cursor
	based on that. */

	clust_index = dict_table_get_first_index(old_table);

	btr_pcur_open_at_index_side(
		TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);

	if (UNIV_UNLIKELY(old_table != new_table)) {
		ulint	n_cols = dict_table_get_n_cols(old_table);

		/* A primary key will be created.  Identify the
		columns that were flagged NOT NULL in the new table,
		so that we can quickly check that the records in the
		(old) clustered index do not violate the added NOT
		NULL constraints. */

		ut_a(n_cols == dict_table_get_n_cols(new_table));

		nonnull = mem_alloc(n_cols * sizeof *nonnull);

		for (i = 0; i < n_cols; i++) {
			/* Skip columns that were already NOT NULL
			in the old table. */
			if (dict_table_get_nth_col(old_table, i)->prtype
			    & DATA_NOT_NULL) {

				continue;
			}

			/* Remember columns that become NOT NULL
			in the new table. */
			if (dict_table_get_nth_col(new_table, i)->prtype
			    & DATA_NOT_NULL) {

				nonnull[n_nonnull++] = i;
			}
		}

		if (!n_nonnull) {
			mem_free(nonnull);
			nonnull = NULL;
		}
	}

	row_heap = mem_heap_create(sizeof(mrec_buf_t));

	/* Scan the clustered index. */
	for (;;) {
		const rec_t*	rec;
		ulint*		offsets;
		dtuple_t*	row		= NULL;
		row_ext_t*	ext;
		ibool		has_next	= TRUE;

		btr_pcur_move_to_next_on_page(&pcur);

		/* When switching pages, commit the mini-transaction
		in order to release the latch on the old page. */

		if (btr_pcur_is_after_last_on_page(&pcur)) {
			btr_pcur_store_position(&pcur, &mtr);
			mtr_commit(&mtr);
			mtr_start(&mtr);
			btr_pcur_restore_position(BTR_SEARCH_LEAF,
						  &pcur, &mtr);
			has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
		}

		if (UNIV_LIKELY(has_next)) {
			rec = btr_pcur_get_rec(&pcur);
			offsets = rec_get_offsets(rec, clust_index, NULL,
						  ULINT_UNDEFINED, &row_heap);

			/* Skip delete marked records. */
			if (rec_get_deleted_flag(
				    rec, dict_table_is_comp(old_table))) {
				continue;
			}

			srv_n_rows_inserted++;

			/* Build a row based on the clustered index. */

			row = row_build(ROW_COPY_POINTERS, clust_index,
					rec, offsets,
					new_table, &ext, row_heap);

			if (UNIV_LIKELY_NULL(nonnull)) {
				/* Enforce the added NOT NULL
				constraints on the scanned row. */
				for (i = 0; i < n_nonnull; i++) {
					dfield_t*	field
						= &row->fields[nonnull[i]];
					dtype_t*	field_type
						= dfield_get_type(field);

					ut_a(!(field_type->prtype
					       & DATA_NOT_NULL));

					if (dfield_is_null(field)) {
						err = DB_PRIMARY_KEY_IS_NULL;
						/* Report the error for
						key number 0 (the new
						primary key). */
						i = 0;
						goto err_exit;
					}

					field_type->prtype |= DATA_NOT_NULL;
				}
			}
		}

		/* Build all entries for all the indexes to be created
		in a single scan of the clustered index. */

		for (i = 0; i < n_index; i++) {
			row_merge_buf_t*	buf	= merge_buf[i];
			merge_file_t*		file	= &files[i];
			const dict_index_t*	index	= buf->index;

			/* Common case: the entry fits in the buffer. */
			if (UNIV_LIKELY
			    (row && row_merge_buf_add(buf, row, ext))) {
				continue;
			}

			/* The buffer must be sufficiently large
			to hold at least one record. */
			ut_ad(buf->n_tuples || !has_next);

			/* We have enough data tuples to form a block.
			Sort them and write to disk. */

			if (buf->n_tuples) {
				if (dict_index_is_unique(index)) {
					/* Detect duplicates while
					sorting, so that a duplicate
					key error can be reported. */
					row_merge_dup_t	dup;
					dup.index = buf->index;
					dup.table = table;
					dup.n_dup = 0;

					row_merge_buf_sort(buf, &dup);

					if (dup.n_dup) {
						err = DB_DUPLICATE_KEY;
err_exit:
						trx->error_key_num = i;
						goto func_exit;
					}
				} else {
					row_merge_buf_sort(buf, NULL);
				}
			}

			row_merge_buf_write(buf, file, block);

			if (!row_merge_write(file->fd, file->offset++,
					     block)) {
				err = DB_OUT_OF_FILE_SPACE;
				goto err_exit;
			}

			/* Mark the block contents invalid for Valgrind
			until the next write fills it again. */
			UNIV_MEM_INVALID(block[0], sizeof block[0]);
			merge_buf[i] = row_merge_buf_empty(buf);

			/* Try writing the record again, now that
			the buffer has been written out and emptied. */

			if (UNIV_UNLIKELY
			    (row && !row_merge_buf_add(buf, row, ext))) {
				/* An empty buffer should have enough
				room for at least one record. */
				ut_error;
			}
		}

		mem_heap_empty(row_heap);

		if (UNIV_UNLIKELY(!has_next)) {
			goto func_exit;
		}
	}

func_exit:
	btr_pcur_close(&pcur);
	mtr_commit(&mtr);
	mem_heap_free(row_heap);

	if (UNIV_LIKELY_NULL(nonnull)) {
		mem_free(nonnull);
	}

	for (i = 0; i < n_index; i++) {
		row_merge_buf_free(merge_buf[i]);
	}

	mem_free(merge_buf);

	trx->op_info = "";

	return(err);
}
1269

1270 1271 1272 1273 1274 1275 1276
/*****************************************************************
Merge two blocks of linked lists on disk and write a bigger block.
Reads two sorted runs starting at *foffs0 and *foffs1, merges them
record by record, and appends the merged run to the output file. */
static
ulint
row_merge_blocks(
/*=============*/
					/* out: DB_SUCCESS or error code */
	const dict_index_t*	index,	/* in: index being created */
	merge_file_t*		file,	/* in/out: file containing
					index entries */
	row_merge_block_t*	block,	/* in/out: 3 buffers */
	ulint*			foffs0,	/* in/out: offset of first
					source list in the file */
	ulint*			foffs1,	/* in/out: offset of second
					source list in the file */
	merge_file_t*		of,	/* in/out: output file */
	TABLE*			table)	/* in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	mem_heap_t*	heap;	/* memory heap for offsets0, offsets1 */

	mrec_buf_t	buf[3];	/* buffer for handling split mrec in block[] */
	const byte*	b0;	/* pointer to block[0] */
	const byte*	b1;	/* pointer to block[1] */
	byte*		b2;	/* pointer to block[2] */
	const mrec_t*	mrec0;	/* merge rec, points to block[0] or buf[0] */
	const mrec_t*	mrec1;	/* merge rec, points to block[1] or buf[1] */
	ulint*		offsets0;/* offsets of mrec0 */
	ulint*		offsets1;/* offsets of mrec1 */

	heap = row_merge_heap_create(index, &offsets0, &offsets1);

	/* Write a record and read the next record.  Split the output
	file in two halves, which can be merged on the following pass. */
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
	do {								\
		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
					 of->fd, &of->offset,		\
					 mrec##N, offsets##N);		\
		if (UNIV_UNLIKELY(!b2)) {				\
			goto corrupt;					\
		}							\
		b##N = row_merge_read_rec(&block[N], &buf[N],		\
					  b##N, index,			\
					  file->fd, foffs##N,		\
					  &mrec##N, offsets##N);	\
		if (UNIV_UNLIKELY(!b##N)) {				\
			if (mrec##N) {					\
				goto corrupt;				\
			}						\
			AT_END;						\
		}							\
	} while (0)

	/* Fetch the first block of each input run. */
	if (!row_merge_read(file->fd, *foffs0, &block[0])
	    || !row_merge_read(file->fd, *foffs1, &block[1])) {
corrupt:
		mem_heap_free(heap);
		return(DB_CORRUPTION);
	}

	b0 = block[0];
	b1 = block[1];
	b2 = block[2];

	/* Read the first record of each input run.  A NULL return with
	a non-NULL mrec means an I/O or format error; NULL with NULL
	mrec means the run is empty. */
	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
				foffs0, &mrec0, offsets0);
	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
				foffs1, &mrec1, offsets1);
	if (UNIV_UNLIKELY(!b0 && mrec0)
	    || UNIV_UNLIKELY(!b1 && mrec1)) {

		goto corrupt;
	}

	/* Standard two-way merge: emit the smaller record and advance
	the run it came from, until one run is exhausted. */
	while (mrec0 && mrec1) {
		switch (row_merge_cmp(mrec0, mrec1,
				      offsets0, offsets1, index)) {
		case 0:
			/* Equal keys: for a unique index this is a
			duplicate-key error; report the offending
			record back to MySQL. */
			if (UNIV_UNLIKELY
			    (dict_index_is_unique(index))) {
				innobase_rec_to_mysql(table, mrec0,
						      index, offsets0);
				mem_heap_free(heap);
				return(DB_DUPLICATE_KEY);
			}
			/* fall through */
		case -1:
			ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
			break;
		case 1:
			ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
			break;
		default:
			ut_error;
		}

	}

merged:
	if (mrec0) {
		/* append all mrec0 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
		}
	}
done0:
	if (mrec1) {
		/* append all mrec1 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
		}
	}
done1:

	mem_heap_free(heap);
	/* Terminate the output run with an end-of-list marker. */
	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
}
1390

1391 1392 1393 1394 1395 1396
/*****************************************************************
Merge disk files. */
static
ulint
row_merge(
/*======*/
1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
					/* out: DB_SUCCESS or error code */
	const dict_index_t*	index,	/* in: index being created */
	merge_file_t*		file,	/* in/out: file containing
					index entries */
	ulint			half,	/* in: half the file */
	row_merge_block_t*	block,	/* in/out: 3 buffers */
	int*			tmpfd,	/* in/out: temporary file handle */
	TABLE*			table)	/* in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
1407
{
1408 1409
	ulint		foffs0;	/* first input offset */
	ulint		foffs1;	/* second input offset */
1410 1411
	ulint		error;	/* error code */
	merge_file_t	of;	/* output file */
1412

1413
	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
1414
	ut_ad(half > 0);
1415

1416 1417
	of.fd = *tmpfd;
	of.offset = 0;
1418

1419
	/* Merge blocks to the output file. */
1420 1421
	foffs0 = 0;
	foffs1 = half;
1422

1423
	for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
1424
		error = row_merge_blocks(index, file, block,
1425
					 &foffs0, &foffs1, &of, table);
1426

1427 1428
		if (error != DB_SUCCESS) {
			return(error);
1429
		}
1430
	}
1431

1432
	/* Copy the last block, if there is one. */
1433 1434 1435 1436 1437 1438
	while (foffs0 < half) {
		if (!row_merge_read(file->fd, foffs0++, block)
		    || !row_merge_write(of.fd, of.offset++, block)) {
			return(DB_CORRUPTION);
		}
	}
1439 1440 1441
	while (foffs1 < file->offset) {
		if (!row_merge_read(file->fd, foffs1++, block)
		    || !row_merge_write(of.fd, of.offset++, block)) {
1442
			return(DB_CORRUPTION);
1443
		}
1444 1445
	}

1446 1447 1448
	/* Swap file descriptors for the next pass. */
	*tmpfd = file->fd;
	*file = of;
1449

1450 1451
	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);

1452 1453
	return(DB_SUCCESS);
}
1454

1455 1456 1457 1458 1459 1460
/*****************************************************************
Merge disk files. */
static
ulint
row_merge_sort(
/*===========*/
1461 1462 1463 1464 1465 1466 1467 1468 1469
					/* out: DB_SUCCESS or error code */
	const dict_index_t*	index,	/* in: index being created */
	merge_file_t*		file,	/* in/out: file containing
					index entries */
	row_merge_block_t*	block,	/* in/out: 3 buffers */
	int*			tmpfd,	/* in/out: temporary file handle */
	TABLE*			table)	/* in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
1470 1471
{
	ulint	blksz;	/* block size */
1472

1473
	for (blksz = 1; blksz < file->offset; blksz *= 2) {
1474 1475 1476
		ulint	half;
		ulint	error;

1477 1478
		ut_ad(ut_is_2pow(blksz));
		half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
1479
		error = row_merge(index, file, half, block, tmpfd, table);
1480

1481 1482 1483 1484
		if (error != DB_SUCCESS) {
			return(error);
		}
	}
1485

1486
	return(DB_SUCCESS);
1487 1488
}

1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514
/*****************************************************************
Copy externally stored columns to the data tuple. */
static
void
row_merge_copy_blobs(
/*=================*/
	const mrec_t*	mrec,	/* in: merge record */
	const ulint*	offsets,/* in: offsets of mrec */
	ulint		zip_size,/* in: compressed page size in bytes, or 0 */
	dtuple_t*	tuple,	/* in/out: data tuple */
	mem_heap_t*	heap)	/* in/out: memory heap */
{
	ulint	field_no;
	ulint	n_fields = dtuple_get_n_fields(tuple);

	for (field_no = 0; field_no < n_fields; field_no++) {
		dfield_t*	field
			= dtuple_get_nth_field(tuple, field_no);

		if (dfield_is_ext(field)) {
			const void*	blob_data;
			ulint		blob_len;

			/* An externally stored field cannot be NULL. */
			ut_ad(!dfield_is_null(field));

			/* The table is locked during index creation.
			Therefore, externally stored columns cannot
			possibly be freed between the time the BLOB
			pointers are read
			(row_merge_read_clustered_index()) and
			dereferenced (below). */
			blob_data = btr_rec_copy_externally_stored_field(
				mrec, offsets, zip_size, field_no,
				&blob_len, heap);

			dfield_set_data(field, blob_data, blob_len);
		}
	}
}

1527 1528 1529
/************************************************************************
Read sorted file containing index data tuples and insert these data
tuples to the index.  Each record is converted back to an index entry
(re-fetching any externally stored columns) and inserted through a
dummy insert query graph. */
static
ulint
row_merge_insert_index_tuples(
/*==========================*/
					/* out: DB_SUCCESS or error number */
	trx_t*			trx,	/* in: transaction */
	dict_index_t*		index,	/* in: index */
	dict_table_t*		table,	/* in: new table */
	ulint			zip_size,/* in: compressed page size of
					 the old table, or 0 if uncompressed */
	int			fd,	/* in: file descriptor */
	row_merge_block_t*	block)	/* in/out: file buffer */
{
	mrec_buf_t		buf;	/* buffer for records that span
					two blocks */
	const byte*		b;	/* read pointer within block */
	que_thr_t*		thr;
	ins_node_t*		node;
	mem_heap_t*		tuple_heap;	/* per-record scratch heap */
	mem_heap_t*		graph_heap;	/* heap for the query graph */
	ulint			error = DB_SUCCESS;
	ulint			foffs = 0;	/* file offset, in blocks */
	ulint*			offsets;	/* record offsets array */

	ut_ad(trx);
	ut_ad(index);
	ut_ad(table);

	/* We use the insert query graph as the dummy graph
	needed in the row module call */

	trx->op_info = "inserting index entries";

	graph_heap = mem_heap_create(500);
	node = ins_node_create(INS_DIRECT, table, graph_heap);

	thr = pars_complete_graph_for_exec(node, trx, graph_heap);

	que_thr_move_to_run_state_for_mysql(thr, trx);

	tuple_heap = mem_heap_create(1000);

	{
		/* Pre-allocate the offsets array once; the number of
		fields per record is fixed for this index. */
		ulint i	= 1 + REC_OFFS_HEADER_SIZE
			+ dict_index_get_n_fields(index);
		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
		offsets[0] = i;
		offsets[1] = dict_index_get_n_fields(index);
	}

	b = *block;

	if (!row_merge_read(fd, foffs, block)) {
		error = DB_CORRUPTION;
	} else {
		for (;;) {
			const mrec_t*	mrec;
			dtuple_t*	dtuple;
			ulint		n_ext;

			b = row_merge_read_rec(block, &buf, b, index,
					       fd, &foffs, &mrec, offsets);
			if (UNIV_UNLIKELY(!b)) {
				/* End of list, or I/O error */
				if (mrec) {
					error = DB_CORRUPTION;
				}
				break;
			}

			dtuple = row_rec_to_index_entry_low(
				mrec, index, offsets, &n_ext, tuple_heap);

			/* Fetch externally stored columns, if any. */
			if (UNIV_UNLIKELY(n_ext)) {
				row_merge_copy_blobs(mrec, offsets, zip_size,
						     dtuple, tuple_heap);
			}

			node->row = dtuple;
			node->table = table;
			node->trx_id = trx->id;

			ut_ad(dtuple_validate(dtuple));

			/* Insert, retrying after lock waits that
			row_mysql_handle_errors() resolves. */
			do {
				thr->run_node = thr;
				thr->prev_node = thr->common.parent;

				error = row_ins_index_entry(index, dtuple,
							    0, FALSE, thr);

				if (UNIV_LIKELY(error == DB_SUCCESS)) {

					goto next_rec;
				}

				thr->lock_state = QUE_THR_LOCK_ROW;
				trx->error_state = error;
				que_thr_stop_for_mysql(thr);
				thr->lock_state = QUE_THR_LOCK_NOLOCK;
			} while (row_mysql_handle_errors(&error, trx,
							 thr, NULL));

			/* Unrecoverable error: bail out with the
			error code set by the handler above. */
			goto err_exit;
next_rec:
			mem_heap_empty(tuple_heap);
		}
	}

	que_thr_stop_for_mysql_no_error(thr, trx);
err_exit:
	que_graph_free(thr->graph);

	trx->op_info = "";

	mem_heap_free(tuple_heap);

	return(error);
}

1649 1650
/*************************************************************************
Sets an exclusive lock on a table, for the duration of creating indexes.
Blocks (and retries) until the lock is granted or an unrecoverable
error occurs. */
UNIV_INTERN
ulint
row_merge_lock_table(
/*=================*/
					/* out: error code or DB_SUCCESS */
	trx_t*		trx,		/* in/out: transaction */
	dict_table_t*	table,		/* in: table to lock */
	enum lock_mode	mode)		/* in: LOCK_X or LOCK_S */
{
	mem_heap_t*	heap;
	que_thr_t*	thr;
	ulint		err;
	sel_node_t*	node;

	ut_ad(trx);
	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
	ut_ad(mode == LOCK_X || mode == LOCK_S);

	heap = mem_heap_create(512);

	trx->op_info = "setting table lock for creating or dropping index";

	node = sel_node_create(heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);
	thr->graph->state = QUE_FORK_ACTIVE;

	/* We use the select query graph as the dummy graph needed
	in the lock module call */

	thr = que_fork_get_first_thr(que_node_get_parent(thr));
	que_thr_move_to_run_state_for_mysql(thr, trx);

run_again:
	thr->run_node = thr;
	thr->prev_node = thr->common.parent;

	err = lock_table(0, table, mode, thr);

	trx->error_state = err;

	if (UNIV_LIKELY(err == DB_SUCCESS)) {
		que_thr_stop_for_mysql_no_error(thr, trx);
	} else {
		que_thr_stop_for_mysql(thr);

		if (err != DB_QUE_THR_SUSPENDED) {
			/* A lock wait (or error) occurred; let the
			generic handler decide whether to retry. */
			ibool	was_lock_wait;

			was_lock_wait = row_mysql_handle_errors(
				&err, trx, thr, NULL);

			if (was_lock_wait) {
				goto run_again;
			}
		} else {
			que_thr_t*	run_thr;
			que_node_t*	parent;

			parent = que_node_get_parent(thr);
			run_thr = que_fork_start_command(parent);

			ut_a(run_thr == thr);

			/* There was a lock wait but the thread was not
			in a ready to run or running state. */
			trx->error_state = DB_LOCK_WAIT;

			goto run_again;
		}
	}

	/* que_graph_free() also frees "heap", which the graph owns. */
	que_graph_free(thr->graph);
	trx->op_info = "";

	return(err);
}

1728
/*************************************************************************
Drop an index from the InnoDB system tables.  The data dictionary must
have been locked exclusively by the caller, because the transaction
will not be committed. */
UNIV_INTERN
void
row_merge_drop_index(
/*=================*/
	dict_index_t*	index,	/* in: index to be removed */
	dict_table_t*	table,	/* in: table */
	trx_t*		trx)	/* in: transaction handle */
{
	ulint		err;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase. Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */

	static const char str1[] =
		"PROCEDURE DROP_INDEX_PROC () IS\n"
		"BEGIN\n"
		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
		"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
		"		AND TABLE_ID = :tableid;\n"
		"END;\n";

	ut_ad(index && table && trx);

	/* Bind the index and table IDs to the SQL procedure. */
	pars_info_add_dulint_literal(info, "indexid", index->id);
	pars_info_add_dulint_literal(info, "tableid", table->id);

	trx_start_if_not_started(trx);
	trx->op_info = "dropping index";

	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	err = que_eval_sql(info, str1, FALSE, trx);

	/* Dictionary deletes are not expected to fail here; the
	caller holds the dictionary X-latch. */
	ut_a(err == DB_SUCCESS);

	/* Replace this index with another equivalent index for all
	foreign key constraints on this table where this index is used */

	dict_table_replace_index_in_foreign_list(table, index);
	dict_index_remove_from_cache(table, index);

	trx->op_info = "";
}
1778

marko's avatar
marko committed
1779
/*************************************************************************
1780 1781 1782 1783
Drop those indexes which were created before an error occurred when
building an index.  The data dictionary must have been locked
exclusively by the caller, because the transaction will not be
committed. */
1784
UNIV_INTERN
marko's avatar
marko committed
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
void
row_merge_drop_indexes(
/*===================*/
	trx_t*		trx,		/* in: transaction */
	dict_table_t*	table,		/* in: table containing the indexes */
	dict_index_t**	index,		/* in: indexes to drop */
	ulint		num_created)	/* in: number of elements in index[] */
{
	ulint	key_num;

	for (key_num = 0; key_num < num_created; key_num++) {
		row_merge_drop_index(index[key_num], table, trx);
	}
1798 1799
}

1800 1801
/*************************************************************************
Drop all partially created indexes during crash recovery.  Partially
created indexes are identified by the TEMP_INDEX_PREFIX ('\377') byte
at the start of their name. */
UNIV_INTERN
void
row_merge_drop_temp_indexes(void)
/*=============================*/
{
	trx_t*		trx;
	ulint		err;

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase. Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */
#if TEMP_INDEX_PREFIX != '\377'
# error "TEMP_INDEX_PREFIX != '\377'"
#endif
	static const char drop_temp_indexes[] =
		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
		"indexid CHAR;\n"
		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
		"WHERE SUBSTR(NAME,0,1)='\377' FOR UPDATE;\n"
		"BEGIN\n"
		"\tOPEN c;\n"
		"\tWHILE 1=1 LOOP\n"
		"\t\tFETCH c INTO indexid;\n"
		"\t\tIF (SQL % NOTFOUND) THEN\n"
		"\t\t\tEXIT;\n"
		"\t\tEND IF;\n"
		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
		"\t\tDELETE FROM SYS_INDEXES WHERE CURRENT OF c;\n"
		"\tEND LOOP;\n"
		"\tCLOSE c;\n"
		"\tCOMMIT WORK;\n"
		"END;\n";

	/* Run in a background transaction under the dictionary mutex,
	since there is no user connection during crash recovery. */
	trx = trx_allocate_for_background();
	trx->op_info = "dropping partially created indexes";
	row_mysql_lock_data_dictionary(trx);

	err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
	ut_a(err == DB_SUCCESS);

	row_mysql_unlock_data_dictionary(trx);
	trx_free_for_background(trx);
}

1847
/*************************************************************************
1848 1849
Create a merge file. */
static
1850 1851 1852 1853
void
row_merge_file_create(
/*==================*/
	merge_file_t*	merge_file)	/* out: merge file structure */
1854
{
1855
	merge_file->fd = innobase_mysql_tmpfile();
1856
	merge_file->offset = 0;
1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870
}

/*************************************************************************
Destroy a merge file. */
static
void
row_merge_file_destroy(
/*===================*/
	merge_file_t*	merge_file)	/* out: merge file structure */
{
	if (merge_file->fd == -1) {
		/* Never created, or already destroyed. */
		return;
	}

	close(merge_file->fd);
	merge_file->fd = -1;
}

/*************************************************************************
Determine the precise type of a column that is added to the temporary
table used when creating a PRIMARY KEY: a column that is part of the
new primary key must be constrained NOT NULL. */
UNIV_INLINE
ulint
row_merge_col_prtype(
/*=================*/
						/* out: col->prtype, possibly
						ORed with DATA_NOT_NULL */
	const dict_col_t*	col,		/* in: column */
	const char*		col_name,	/* in: name of the column */
	const merge_index_def_t*index_def)	/* in: the index definition
						of the primary key */
{
	ulint	field_no;

	ut_ad(index_def->ind_type & DICT_CLUSTERED);

	/* Already NOT NULL: nothing to add. */
	if (col->prtype & DATA_NOT_NULL) {

		return(col->prtype);
	}

	/* All columns that are included
	in the PRIMARY KEY must be NOT NULL. */

	for (field_no = 0; field_no < index_def->n_fields; field_no++) {
		if (!strcmp(col_name,
			    index_def->fields[field_no].field_name)) {
			return(col->prtype | DATA_NOT_NULL);
		}
	}

	return(col->prtype);
}

/*************************************************************************
Create a temporary table for creating a primary key, using the definition
of an existing table. */
UNIV_INTERN
dict_table_t*
row_merge_create_temporary_table(
/*=============================*/
						/* out: table,
						or NULL on error */
	const char*		table_name,	/* in: new table name */
	const merge_index_def_t*index_def,	/* in: the index definition
						of the primary key */
	const dict_table_t*	table,		/* in: old table definition */
	trx_t*			trx)		/* in/out: transaction
						(sets error_state) */
{
	ulint		i;
	dict_table_t*	new_table = NULL;
	ulint		n_cols = dict_table_get_n_user_cols(table);
	ulint		error;
	mem_heap_t*	heap = mem_heap_create(1000);

	ut_ad(table_name);
	ut_ad(index_def);
	ut_ad(table);
	ut_ad(mutex_own(&dict_sys->mutex));

	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);

	/* Copy every user column from the old table, adding the
	NOT NULL flag for columns of the new primary key. */
	for (i = 0; i < n_cols; i++) {
		const dict_col_t*	col;
		const char*		col_name;

		col = dict_table_get_nth_col(table, i);
		col_name = dict_table_get_col_name(table, i);

		dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
				       row_merge_col_prtype(col, col_name,
							    index_def),
				       col->len);
	}

	error = row_create_table_for_mysql(new_table, trx);
	mem_heap_free(heap);

	if (error != DB_SUCCESS) {
		trx->error_state = error;
		/* NOTE(review): assumes row_create_table_for_mysql()
		does not free new_table on failure — verify, or this
		would be a double free. */
		dict_mem_table_free(new_table);
		new_table = NULL;
	}

	return(new_table);
}

/*************************************************************************
Rename the temporary indexes in the dictionary to permanent ones.  The
data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed.  Renaming strips the
TEMP_INDEX_PREFIX byte from the front of each index name, both in the
SYS_INDEXES system table and in the dictionary cache. */
UNIV_INTERN
ulint
row_merge_rename_indexes(
/*=====================*/
					/* out: DB_SUCCESS if all OK */
	trx_t*		trx,		/* in/out: transaction */
	dict_table_t*	table)		/* in/out: table with new indexes */
{
	ulint		err = DB_SUCCESS;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in renaming indexes. */

#if TEMP_INDEX_PREFIX != '\377'
# error "TEMP_INDEX_PREFIX != '\377'"
#endif

	static const char rename_indexes[] =
		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
		"BEGIN\n"
		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
		"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='\377';\n"
		"END;\n";

	ut_ad(table);
	ut_ad(trx);
	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	trx->op_info = "renaming indexes";

	pars_info_add_dulint_literal(info, "tableid", table->id);

	err = que_eval_sql(info, rename_indexes, FALSE, trx);

	if (err == DB_SUCCESS) {
		/* Keep the dictionary cache in sync with the system
		tables: skip the prefix byte of each in-memory name. */
		dict_index_t*	index = dict_table_get_first_index(table);
		do {
			if (*index->name == TEMP_INDEX_PREFIX) {
				index->name++;
			}
			index = dict_table_get_next_index(index);
		} while (index);
	}

	trx->op_info = "";

	return(err);
}

2017
/*************************************************************************
2018 2019 2020
Rename the tables in the data dictionary.  The data dictionary must
have been locked exclusively by the caller, because the transaction
will not be committed. */
2021
UNIV_INTERN
2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038
ulint
row_merge_rename_tables(
/*====================*/
					/* out: error code or DB_SUCCESS */
	dict_table_t*	old_table,	/* in/out: old table, renamed to
					tmp_name */
	dict_table_t*	new_table,	/* in/out: new table, renamed to
					old_table->name */
	const char*	tmp_name,	/* in: new name for old_table */
	trx_t*		trx)		/* in: transaction handle */
{
	ulint		err	= DB_ERROR;
	pars_info_t*	info;
	const char*	old_name= old_table->name;

	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
	ut_ad(old_table != new_table);
2039
	ut_ad(mutex_own(&dict_sys->mutex));
2040

2041 2042
	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091
	trx->op_info = "renaming tables";

	/* We use the private SQL parser of Innobase to generate the query
	graphs needed in updating the dictionary data in system tables. */

	info = pars_info_create();

	pars_info_add_str_literal(info, "new_name", new_table->name);
	pars_info_add_str_literal(info, "old_name", old_name);
	pars_info_add_str_literal(info, "tmp_name", tmp_name);

	err = que_eval_sql(info,
			   "PROCEDURE RENAME_TABLES () IS\n"
			   "BEGIN\n"
			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
			   " WHERE NAME = :old_name;\n"
			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
			   " WHERE NAME = :new_name;\n"
			   "END;\n", FALSE, trx);

	if (err != DB_SUCCESS) {

		goto err_exit;
	}

	/* The following calls will also rename the .ibd data files if
	the tables are stored in a single-table tablespace */

	if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
	    || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {

		err = DB_ERROR;
		goto err_exit;
	}

	err = dict_load_foreigns(old_name, TRUE);

	if (err != DB_SUCCESS) {
err_exit:
		trx->error_state = DB_SUCCESS;
		trx_general_rollback_for_mysql(trx, FALSE, NULL);
		trx->error_state = DB_SUCCESS;
	}

	trx->op_info = "";

	return(err);
}

marko's avatar
marko committed
2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128
/*************************************************************************
Create and execute a query graph for creating an index. */
static
ulint
row_merge_create_index_graph(
/*=========================*/
					/* out: DB_SUCCESS or error code */
	trx_t*		trx,		/* in: trx */
	dict_table_t*	table,		/* in: table */
	dict_index_t*	index)		/* in: index */
{
	ind_node_t*	node;		/* Index creation node */
	mem_heap_t*	heap;		/* Memory heap */
	que_thr_t*	thr;		/* Query thread */
	ulint		err;

	ut_ad(trx);
	ut_ad(table);
	ut_ad(index);

	heap = mem_heap_create(512);

	index->table = table;
	node = ind_create_graph_create(index, heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);

	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));

	que_run_threads(thr);

	err = trx->error_state;

	que_graph_free((que_t*) que_node_get_parent(thr));

	return(err);
}

2129
/*************************************************************************
2130
Create the index and load in to the dictionary. */
2131
UNIV_INTERN
2132
dict_index_t*
2133 2134
row_merge_create_index(
/*===================*/
2135 2136
					/* out: index, or NULL on error */
	trx_t*		trx,		/* in/out: trx (sets error_state) */
2137 2138 2139 2140
	dict_table_t*	table,		/* in: the index is on this table */
	const merge_index_def_t*	/* in: the index definition */
			index_def)
{
2141
	dict_index_t*	index;
2142
	ulint		err;
2143
	ulint		n_fields = index_def->n_fields;
2144
	ulint		i;
2145 2146 2147

	/* Create the index prototype, using the passed in def, this is not
	a persistent operation. We pass 0 as the space id, and determine at
2148
	a lower level the space id where to store the table. */
2149

2150 2151
	index = dict_mem_index_create(table->name, index_def->name,
				      0, index_def->ind_type, n_fields);
2152

2153
	ut_a(index);
2154

2155 2156
	for (i = 0; i < n_fields; i++) {
		merge_index_field_t*	ifield = &index_def->fields[i];
2157

2158 2159 2160
		dict_mem_index_add_field(index, ifield->field_name,
					 ifield->prefix_len);
	}
2161

2162
	/* Add the index to SYS_INDEXES, using the index prototype. */
marko's avatar
marko committed
2163
	err = row_merge_create_index_graph(trx, table, index);
2164

2165
	if (err == DB_SUCCESS) {
2166

2167 2168
		index = row_merge_dict_table_get_index(
			table, index_def);
2169

2170
		ut_a(index);
2171

2172
#ifdef ROW_MERGE_IS_INDEX_USABLE
2173 2174 2175 2176
		/* Note the id of the transaction that created this
		index, we use it to restrict readers from accessing
		this index, to ensure read consistency. */
		index->trx_id = trx->id;
2177
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2178 2179
	} else {
		index = NULL;
2180 2181 2182
	}

	return(index);
2183 2184
}

#ifdef ROW_MERGE_IS_INDEX_USABLE
/*************************************************************************
Check if a transaction can use an index. */
UNIV_INTERN
ibool
row_merge_is_index_usable(
/*======================*/
					/* out: TRUE if the transaction
					may use the index */
	const trx_t*		trx,	/* in: transaction */
	const dict_index_t*	index)	/* in: index to check */
{
	if (!trx->read_view) {
		/* No read view: the transaction is not bound to a
		snapshot, so the index is always usable. */
		return(TRUE);
	}

	/* The index is usable only if it was created by a transaction
	whose id is below the read view's low limit, i.e. the index
	existed before the snapshot was taken. */
	return(ut_dulint_cmp(index->trx_id, trx->read_view->low_limit_id) < 0);
}
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2202 2203

/*************************************************************************
2204
Drop the old table. */
2205
UNIV_INTERN
2206 2207 2208
ulint
row_merge_drop_table(
/*=================*/
2209
					/* out: DB_SUCCESS or error code */
2210 2211 2212
	trx_t*		trx,		/* in: transaction */
	dict_table_t*	table)		/* in: table to drop */
{
2213 2214 2215
	/* There must be no open transactions on the table. */
	ut_a(table->n_mysql_handles_opened == 0);

2216
	return(row_drop_table_for_mysql(table->name, trx, FALSE));
2217
}
2218 2219 2220 2221 2222

/*************************************************************************
Build indexes on a table by reading a clustered index,
creating a temporary file containing index entries, merge sorting
these index entries and inserting sorted index entries to indexes. */
2223
UNIV_INTERN
2224 2225 2226 2227 2228
ulint
row_merge_build_indexes(
/*====================*/
					/* out: DB_SUCCESS or error code */
	trx_t*		trx,		/* in: transaction */
marko's avatar
marko committed
2229
	dict_table_t*	old_table,	/* in: table where rows are
2230
					read from */
marko's avatar
marko committed
2231 2232 2233
	dict_table_t*	new_table,	/* in: table where indexes are
					created; identical to old_table
					unless creating a PRIMARY KEY */
2234
	dict_index_t**	indexes,	/* in: indexes to be created */
2235 2236 2237 2238
	ulint		n_indexes,	/* in: size of indexes[] */
	TABLE*		table)		/* in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
2239 2240
{
	merge_file_t*		merge_files;
2241 2242
	row_merge_block_t*	block;
	ulint			block_size;
2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258
	ulint			i;
	ulint			error;
	int			tmpfd;

	ut_ad(trx);
	ut_ad(old_table);
	ut_ad(new_table);
	ut_ad(indexes);
	ut_ad(n_indexes);

	trx_start_if_not_started(trx);

	/* Allocate memory for merge file data structure and initialize
	fields */

	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
2259 2260
	block_size = 3 * sizeof *block;
	block = os_mem_alloc_large(&block_size);
2261 2262 2263 2264 2265 2266 2267 2268

	for (i = 0; i < n_indexes; i++) {

		row_merge_file_create(&merge_files[i]);
	}

	tmpfd = innobase_mysql_tmpfile();

2269 2270 2271 2272
	/* Reset the MySQL row buffer that is used when reporting
	duplicate keys. */
	innobase_rec_reset(table);

2273 2274 2275 2276
	/* Read clustered index of the table and create files for
	secondary index entries for merge sort */

	error = row_merge_read_clustered_index(
2277
		trx, table, old_table, new_table, indexes,
2278
		merge_files, n_indexes, block);
2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289

	if (error != DB_SUCCESS) {

		goto func_exit;
	}

	/* Now we have files containing index entries ready for
	sorting and inserting. */

	for (i = 0; i < n_indexes; i++) {
		error = row_merge_sort(indexes[i], &merge_files[i],
2290
				       block, &tmpfd, table);
2291 2292 2293 2294

		if (error == DB_SUCCESS) {
			error = row_merge_insert_index_tuples(
				trx, indexes[i], new_table,
2295
				dict_table_zip_size(old_table),
2296
				merge_files[i].fd, block);
2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315
		}

		/* Close the temporary file to free up space. */
		row_merge_file_destroy(&merge_files[i]);

		if (error != DB_SUCCESS) {
			trx->error_key_num = i;
			goto func_exit;
		}
	}

func_exit:
	close(tmpfd);

	for (i = 0; i < n_indexes; i++) {
		row_merge_file_destroy(&merge_files[i]);
	}

	mem_free(merge_files);
2316
	os_mem_free_large(block, block_size);
2317 2318 2319

	return(error);
}