Commit 7c767a30 authored by Marko Mäkelä, committed by Alexander Barkov

MDEV-10139 Support for InnoDB SEQUENCE objects

We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.

After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)

When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_flush_log_at_trx_commit=1.

Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in the middle of a transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.

dict_table_t::no_rollback(): Check if the table does not support rollback.

BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.

DICT_TF_BITS: Add the NO_ROLLBACK flag.

row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).

row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.

ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.

ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)

innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
parent 470c3fd9
......@@ -2502,6 +2502,7 @@ dict_index_add_to_cache_w_vcol(
ut_d(mem_heap_validate(index->heap));
ut_a(!dict_index_is_clust(index)
|| UT_LIST_GET_LEN(table->indexes) == 0);
ut_ad(dict_index_is_clust(index) || !table->no_rollback());
if (!dict_index_find_cols(table, index, add_v)) {
......
......@@ -3114,7 +3114,8 @@ innobase_copy_frm_flags_from_create_info(
ibool ps_on;
ibool ps_off;
if (dict_table_is_temporary(innodb_table)) {
if (dict_table_is_temporary(innodb_table)
|| innodb_table->no_rollback()) {
/* Temp tables do not use persistent stats. */
ps_on = FALSE;
ps_off = TRUE;
......@@ -12909,6 +12910,10 @@ create_table_info_t::innobase_table_flags()
default_compression_level : static_cast<ulint>(options->page_compression_level),
0);
if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) {
m_flags |= 1U << DICT_TF_POS_NO_ROLLBACK;
}
/* Set the flags2 when create table or alter tables */
m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
......@@ -13539,6 +13544,10 @@ ha_innobase::create(
trx_t* trx;
DBUG_ENTER("ha_innobase::create");
DBUG_ASSERT(form->s == table_share);
DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE
|| table_share->table_type == TABLE_TYPE_NORMAL);
create_table_info_t info(ha_thd(),
form,
create_info,
......@@ -16489,24 +16498,23 @@ ha_innobase::external_lock(
}
/* Check for UPDATEs in read-only mode. */
if (srv_read_only_mode
&& (thd_sql_command(thd) == SQLCOM_UPDATE
|| thd_sql_command(thd) == SQLCOM_INSERT
|| thd_sql_command(thd) == SQLCOM_REPLACE
|| thd_sql_command(thd) == SQLCOM_DROP_TABLE
|| thd_sql_command(thd) == SQLCOM_ALTER_TABLE
|| thd_sql_command(thd) == SQLCOM_OPTIMIZE
|| (thd_sql_command(thd) == SQLCOM_CREATE_TABLE
&& lock_type == F_WRLCK)
|| thd_sql_command(thd) == SQLCOM_CREATE_INDEX
|| thd_sql_command(thd) == SQLCOM_DROP_INDEX
|| thd_sql_command(thd) == SQLCOM_DELETE)) {
if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
ib_senderrf(thd, IB_LOG_LEVEL_WARN,
ER_READ_ONLY_MODE);
DBUG_RETURN(HA_ERR_TABLE_READONLY);
} else {
if (srv_read_only_mode) {
switch (thd_sql_command(thd)) {
case SQLCOM_CREATE_TABLE:
if (lock_type != F_WRLCK) {
break;
}
case SQLCOM_UPDATE:
case SQLCOM_INSERT:
case SQLCOM_REPLACE:
case SQLCOM_DROP_TABLE:
case SQLCOM_ALTER_TABLE:
case SQLCOM_OPTIMIZE:
case SQLCOM_CREATE_INDEX:
case SQLCOM_DROP_INDEX:
case SQLCOM_CREATE_SEQUENCE:
case SQLCOM_DROP_SEQUENCE:
case SQLCOM_DELETE:
ib_senderrf(thd, IB_LOG_LEVEL_WARN,
ER_READ_ONLY_MODE);
DBUG_RETURN(HA_ERR_TABLE_READONLY);
......@@ -17433,7 +17441,8 @@ ha_innobase::store_lock(
/* Use consistent read for checksum table */
if (sql_command == SQLCOM_CHECKSUM
|| (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ)
|| sql_command == SQLCOM_CREATE_SEQUENCE
|| (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ)
|| ((srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& trx->isolation_level != TRX_ISO_SERIALIZABLE
......
......@@ -42,6 +42,11 @@ enum {
/** sys fields will be found in the update vector or inserted
entry */
BTR_KEEP_SYS_FLAG = 4,
/** no rollback */
BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG
| BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG,
/** btr_cur_pessimistic_update() must keep cursor position
when moving columns to big_rec */
BTR_KEEP_POS_FLAG = 8,
......
......@@ -147,17 +147,20 @@ Width of the page compression flag
#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
/**
Width of atomic writes flag
DEFAULT=0, ON = 1, OFF = 2
*/
#define DICT_TF_WIDTH_ATOMIC_WRITES 2
/**
Width of the page encryption flag
*/
#define DICT_TF_WIDTH_PAGE_ENCRYPTION 1
#define DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY 8
/**
Width of atomic writes flag
DEFAULT=0, ON = 1, OFF = 2
*/
#define DICT_TF_WIDTH_ATOMIC_WRITES 2
/** Width of the NO_ROLLBACK flag */
#define DICT_TF_WIDTH_NO_ROLLBACK 1
/** Width of all the currently known table flags */
#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
......@@ -169,7 +172,8 @@ DEFAULT=0, ON = 1, OFF = 2
+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+ DICT_TF_WIDTH_ATOMIC_WRITES \
+ DICT_TF_WIDTH_PAGE_ENCRYPTION \
+ DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)
+ DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY \
+ DICT_TF_WIDTH_NO_ROLLBACK)
/** A mask of all the known/used bits in table flags */
#define DICT_TF_BIT_MASK (~(~0U << DICT_TF_BITS))
......@@ -203,9 +207,11 @@ DEFAULT=0, ON = 1, OFF = 2
/** Zero relative shift position of the PAGE_ENCRYPTION_KEY field */
#define DICT_TF_POS_PAGE_ENCRYPTION_KEY (DICT_TF_POS_PAGE_ENCRYPTION \
+ DICT_TF_WIDTH_PAGE_ENCRYPTION)
#define DICT_TF_POS_UNUSED (DICT_TF_POS_PAGE_ENCRYPTION_KEY \
/** Zero relative shift position of the NO_ROLLBACK field */
#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_ENCRYPTION_KEY \
+ DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)
#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \
+ DICT_TF_WIDTH_NO_ROLLBACK)
/** Bit mask of the COMPACT field */
#define DICT_TF_MASK_COMPACT \
((~(~0U << DICT_TF_WIDTH_COMPACT)) \
......@@ -1357,6 +1363,12 @@ struct dict_table_t {
/** Release the table handle. */
inline void release();
/** @return whether the NO_ROLLBACK flag is set in the table flags,
that is, whether the table does not support rollback */
bool no_rollback() const
{
	const unsigned no_rollback_bit = 1U << DICT_TF_POS_NO_ROLLBACK;
	return (flags & no_rollback_bit) != 0;
}
/** Id of the table. */
table_id_t id;
......
......@@ -3169,7 +3169,7 @@ row_ins_clust_index_entry(
log_free_check();
const ulint flags = dict_table_is_temporary(index->table)
? BTR_NO_LOCKING_FLAG
: 0;
: index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
err = row_ins_clust_index_entry_low(
flags, BTR_MODIFY_LEAF, index, n_uniq, entry,
......@@ -3703,7 +3703,27 @@ row_ins_step(
table during the search operation, and there is no need to set
it again here. But we must write trx->id to node->trx_id_buf. */
memset(node->trx_id_buf, 0, DATA_TRX_ID_LEN);
if (node->table->no_rollback()) {
/* No-rollback tables should only be accessed by a
single thread at a time. Concurrency control (mutual
exclusion) must be guaranteed by the SQL layer. */
DBUG_ASSERT(node->table->n_ref_count == 1);
DBUG_ASSERT(node->ins_type == INS_DIRECT);
/* No-rollback tables can consist only of a single index. */
DBUG_ASSERT(UT_LIST_GET_LEN(node->entry_list) == 1);
DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1);
/* There should be no possibility for interruption and
restarting here. In theory, we could allow resumption
from the INS_NODE_INSERT_ENTRIES state here. */
DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK);
memset(node->trx_id_buf, 0, DATA_TRX_ID_LEN);
memset(node->row_id_buf, 0, DATA_ROW_ID_LEN);
node->index = dict_table_get_first_index(node->table);
node->entry = UT_LIST_GET_FIRST(node->entry_list);
node->state = INS_NODE_INSERT_ENTRIES;
goto do_insert;
}
trx_write_trx_id(node->trx_id_buf, trx->id);
if (node->state == INS_NODE_SET_IX_LOCK) {
......@@ -3753,7 +3773,7 @@ row_ins_step(
return(thr);
}
do_insert:
/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
err = row_ins(node, thr);
......
......@@ -3696,7 +3696,7 @@ row_drop_table_for_mysql(
RemoteDatafile::delete_link_file(name);
}
if (!dict_table_is_temporary(table)) {
if (!dict_table_is_temporary(table) && !table->no_rollback()) {
dict_stats_recalc_pool_del(table);
dict_stats_defrag_pool_del(table, NULL);
......
......@@ -4130,9 +4130,10 @@ row_search_mvcc(
ulint direction)
{
DBUG_ENTER("row_search_mvcc");
DBUG_ASSERT(prebuilt->index->table == prebuilt->table);
dict_index_t* index = prebuilt->index;
ibool comp = dict_table_is_comp(index->table);
ibool comp = dict_table_is_comp(prebuilt->table);
const dtuple_t* search_tuple = prebuilt->search_tuple;
btr_pcur_t* pcur = prebuilt->pcur;
trx_t* trx = prebuilt->trx;
......@@ -4514,7 +4515,7 @@ row_search_mvcc(
que_thr_move_to_run_state_for_mysql(thr, trx);
clust_index = dict_table_get_first_index(index->table);
clust_index = dict_table_get_first_index(prebuilt->table);
/* Do some start-of-statement preparations */
......@@ -4543,7 +4544,7 @@ row_search_mvcc(
prebuilt->sql_stat_start = FALSE;
} else {
wait_table_again:
err = lock_table(0, index->table,
err = lock_table(0, prebuilt->table,
prebuilt->select_lock_type == LOCK_S
? LOCK_IS : LOCK_IX, thr);
......@@ -5072,7 +5073,8 @@ row_search_mvcc(
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
|| prebuilt->table->no_rollback()) {
/* Do nothing: we let a non-locking SELECT read the
latest version of the record */
......
......@@ -2260,7 +2260,7 @@ row_upd_sec_index_entry(
flags = BTR_NO_LOCKING_FLAG;
mtr.set_log_mode(MTR_LOG_NO_REDO);
} else {
flags = 0;
flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
}
if (!index->is_committed()) {
......@@ -3046,11 +3046,11 @@ row_upd_clust_step(
server or connection lifetime and so REDO information is not needed
on restart for recovery.
Disable locking as temp-tables are not shared across connection. */
if (dict_table_is_temporary(index->table)) {
if (dict_table_is_temporary(node->table)) {
flags = BTR_NO_LOCKING_FLAG;
mtr.set_log_mode(MTR_LOG_NO_REDO);
} else {
flags = 0;
flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
}
/* If the restoration does not succeed, then the same
......
......@@ -3,14 +3,14 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b CHAR(8)) ENGINE=rocksdb DATA DIRECTORY = '
ERROR HY000: Can't create table `test`.`t1` (errno: 140 "Wrong create options")
show warnings;
Level Code Message
Warning 1296 Got error 198 'Specifying DATA DIRECTORY for an individual table is not supported.' from ROCKSDB
Warning 1296 Got error 200 'Specifying DATA DIRECTORY for an individual table is not supported.' from ROCKSDB
Error 1005 Can't create table `test`.`t1` (errno: 140 "Wrong create options")
Warning 1030 Got error 140 "Wrong create options" from storage engine ROCKSDB
CREATE TABLE t1 (a INT PRIMARY KEY, b CHAR(8)) ENGINE=rocksdb INDEX DIRECTORY = '/foo/bar/index';
ERROR HY000: Can't create table `test`.`t1` (errno: 140 "Wrong create options")
show warnings;
Level Code Message
Warning 1296 Got error 199 'Specifying INDEX DIRECTORY for an individual table is not supported.' from ROCKSDB
Warning 1296 Got error 201 'Specifying INDEX DIRECTORY for an individual table is not supported.' from ROCKSDB
Error 1005 Can't create table `test`.`t1` (errno: 140 "Wrong create options")
Warning 1030 Got error 140 "Wrong create options" from storage engine ROCKSDB
CREATE TABLE t1 (id INT NOT NULL PRIMARY KEY) ENGINE=rocksdb PARTITION BY RANGE (id)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment