MDEV-20797 FULLTEXT search with apostrophe, and mandatory words

- InnoDB should ignore a single character followed by an apostrophe while
tokenising the document. For example, if the input string is O'brien,
InnoDB currently separates it into two tokens, O and brien. After this
patch, InnoDB ignores the token 'O' and considers
only 'brien'.
parent a8a27f1e
...@@ -732,4 +732,32 @@ ALTER TABLE t1 DROP KEY `ftidx` ; ...@@ -732,4 +732,32 @@ ALTER TABLE t1 DROP KEY `ftidx` ;
INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL); INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL);
DROP TABLE t1; DROP TABLE t1;
SET @@global.innodb_file_per_table = @save; SET @@global.innodb_file_per_table = @save;
#
# MDEV-20797 FULLTEXT search with apostrophe,
# and mandatory words
#
CREATE TABLE t1(f1 TINYTEXT NOT NULL, FULLTEXT(f1))ENGINE=InnoDB;
INSERT INTO t1 VALUES('O''Brien'), ('O Brien'), ('''Brien');
INSERT INTO t1 VALUES('Brien'), ('O ''Brien'), ('O'' Brien');
INSERT INTO t1 VALUES('Doh''nuts');
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+O'Brien" IN BOOLEAN MODE);
f1
O'Brien
O Brien
'Brien
Brien
O 'Brien
O' Brien
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+Doh'nuts" IN BOOLEAN MODE);
f1
Doh'nuts
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+''Brien" IN BOOLEAN MODE);
f1
O'Brien
O Brien
'Brien
Brien
O 'Brien
O' Brien
DROP TABLE t1;
# End of 10.3 tests # End of 10.3 tests
...@@ -757,4 +757,17 @@ INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL); ...@@ -757,4 +757,17 @@ INSERT INTO t1 (col_int, col_text) VALUES ( 1255, NULL);
DROP TABLE t1; DROP TABLE t1;
SET @@global.innodb_file_per_table = @save; SET @@global.innodb_file_per_table = @save;
--echo #
--echo # MDEV-20797 FULLTEXT search with apostrophe,
--echo # and mandatory words
--echo #
# Fixture: a single FULLTEXT-indexed text column on InnoDB.
CREATE TABLE t1(f1 TINYTEXT NOT NULL, FULLTEXT(f1))ENGINE=InnoDB;
# Variants of the same name. Inside a single-quoted literal, '' is one
# escaped apostrophe, so these rows are: O'Brien, O Brien, 'Brien,
# Brien, O 'Brien and O' Brien.
INSERT INTO t1 VALUES('O''Brien'), ('O Brien'), ('''Brien');
INSERT INTO t1 VALUES('Brien'), ('O ''Brien'), ('O'' Brien');
# Control row: the text before the apostrophe is longer than one character.
INSERT INTO t1 VALUES('Doh''nuts');
# Mandatory-word (+) search where a single character precedes the apostrophe.
# The recorded result lists all six "Brien" rows and not the Doh'nuts row.
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+O'Brien" IN BOOLEAN MODE);
# Mandatory-word search for the control value; the recorded result lists
# only the Doh'nuts row.
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+Doh'nuts" IN BOOLEAN MODE);
# Search term with two literal apostrophes before the word (the string is
# double-quoted, so '' is not an escape here). The recorded result again
# lists all six "Brien" rows.
SELECT * FROM t1 WHERE MATCH (f1) AGAINST ("+''Brien" IN BOOLEAN MODE);
# Clean up the fixture.
DROP TABLE t1;
--echo # End of 10.3 tests --echo # End of 10.3 tests
...@@ -6912,7 +6912,8 @@ innobase_mysql_fts_get_token( ...@@ -6912,7 +6912,8 @@ innobase_mysql_fts_get_token(
ulint mwc = 0; ulint mwc = 0;
ulint length = 0; ulint length = 0;
bool reset_token_str = false;
reset:
token->f_str = const_cast<byte*>(doc); token->f_str = const_cast<byte*>(doc);
while (doc < end) { while (doc < end) {
...@@ -6923,6 +6924,9 @@ innobase_mysql_fts_get_token( ...@@ -6923,6 +6924,9 @@ innobase_mysql_fts_get_token(
cs, &ctype, (uchar*) doc, (uchar*) end); cs, &ctype, (uchar*) doc, (uchar*) end);
if (true_word_char(ctype, *doc)) { if (true_word_char(ctype, *doc)) {
mwc = 0; mwc = 0;
} else if (*doc == '\'' && length == 1) {
/* Could be apostrophe */
reset_token_str = true;
} else if (!misc_word_char(*doc) || mwc) { } else if (!misc_word_char(*doc) || mwc) {
break; break;
} else { } else {
...@@ -6932,6 +6936,14 @@ innobase_mysql_fts_get_token( ...@@ -6932,6 +6936,14 @@ innobase_mysql_fts_get_token(
++length; ++length;
doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
if (reset_token_str) {
/* Reset the token if the single character
followed by apostrophe */
mwc = 0;
length = 0;
reset_token_str = false;
goto reset;
}
} }
token->f_len = (uint) (doc - token->f_str) - mwc; token->f_len = (uint) (doc - token->f_str) - mwc;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment