Commit 53279f1d authored by svoj@may.pils.ru's avatar svoj@may.pils.ru

BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns

The problem was that MySQL hadn't true ctype implementation. As a
result many multibyte punctuation/whitespace characters were
treated as word characters.

This fix uses recently added CTYPE table for unicode character sets
(WL1386) to detect unicode punctuation/whitespace characters
correctly.

Note: this is incompatible change since it changes parser behavior.
One will have to use REPAIR TABLE statement to rebuild fulltext
indexes.
parent 7f6afa8b
...@@ -241,3 +241,11 @@ select * from t1 where match a against('ab c' in boolean mode); ...@@ -241,3 +241,11 @@ select * from t1 where match a against('ab c' in boolean mode);
a a
drop table t1; drop table t1;
set names latin1; set names latin1;
SET NAMES utf8;
CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
INSERT INTO t1 VALUES('„MySQL“');
SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
a
„MySQL“
DROP TABLE t1;
SET NAMES latin1;
...@@ -221,3 +221,13 @@ drop table t1; ...@@ -221,3 +221,13 @@ drop table t1;
set names latin1; set names latin1;
# End of 4.1 tests # End of 4.1 tests
#
# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
#
SET NAMES utf8;
CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
INSERT INTO t1 VALUES('„MySQL“');
SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
DROP TABLE t1;
SET NAMES latin1;
...@@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, ...@@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
{ {
byte *doc=*start; byte *doc=*start;
int ctype;
uint mwc, length, mbl; uint mwc, length, mbl;
param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
...@@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, ...@@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
while (doc<end) while (doc<end)
{ {
for (;doc<end;doc++) for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
{ {
if (true_word_char(cs,*doc)) break; mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
if (true_word_char(ctype, *doc))
break;
if (*doc == FTB_RQUOT && param->quot) if (*doc == FTB_RQUOT && param->quot)
{ {
param->quot=doc; param->quot=doc;
...@@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, ...@@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
} }
mwc=length=0; mwc=length=0;
for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
if (true_word_char(cs,*doc)) {
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
if (true_word_char(ctype, *doc))
mwc=0; mwc=0;
else if (!misc_word_char(*doc) || mwc) else if (!misc_word_char(*doc) || mwc)
break; break;
else else
mwc++; mwc++;
}
param->prev='A'; /* be sure *prev is true_word_char */ param->prev='A'; /* be sure *prev is true_word_char */
word->len= (uint)(doc-word->pos) - mwc; word->len= (uint)(doc-word->pos) - mwc;
if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
...@@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end, ...@@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
{ {
byte *doc= *start; byte *doc= *start;
uint mwc, length, mbl; uint mwc, length, mbl;
int ctype;
DBUG_ENTER("ft_simple_get_word"); DBUG_ENTER("ft_simple_get_word");
do do
{ {
for (;; doc++) for (;; doc+= (mbl > 0 ? mbl : 1))
{ {
if (doc >= end) DBUG_RETURN(0); if (doc >= end)
if (true_word_char(cs, *doc)) break; DBUG_RETURN(0);
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
if (true_word_char(ctype, *doc))
break;
} }
mwc= length= 0; mwc= length= 0;
for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
if (true_word_char(cs,*doc)) {
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
if (true_word_char(ctype, *doc))
mwc= 0; mwc= 0;
else if (!misc_word_char(*doc) || mwc) else if (!misc_word_char(*doc) || mwc)
break; break;
else else
mwc++; mwc++;
}
word->len= (uint)(doc-word->pos) - mwc; word->len= (uint)(doc-word->pos) - mwc;
......
...@@ -174,11 +174,6 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2) ...@@ -174,11 +174,6 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
FT_SEG_ITERATOR ftsi1, ftsi2; FT_SEG_ITERATOR ftsi1, ftsi2;
CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
DBUG_ENTER("_mi_ft_cmp"); DBUG_ENTER("_mi_ft_cmp");
#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION
if (cs->mbmaxlen > 1)
DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
#endif
_mi_ft_segiterator_init(info, keynr, rec1, &ftsi1); _mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
_mi_ft_segiterator_init(info, keynr, rec2, &ftsi2); _mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);
......
...@@ -24,9 +24,10 @@ ...@@ -24,9 +24,10 @@
#include <queues.h> #include <queues.h>
#include <mysql/plugin.h> #include <mysql/plugin.h>
#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_') #define true_word_char(ctype, character) \
((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
(character) == '_')
#define misc_word_char(X) 0 #define misc_word_char(X) 0
#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X))
#define FT_MAX_WORD_LEN_FOR_SORT 31 #define FT_MAX_WORD_LEN_FOR_SORT 31
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment