BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns

The problem was that MySQL did not have a true ctype implementation. As a result, many multibyte punctuation/whitespace characters were treated as word characters. This fix uses the recently added CTYPE table for Unicode character sets (WL1386) to detect Unicode punctuation/whitespace characters correctly. Note: this is an incompatible change, since it changes parser behavior. One will have to use the REPAIR TABLE statement to rebuild fulltext indexes.

BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
The problem was that MySQL did not have a true ctype implementation. As a result, many multibyte punctuation/whitespace characters were treated as word characters. This fix uses the recently added CTYPE table for Unicode character sets (WL1386) to detect Unicode punctuation/whitespace characters correctly. Note: this is an incompatible change, since it changes parser behavior. One will have to use the REPAIR TABLE statement to rebuild fulltext indexes.
53279f1d · svoj@may.pils.ru · 7f6afa8b · 53279f1d · 53279f1d · 53279f1d
Commit 53279f1d authored May 29, 2006 by svoj@may.pils.ru
5 changed files
--- a/mysql-test/r/fulltext2.result
+++ b/mysql-test/r/fulltext2.result
@@ -241,3 +241,11 @@ select * from t1 where match a against('ab c' in boolean mode);
 a
 drop table t1;
 set names latin1;
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+a
+„MySQL“
+DROP TABLE t1;
+SET NAMES latin1;
--- a/mysql-test/t/fulltext2.test
+++ b/mysql-test/t/fulltext2.test
@@ -221,3 +221,13 @@ drop table t1;
 set names latin1;

 # End of 4.1 tests
+
+#
+# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
+#
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+DROP TABLE t1;
+SET NAMES latin1;
--- a/storage/myisam/ft_parser.c
+++ b/storage/myisam/ft_parser.c
@@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
                 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
 {
  byte *doc=*start;
+  int ctype;
  uint mwc, length, mbl;

  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
@@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,

  while (doc<end)
  {
-    for (;doc<end;doc++)
+    for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
    {
-      if (true_word_char(cs,*doc)) break;
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
+        break;
      if (*doc == FTB_RQUOT && param->quot)
      {
        param->quot=doc;
@@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
    }

    mwc=length=0;
-    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
-      if (true_word_char(cs,*doc))
+    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
-
+    }
    param->prev='A'; /* be sure *prev is true_word_char */
    word->len= (uint)(doc-word->pos) - mwc;
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
@@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
 {
  byte *doc= *start;
  uint mwc, length, mbl;
+  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
-    for (;; doc++)
+    for (;; doc+= (mbl > 0 ? mbl : 1))
    {
-      if (doc >= end) DBUG_RETURN(0);
-      if (true_word_char(cs, *doc)) break;
+      if (doc >= end)
+        DBUG_RETURN(0);
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
+        break;
    }

    mwc= length= 0;
-    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
-      if (true_word_char(cs,*doc))
+    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
        mwc= 0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
+    }

    word->len= (uint)(doc-word->pos) - mwc;


--- a/storage/myisam/ft_update.c
+++ b/storage/myisam/ft_update.c
@@ -174,11 +174,6 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
  FT_SEG_ITERATOR ftsi1, ftsi2;
  CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
  DBUG_ENTER("_mi_ft_cmp");
-#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION
-  if (cs->mbmaxlen > 1)
-    DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
-#endif
-
  _mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
  _mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);


--- a/storage/myisam/ftdefs.h
+++ b/storage/myisam/ftdefs.h
@@ -24,9 +24,10 @@
 #include <queues.h>
 #include <mysql/plugin.h>

-#define true_word_char(s,X)	(my_isalnum(s,X) || (X)=='_')
+#define true_word_char(ctype, character) \
+                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+                       (character) == '_')
 #define misc_word_char(X)	0
-#define word_char(s,X)		(true_word_char(s,X) || misc_word_char(X))

 #define FT_MAX_WORD_LEN_FOR_SORT 31