From 862769a30ec0643bd59c8a6e8f0cd0c0d4cbed94 Mon Sep 17 00:00:00 2001
From: unknown <serg@serg.mysql.com>
Date: Thu, 18 Apr 2002 14:12:29 +0000
Subject: [PATCH] phrase search

---
 myisam/ft_boolean_search.c   | 79 ++++++++++++++++++++++++++----------
 myisam/ft_parser.c           | 14 +++++--
 myisam/ftdefs.h              |  5 ++-
 mysql-test/r/fulltext.result |  3 ++
 mysql-test/t/fulltext.test   |  3 +-
 5 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/myisam/ft_boolean_search.c b/myisam/ft_boolean_search.c
index dd310b4921a..10b5044826f 100644
--- a/myisam/ft_boolean_search.c
+++ b/myisam/ft_boolean_search.c
@@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
 typedef struct st_ftb_expr FTB_EXPR;
 struct st_ftb_expr {
   FTB_EXPR *up;
+  byte     *quot, *qend;
   float     weight;
   uint      flags;
   my_off_t  docid[2];             /* for index search and for scan */
@@ -113,7 +114,7 @@ int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
 }
 
 void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
-		      FTB_EXPR *up, uint depth)
+                      FTB_EXPR *up, uint depth)
 {
   byte        res;
   FTB_PARAM   param;
@@ -126,16 +127,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
     return;
 
   param.prev=' ';
+  param.quot=up->quot;
   while ((res=ft_get_word(start,end,&w,&param)))
   {
-    int  r=param.plusminus;
+    int   r=param.plusminus;
     float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
     switch (res) {
       case 1: /* word found */
         ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root,
-				    sizeof(FTB_WORD) +
-				    (param.trunc ? MI_MAX_KEY_BUFF :
-				     w.len+extra));
+                                    sizeof(FTB_WORD) +
+                                    (param.trunc ? MI_MAX_KEY_BUFF :
+                                     w.len+extra));
         ftbw->len=w.len+1;
         ftbw->flags=0;
         if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES;
@@ -149,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
         ftbw->word[0]=w.len;
         if (param.yesno > 0) up->ythresh++;
         queue_insert(& ftb->queue, (byte *)ftbw);
-        ftb->with_scan|=param.trunc;
+        ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC);
         break;
       case 2: /* left bracket */
         ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
@@ -160,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
         ftbe->up=up;
         ftbe->ythresh=ftbe->yweaks=0;
         ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
+        if ((ftbe->quot=param.quot)) ftb->with_scan|=2;
         if (param.yesno > 0) up->ythresh++;
         _ftb_parse_query(ftb, start, end, ftbe, depth+1);
         break;
       case 3: /* right bracket */
+        if (up->quot) up->qend=param.quot;
         return;
     }
   }
@@ -209,7 +213,7 @@ void  _ftb_init_index_search(FT_INFO *ftb)
                          ftbw->len     - (ftbw->flags&FTB_FLAG_TRUNC),
                          ftbw->word    + (ftbw->flags&FTB_FLAG_TRUNC),
                          ftbw->len     - (ftbw->flags&FTB_FLAG_TRUNC),
-			 0);
+                         0);
     }
     if (r) /* not found */
     {
@@ -260,7 +264,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
   ftbe->weight=1;
   ftbe->flags=FTB_FLAG_YES;
   ftbe->nos=1;
-  ftbe->up=0;
+  ftbe->quot=ftbe->up=0;
   ftbe->ythresh=ftbe->yweaks=0;
   ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
   ftb->root=ftbe;
@@ -270,16 +274,39 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
   memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
   qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
                               (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
-  if (ftb->queue.elements<2) ftb->with_scan=0;
+  if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
   ftb->state=READY;
   return ftb;
 }
 
-void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
+/* Returns 1 if bytes [s0,e0) contain bytes [s1,e1) (e0, e1 exclusive),
+   compared through cs->to_upper; otherwise 0. An empty [s1,e1) matches
+   any non-empty [s0,e0). */
+int _ftb_strstr(const byte *s0, const byte *e0,
+                const byte *s1, const byte *e1,
+                CHARSET_INFO *cs)
 {
+  const byte *p0, *p1;
+
+  for (; s0 < e0; s0++) /* try every start offset; none may be skipped */
+  {
+    /* p1 moves only past equal bytes, so p1 >= e1 means a full match;
+       uchar casts keep the index non-negative should byte be signed */
+    for (p0=s0, p1=s1; p0 < e0 && p1 < e1; p0++, p1++)
+      if (cs->to_upper[(uchar) *p0] != cs->to_upper[(uchar) *p1])
+        break;
+    if (p1 >= e1)
+      return 1;
+  }
+  return 0;
+}
+
+void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
+{
+  FT_SEG_ITERATOR ftsi;
   FTB_EXPR *ftbe;
   float weight=ftbw->weight;
-  int  yn=ftbw->flags, ythresh;
+  int  yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0);
   my_off_t curdoc=ftbw->docid[mode];
 
   for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
@@ -300,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
       {
         yn=ftbe->flags;
         weight=ftbe->cur_weight*ftbe->weight;
+        if (mode && ftbe->quot)
+        {
+          int not_found=1;
+
+          memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
+          while (_mi_ft_segiterator(&ftsi) && not_found)
+          {
+            if (!ftsi.pos)
+              continue;
+            not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len,
+                                      ftbe->quot, ftbe->qend, ftb->charset);
+          }
+          if (not_found) break;
+        } /* ftbe->quot */
       }
       else
         break;
@@ -356,7 +397,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
   {
     while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
     {
-      _ftb_climb_the_tree(ftbw,0);
+      _ftb_climb_the_tree(ftb, ftbw, 0);
 
       /* update queue */
       r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY,
@@ -367,7 +408,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
                            info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC),
                            ftbw->len     - (ftbw->flags&FTB_FLAG_TRUNC),
                            ftbw->word    + (ftbw->flags&FTB_FLAG_TRUNC),
-			   ftbw->len     - (ftbw->flags&FTB_FLAG_TRUNC),
+                           ftbw->len     - (ftbw->flags&FTB_FLAG_TRUNC),
                            0);
       }
       if (r) /* not found */
@@ -414,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
   FT_WORD word;
   FTB_WORD *ftbw;
   FTB_EXPR *ftbe;
-  FT_SEG_ITERATOR ftsi;
+  FT_SEG_ITERATOR ftsi, ftsi2;
   const byte *end;
   my_off_t  docid=ftb->info->lastpos;
 
@@ -423,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
   if (!ftb->queue.elements)
     return 0;
 
-#if NOT_USED
-  if (ftb->state == READY || ftb->state == INDEX_DONE)
-    ftb->state=SCAN;
-  else if (ftb->state != SCAN)
-    return -3.0;
-#endif
-
   if (ftb->keynr==NO_SUCH_KEY)
     _mi_ft_segiterator_dummy_init(record, length, &ftsi);
   else
     _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
+  memcpy(&ftsi2, &ftsi, sizeof(ftsi));
 
   while (_mi_ft_segiterator(&ftsi))
   {
@@ -464,7 +499,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
         if (ftbw->docid[1] == docid)
           continue;
         ftbw->docid[1]=docid;
-        _ftb_climb_the_tree(ftbw,1);
+        _ftb_climb_the_tree(ftb, ftbw, &ftsi2);
       }
     }
   }
diff --git a/myisam/ft_parser.c b/myisam/ft_parser.c
index 78529efed0b..39dcf4b458b 100644
--- a/myisam/ft_parser.c
+++ b/myisam/ft_parser.c
@@ -133,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
     for (;doc<end;doc++)
     {
       if (true_word_char(*doc)) break;
-      if (*doc == FTB_LBR || *doc == FTB_RBR)
+      if (*doc == FTB_RQUOT && param->quot) {
+        param->quot=doc-1;
+        *start=doc+1;
+        return 3; /* FTB_RBR */
+      }
+      if ((*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
+          && !param->quot)
       {
         /* param->prev=' '; */
         *start=doc+1;
+        if (*doc == FTB_LQUOT) param->quot=*start;
         return (*doc == FTB_RBR)+2;
       }
-      if (param->prev == ' ')
+      if (param->prev == ' ' && !param->quot)
       {
         if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
         if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
@@ -149,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
         if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
       }
       param->prev=*doc;
-      param->yesno=param->plusminus=param->pmsign=0;
+      param->yesno=(param->quot != 0);
+      param->plusminus=param->pmsign=0;
     }
 
     mwc=0;
diff --git a/myisam/ftdefs.h b/myisam/ftdefs.h
index e02220d0fd5..a1352a13150 100644
--- a/myisam/ftdefs.h
+++ b/myisam/ftdefs.h
@@ -95,6 +95,8 @@ extern ulong collstat;
 #define FTB_RBR   (ft_boolean_syntax[6])
 #define FTB_NEG   (ft_boolean_syntax[7])
 #define FTB_TRUNC (ft_boolean_syntax[8])
+#define FTB_LQUOT (ft_boolean_syntax[10])
+#define FTB_RQUOT (ft_boolean_syntax[11])
 
 typedef struct st_ft_word {
   byte * pos;
@@ -111,6 +113,7 @@ typedef struct st_ftb_param {
   int  plusminus;
   bool pmsign;
   bool trunc;
+  byte *quot;
 } FTB_PARAM;
 
 int is_stopword(char *word, uint len);
@@ -132,7 +135,7 @@ uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
 
 void ft_parse_init(TREE *, CHARSET_INFO *);
 int ft_parse(TREE *, byte *, int);
-FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *);
+FT_WORD * ft_linearize(TREE *);
 FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
 uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record);
 
diff --git a/mysql-test/r/fulltext.result b/mysql-test/r/fulltext.result
index dd5e59e4081..cd9d1a93c58 100644
--- a/mysql-test/r/fulltext.result
+++ b/mysql-test/r/fulltext.result
@@ -67,6 +67,9 @@ Full-text indexes	are called collections	1
 Only MyISAM tables	support collections	2
 Function MATCH ... AGAINST()	is used to do a search	0
 Full-text search in MySQL	implements vector space model	0
+select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
+a	b
+MySQL has now support	for full-text search
 select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
 a	b
 Full-text search in MySQL	implements vector space model
diff --git a/mysql-test/t/fulltext.test b/mysql-test/t/fulltext.test
index ea7a572951a..616ca7cb081 100644
--- a/mysql-test/t/fulltext.test
+++ b/mysql-test/t/fulltext.test
@@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
 # UNION of fulltext's
 select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
 
-
 # boolean search
 
 select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE);
@@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN
 select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1;
 select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1;
 
+select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
+
 # boolean w/o index:
 
 select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
-- 
2.30.9