From 862769a30ec0643bd59c8a6e8f0cd0c0d4cbed94 Mon Sep 17 00:00:00 2001 From: unknown <serg@serg.mysql.com> Date: Thu, 18 Apr 2002 14:12:29 +0000 Subject: [PATCH] phrase search --- myisam/ft_boolean_search.c | 79 ++++++++++++++++++++++++++---------- myisam/ft_parser.c | 14 +++++-- myisam/ftdefs.h | 5 ++- mysql-test/r/fulltext.result | 3 ++ mysql-test/t/fulltext.test | 3 +- 5 files changed, 77 insertions(+), 27 deletions(-) diff --git a/myisam/ft_boolean_search.c b/myisam/ft_boolean_search.c index dd310b4921a..10b5044826f 100644 --- a/myisam/ft_boolean_search.c +++ b/myisam/ft_boolean_search.c @@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ typedef struct st_ftb_expr FTB_EXPR; struct st_ftb_expr { FTB_EXPR *up; + byte *quot, *qend; float weight; uint flags; my_off_t docid[2]; /* for index search and for scan */ @@ -113,7 +114,7 @@ int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) } void _ftb_parse_query(FTB *ftb, byte **start, byte *end, - FTB_EXPR *up, uint depth) + FTB_EXPR *up, uint depth) { byte res; FTB_PARAM param; @@ -126,16 +127,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, return; param.prev=' '; + param.quot=up->quot; while ((res=ft_get_word(start,end,&w,&param))) { - int r=param.plusminus; + int r=param.plusminus; float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; switch (res) { case 1: /* word found */ ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root, - sizeof(FTB_WORD) + - (param.trunc ? MI_MAX_KEY_BUFF : - w.len+extra)); + sizeof(FTB_WORD) + + (param.trunc ?
MI_MAX_KEY_BUFF : + w.len+extra)); ftbw->len=w.len+1; ftbw->flags=0; if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES; @@ -149,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ftbw->word[0]=w.len; if (param.yesno > 0) up->ythresh++; queue_insert(& ftb->queue, (byte *)ftbw); - ftb->with_scan|=param.trunc; + ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC); break; case 2: /* left bracket */ ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)); @@ -160,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ftbe->up=up; ftbe->ythresh=ftbe->yweaks=0; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; + if ((ftbe->quot=param.quot)) ftb->with_scan|=2; if (param.yesno > 0) up->ythresh++; _ftb_parse_query(ftb, start, end, ftbe, depth+1); break; case 3: /* right bracket */ + if (up->quot) up->qend=param.quot; return; } } @@ -209,7 +213,7 @@ void _ftb_init_index_search(FT_INFO *ftb) ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), - 0); + 0); } if (r) /* not found */ { @@ -260,7 +264,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, ftbe->weight=1; ftbe->flags=FTB_FLAG_YES; ftbe->nos=1; - ftbe->up=0; + ftbe->quot=ftbe->up=0; ftbe->ythresh=ftbe->yweaks=0; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; ftb->root=ftbe; @@ -270,16 +274,39 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset); - if (ftb->queue.elements<2) ftb->with_scan=0; + if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; ftb->state=READY; return ftb; } -void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) +/* Return 1 if the needle [s1,e1) occurs in the haystack [s0,e0), else 0. + Bytes are compared case-insensitively via cs->to_upper[]; an empty + needle is contained in every haystack. */ +int _ftb_strstr(const byte *s0, const byte *e0, + const byte *s1, const byte *e1, + CHARSET_INFO *cs) { + const byte *h, *n; + +
if (s1 >= e1) + return 1; /* nothing to look for; also avoids reading *s1 */ + for (; e0 - s0 >= e1 - s1; s0++) /* every start that leaves room for the needle */ + { + for (h=s0, n=s1; n < e1; h++, n++) + if (cs->to_upper[(uchar) *h] != cs->to_upper[(uchar) *n]) + break; /* mismatch: retry from the next start, not past the consumed bytes */ + if (n >= e1) + return 1; /* whole needle matched */ + } + return 0; +} + +void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) +{ + FT_SEG_ITERATOR ftsi; FTB_EXPR *ftbe; float weight=ftbw->weight; - int yn=ftbw->flags, ythresh; + int yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0); my_off_t curdoc=ftbw->docid[mode]; for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) @@ -300,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) { yn=ftbe->flags; weight=ftbe->cur_weight*ftbe->weight; + if (mode && ftbe->quot) + { + int not_found=1; + + memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); + while (_mi_ft_segiterator(&ftsi) && not_found) + { + if (!ftsi.pos) + continue; + not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len, + ftbe->quot, ftbe->qend, ftb->charset); + } + if (not_found) break; + } /* ftbe->quot */ } else break; @@ -356,7 +397,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) { while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) { - _ftb_climb_the_tree(ftbw,0); + _ftb_climb_the_tree(ftb, ftbw, 0); /* update queue */ r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY, @@ -367,7 +408,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), - ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), + ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), 0); } if (r) /* not found */ @@ -414,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) FT_WORD word; FTB_WORD *ftbw; FTB_EXPR *ftbe; - FT_SEG_ITERATOR ftsi; + FT_SEG_ITERATOR ftsi, ftsi2; const byte *end; my_off_t docid=ftb->info->lastpos; @@ -423,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint
length) if (!ftb->queue.elements) return 0; -#if NOT_USED - if (ftb->state == READY || ftb->state == INDEX_DONE) - ftb->state=SCAN; - else if (ftb->state != SCAN) - return -3.0; -#endif - if (ftb->keynr==NO_SUCH_KEY) _mi_ft_segiterator_dummy_init(record, length, &ftsi); else _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); + memcpy(&ftsi2, &ftsi, sizeof(ftsi)); while (_mi_ft_segiterator(&ftsi)) { @@ -464,7 +499,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) if (ftbw->docid[1] == docid) continue; ftbw->docid[1]=docid; - _ftb_climb_the_tree(ftbw,1); + _ftb_climb_the_tree(ftb, ftbw, &ftsi2); } } } diff --git a/myisam/ft_parser.c b/myisam/ft_parser.c index 78529efed0b..39dcf4b458b 100644 --- a/myisam/ft_parser.c +++ b/myisam/ft_parser.c @@ -133,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) for (;doc<end;doc++) { if (true_word_char(*doc)) break; - if (*doc == FTB_LBR || *doc == FTB_RBR) + if (*doc == FTB_RQUOT && param->quot) { + param->quot=doc-1; + *start=doc+1; + return 3; /* FTB_RBR */ + } + if ((*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) + && !param->quot) { /* param->prev=' '; */ *start=doc+1; + if (*doc == FTB_LQUOT) param->quot=*start; return (*doc == FTB_RBR)+2; } - if (param->prev == ' ') + if (param->prev == ' ' && !param->quot) { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else @@ -149,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; } } param->prev=*doc; - param->yesno=param->plusminus=param->pmsign=0; + param->yesno=(param->quot != 0); + param->plusminus=param->pmsign=0; } mwc=0; diff --git a/myisam/ftdefs.h b/myisam/ftdefs.h index e02220d0fd5..a1352a13150 100644 --- a/myisam/ftdefs.h +++ b/myisam/ftdefs.h @@ -95,6 +95,8 @@ extern ulong collstat; #define FTB_RBR (ft_boolean_syntax[6]) 
#define FTB_NEG (ft_boolean_syntax[7]) #define FTB_TRUNC (ft_boolean_syntax[8]) +#define FTB_LQUOT (ft_boolean_syntax[10]) +#define FTB_RQUOT (ft_boolean_syntax[11]) typedef struct st_ft_word { byte * pos; @@ -111,6 +113,7 @@ typedef struct st_ftb_param { int plusminus; bool pmsign; bool trunc; + byte *quot; } FTB_PARAM; int is_stopword(char *word, uint len); @@ -132,7 +135,7 @@ uint _mi_ft_segiterator(FT_SEG_ITERATOR *); void ft_parse_init(TREE *, CHARSET_INFO *); int ft_parse(TREE *, byte *, int); -FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *); +FT_WORD * ft_linearize(TREE *); FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *); uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record); diff --git a/mysql-test/r/fulltext.result b/mysql-test/r/fulltext.result index dd5e59e4081..cd9d1a93c58 100644 --- a/mysql-test/r/fulltext.result +++ b/mysql-test/r/fulltext.result @@ -67,6 +67,9 @@ Full-text indexes are called collections 1 Only MyISAM tables support collections 2 Function MATCH ... 
AGAINST() is used to do a search 0 Full-text search in MySQL implements vector space model 0 +select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE); +a b +MySQL has now support for full-text search select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE); a b Full-text search in MySQL implements vector space model diff --git a/mysql-test/t/fulltext.test b/mysql-test/t/fulltext.test index ea7a572951a..616ca7cb081 100644 --- a/mysql-test/t/fulltext.test +++ b/mysql-test/t/fulltext.test @@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections"); # UNION of fulltext's select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes"); - # boolean search select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE); @@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1; select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1; +select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE); + # boolean w/o index: select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE); -- 2.30.9