Commit 862769a3 authored by unknown

phrase search

parent d8764f0b
...@@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ ...@@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
typedef struct st_ftb_expr FTB_EXPR; typedef struct st_ftb_expr FTB_EXPR;
struct st_ftb_expr { struct st_ftb_expr {
FTB_EXPR *up; FTB_EXPR *up;
byte *quot, *qend;
float weight; float weight;
uint flags; uint flags;
my_off_t docid[2]; /* for index search and for scan */ my_off_t docid[2]; /* for index search and for scan */
...@@ -113,7 +114,7 @@ int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) ...@@ -113,7 +114,7 @@ int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
} }
void _ftb_parse_query(FTB *ftb, byte **start, byte *end, void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
FTB_EXPR *up, uint depth) FTB_EXPR *up, uint depth)
{ {
byte res; byte res;
FTB_PARAM param; FTB_PARAM param;
...@@ -126,16 +127,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ...@@ -126,16 +127,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
return; return;
param.prev=' '; param.prev=' ';
param.quot=up->quot;
while ((res=ft_get_word(start,end,&w,&param))) while ((res=ft_get_word(start,end,&w,&param)))
{ {
int r=param.plusminus; int r=param.plusminus;
float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
switch (res) { switch (res) {
case 1: /* word found */ case 1: /* word found */
ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root, ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root,
sizeof(FTB_WORD) + sizeof(FTB_WORD) +
(param.trunc ? MI_MAX_KEY_BUFF : (param.trunc ? MI_MAX_KEY_BUFF :
w.len+extra)); w.len+extra));
ftbw->len=w.len+1; ftbw->len=w.len+1;
ftbw->flags=0; ftbw->flags=0;
if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES; if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES;
...@@ -149,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ...@@ -149,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
ftbw->word[0]=w.len; ftbw->word[0]=w.len;
if (param.yesno > 0) up->ythresh++; if (param.yesno > 0) up->ythresh++;
queue_insert(& ftb->queue, (byte *)ftbw); queue_insert(& ftb->queue, (byte *)ftbw);
ftb->with_scan|=param.trunc; ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC);
break; break;
case 2: /* left bracket */ case 2: /* left bracket */
ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)); ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
...@@ -160,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ...@@ -160,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
ftbe->up=up; ftbe->up=up;
ftbe->ythresh=ftbe->yweaks=0; ftbe->ythresh=ftbe->yweaks=0;
ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
if ((ftbe->quot=param.quot)) ftb->with_scan|=2;
if (param.yesno > 0) up->ythresh++; if (param.yesno > 0) up->ythresh++;
_ftb_parse_query(ftb, start, end, ftbe, depth+1); _ftb_parse_query(ftb, start, end, ftbe, depth+1);
break; break;
case 3: /* right bracket */ case 3: /* right bracket */
if (up->quot) up->qend=param.quot;
return; return;
} }
} }
...@@ -209,7 +213,7 @@ void _ftb_init_index_search(FT_INFO *ftb) ...@@ -209,7 +213,7 @@ void _ftb_init_index_search(FT_INFO *ftb)
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
0); 0);
} }
if (r) /* not found */ if (r) /* not found */
{ {
...@@ -260,7 +264,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, ...@@ -260,7 +264,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
ftbe->weight=1; ftbe->weight=1;
ftbe->flags=FTB_FLAG_YES; ftbe->flags=FTB_FLAG_YES;
ftbe->nos=1; ftbe->nos=1;
ftbe->up=0; ftbe->quot=ftbe->up=0;
ftbe->ythresh=ftbe->yweaks=0; ftbe->ythresh=ftbe->yweaks=0;
ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
ftb->root=ftbe; ftb->root=ftbe;
...@@ -270,16 +274,39 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, ...@@ -270,16 +274,39 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
(qsort2_cmp)FTB_WORD_cmp_list, ftb->charset); (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
if (ftb->queue.elements<2) ftb->with_scan=0; if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
ftb->state=READY; ftb->state=READY;
return ftb; return ftb;
} }
void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) /* returns 1 if str0 contain str1 */
/*
  Case-insensitive search for the byte range [s1, e1) inside [s0, e0).

  Bytes are compared after mapping each one through cs->to_upper[].

  s0, e0   start and one-past-the-end of the text being searched
  s1, e1   start and one-past-the-end of the phrase being looked for
  cs       character set whose to_upper[] table is used for case folding

  Returns 1 if the phrase occurs somewhere in the text, 0 otherwise.
  An empty (or inverted) phrase range is reported as found, so the
  function never reads through s1 when there is nothing to compare.
*/
int _ftb_strstr(const byte *s0, const byte *e0,
                const byte *s1, const byte *e1,
                CHARSET_INFO *cs)
{
  const byte *h, *n;

  if (s1 >= e1)
    return 1;

  /*
    Try every start position that still leaves room for the whole phrase.
    Restarting from s0+1 after a failed attempt (instead of continuing from
    where the comparison stopped) is what makes overlapping prefixes work,
    e.g. finding "aab" in "aaab".
  */
  for (; (e0 - s0) >= (e1 - s1); s0++)
  {
    h= s0;
    n= s1;
    /*
      The length check above guarantees h stays below e0 while n < e1.
      The casts keep the table index non-negative in case `byte` is a
      signed type.
    */
    while (n < e1 &&
           cs->to_upper[(unsigned char) *h] ==
           cs->to_upper[(unsigned char) *n])
    {
      h++;
      n++;
    }
    /* Only a comparison that consumed the entire phrase is a match. */
    if (n == e1)
      return 1;
  }
  return 0;
}
void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
{
FT_SEG_ITERATOR ftsi;
FTB_EXPR *ftbe; FTB_EXPR *ftbe;
float weight=ftbw->weight; float weight=ftbw->weight;
int yn=ftbw->flags, ythresh; int yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0);
my_off_t curdoc=ftbw->docid[mode]; my_off_t curdoc=ftbw->docid[mode];
for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
...@@ -300,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) ...@@ -300,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
{ {
yn=ftbe->flags; yn=ftbe->flags;
weight=ftbe->cur_weight*ftbe->weight; weight=ftbe->cur_weight*ftbe->weight;
if (mode && ftbe->quot)
{
int not_found=1;
memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
while (_mi_ft_segiterator(&ftsi) && not_found)
{
if (!ftsi.pos)
continue;
not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len,
ftbe->quot, ftbe->qend, ftb->charset);
}
if (not_found) break;
} /* ftbe->quot */
} }
else else
break; break;
...@@ -356,7 +397,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) ...@@ -356,7 +397,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
{ {
while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
{ {
_ftb_climb_the_tree(ftbw,0); _ftb_climb_the_tree(ftb, ftbw, 0);
/* update queue */ /* update queue */
r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY, r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY,
...@@ -367,7 +408,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) ...@@ -367,7 +408,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC), info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
0); 0);
} }
if (r) /* not found */ if (r) /* not found */
...@@ -414,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) ...@@ -414,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
FT_WORD word; FT_WORD word;
FTB_WORD *ftbw; FTB_WORD *ftbw;
FTB_EXPR *ftbe; FTB_EXPR *ftbe;
FT_SEG_ITERATOR ftsi; FT_SEG_ITERATOR ftsi, ftsi2;
const byte *end; const byte *end;
my_off_t docid=ftb->info->lastpos; my_off_t docid=ftb->info->lastpos;
...@@ -423,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) ...@@ -423,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
if (!ftb->queue.elements) if (!ftb->queue.elements)
return 0; return 0;
#if NOT_USED
if (ftb->state == READY || ftb->state == INDEX_DONE)
ftb->state=SCAN;
else if (ftb->state != SCAN)
return -3.0;
#endif
if (ftb->keynr==NO_SUCH_KEY) if (ftb->keynr==NO_SUCH_KEY)
_mi_ft_segiterator_dummy_init(record, length, &ftsi); _mi_ft_segiterator_dummy_init(record, length, &ftsi);
else else
_mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
memcpy(&ftsi2, &ftsi, sizeof(ftsi));
while (_mi_ft_segiterator(&ftsi)) while (_mi_ft_segiterator(&ftsi))
{ {
...@@ -464,7 +499,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) ...@@ -464,7 +499,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
if (ftbw->docid[1] == docid) if (ftbw->docid[1] == docid)
continue; continue;
ftbw->docid[1]=docid; ftbw->docid[1]=docid;
_ftb_climb_the_tree(ftbw,1); _ftb_climb_the_tree(ftb, ftbw, &ftsi2);
} }
} }
} }
......
...@@ -133,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) ...@@ -133,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
for (;doc<end;doc++) for (;doc<end;doc++)
{ {
if (true_word_char(*doc)) break; if (true_word_char(*doc)) break;
if (*doc == FTB_LBR || *doc == FTB_RBR) if (*doc == FTB_RQUOT && param->quot) {
param->quot=doc-1;
*start=doc+1;
return 3; /* FTB_RBR */
}
if ((*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
&& !param->quot)
{ {
/* param->prev=' '; */ /* param->prev=' '; */
*start=doc+1; *start=doc+1;
if (*doc == FTB_LQUOT) param->quot=*start;
return (*doc == FTB_RBR)+2; return (*doc == FTB_RBR)+2;
} }
if (param->prev == ' ') if (param->prev == ' ' && !param->quot)
{ {
if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
...@@ -149,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) ...@@ -149,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; } if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
} }
param->prev=*doc; param->prev=*doc;
param->yesno=param->plusminus=param->pmsign=0; param->yesno=(param->quot != 0);
param->plusminus=param->pmsign=0;
} }
mwc=0; mwc=0;
......
...@@ -95,6 +95,8 @@ extern ulong collstat; ...@@ -95,6 +95,8 @@ extern ulong collstat;
#define FTB_RBR (ft_boolean_syntax[6]) #define FTB_RBR (ft_boolean_syntax[6])
#define FTB_NEG (ft_boolean_syntax[7]) #define FTB_NEG (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8]) #define FTB_TRUNC (ft_boolean_syntax[8])
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])
typedef struct st_ft_word { typedef struct st_ft_word {
byte * pos; byte * pos;
...@@ -111,6 +113,7 @@ typedef struct st_ftb_param { ...@@ -111,6 +113,7 @@ typedef struct st_ftb_param {
int plusminus; int plusminus;
bool pmsign; bool pmsign;
bool trunc; bool trunc;
byte *quot;
} FTB_PARAM; } FTB_PARAM;
int is_stopword(char *word, uint len); int is_stopword(char *word, uint len);
...@@ -132,7 +135,7 @@ uint _mi_ft_segiterator(FT_SEG_ITERATOR *); ...@@ -132,7 +135,7 @@ uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
void ft_parse_init(TREE *, CHARSET_INFO *); void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, byte *, int); int ft_parse(TREE *, byte *, int);
FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *); FT_WORD * ft_linearize(TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *); FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record); uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record);
......
...@@ -67,6 +67,9 @@ Full-text indexes are called collections 1 ...@@ -67,6 +67,9 @@ Full-text indexes are called collections 1
Only MyISAM tables support collections 2 Only MyISAM tables support collections 2
Function MATCH ... AGAINST() is used to do a search 0 Function MATCH ... AGAINST() is used to do a search 0
Full-text search in MySQL implements vector space model 0 Full-text search in MySQL implements vector space model 0
select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
a b
MySQL has now support for full-text search
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE); select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
a b a b
Full-text search in MySQL implements vector space model Full-text search in MySQL implements vector space model
......
...@@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections"); ...@@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
# UNION of fulltext's # UNION of fulltext's
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes"); select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
# boolean search # boolean search
select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE); select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE);
...@@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN ...@@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN
select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1; select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1;
select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1; select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1;
select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
# boolean w/o index: # boolean w/o index:
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE); select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment