Commit 9e29bb4e authored by unknown

ft boolean search by table scan; queue_fix()


include/queues.h:
  queue_fix() introduced
myisam/ft_boolean_search.c:
  ft boolean search by table scan
myisam/ft_parser.c:
  ft boolean search by table scan
myisam/ft_update.c:
  ft boolean search by table scan
myisam/ftdefs.h:
  ft boolean search by table scan
mysql-test/r/fulltext_cache.result:
  ft boolean search by table scan
mysql-test/t/fulltext_cache.test:
  ft boolean search by table scan
mysys/queues.c:
  queue_fix() introduced
parent 36a4cc17
......@@ -53,6 +53,7 @@ void delete_queue(QUEUE *queue);
void queue_insert(QUEUE *queue,byte *element);
byte *queue_remove(QUEUE *queue,uint idx);
void _downheap(QUEUE *queue,uint idx);
void queue_fix(QUEUE *queue);
#define is_queue_inited(queue) ((queue)->root != 0)
#ifdef __cplusplus
......
......@@ -16,6 +16,8 @@
/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
/* TODO: add caching - pre-read several index entries at once */
#define FT_CORE
#include "ftdefs.h"
#include <queues.h>
......@@ -78,7 +80,7 @@ typedef struct st_ft_info {
struct _ft_vft *please;
MI_INFO *info;
uint keynr;
int ok;
enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE, SCAN } state;
FTB_EXPR *root;
QUEUE queue;
MEM_ROOT mem_root;
......@@ -101,13 +103,9 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
FT_WORD w;
FTB_WORD *ftbw;
FTB_EXPR *ftbe;
MI_INFO *info=ftb->info;
int r;
MI_KEYDEF *keyinfo=info->s->keyinfo+ftb->keynr;
my_off_t keyroot=info->s->state.key_root[ftb->keynr];
uint extra=HA_FT_WLEN+info->s->rec_reflength; /* just a shortcut */
uint extra=HA_FT_WLEN+ftb->info->s->rec_reflength; /* just a shortcut */
if (! ftb->ok)
if (ftb->state != UNINITIALIZED)
return;
param.prev=' ';
......@@ -132,7 +130,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
case 1:
ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root,
sizeof(FTB_WORD) + (param.trunc ? MI_MAX_KEY_BUFF : w.len+extra));
ftbw->len=w.len + !param.trunc;
ftbw->len=w.len+1;
ftbw->yesno=param.yesno;
ftbw->trunc=param.trunc; /* 0 or 1 */
ftbw->weight=weight;
......@@ -142,37 +140,55 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
memcpy(ftbw->word+1, w.pos, w.len);
ftbw->word[0]=w.len;
if (ftbw->yesno > 0) up->ythresh++;
/*****************************************/
r=_mi_search(info, keyinfo, ftbw->word, ftbw->len,
SEARCH_FIND | SEARCH_PREFIX, keyroot);
if (!r)
{
r=_mi_compare_text(default_charset_info,
info->lastkey+ftbw->trunc,ftbw->len,
ftbw->word+ftbw->trunc,ftbw->len,0);
}
if (r) /* not found */
{
if (ftbw->yesno>0 && ftbw->up->up==0)
{ /* this word MUST BE present in every document returned,
so we can abort the search right now */
ftb->ok=0;
return;
}
}
else
{
memcpy(ftbw->word, info->lastkey, info->lastkey_length);
ftbw->docid=info->lastpos;
queue_insert(& ftb->queue, (byte *)ftbw);
}
/*****************************************/
queue_insert(& ftb->queue, (byte *)ftbw);
break;
}
}
return;
}
/*
  Prepare a boolean search handle for index-driven reading.

  Does nothing unless the handle is in state READY.  Otherwise the state
  becomes INDEX_SEARCH and every word currently stored in the queue is
  looked up in the fulltext index with _mi_search().  A word counts as
  found only when the search succeeds AND the key it stopped on compares
  equal to the word (the leading byte is skipped for truncated words).

  For a found word, the index key is copied over the word buffer and the
  word's docid is set to the position the index reported.  When a word
  that must be present (yesno > 0, directly under the root expression) is
  not found, the state is set to INDEX_DONE and the function returns
  without re-ordering the queue.  After all words were processed the
  queue is re-ordered with queue_fix().
*/
void _ftb_init_index_search(FT_INFO *ftb)
{
  MI_INFO *mi=ftb->info;
  MI_KEYDEF *keydef=mi->s->keyinfo+ftb->keynr;
  my_off_t root_pos=mi->s->state.key_root[ftb->keynr];
  FTB_WORD *word;
  int slot, miss;

  if (ftb->state != READY)
    return;
  ftb->state=INDEX_SEARCH;

  /* queue slots are 1-based; walk them from the last one down to the first */
  slot=ftb->queue.elements;
  while (slot)
  {
    word=(FTB_WORD *)(ftb->queue.root[slot]);
    slot--;

    miss=_mi_search(mi, keydef, word->word, word->len,
                    SEARCH_FIND | SEARCH_PREFIX, root_pos);
    if (!miss)
    {
      miss=_mi_compare_text(default_charset_info,
                            mi->lastkey+word->trunc, word->len-word->trunc,
                            word->word+word->trunc, word->len-word->trunc, 0);
    }

    if (!miss)
    {
      /* found: remember the full index key and the row it points to */
      memcpy(word->word, mi->lastkey, mi->lastkey_length);
      word->docid=mi->lastpos;
      continue;
    }

    /* not found */
    if (word->yesno>0 && word->up->up==0)
    {
      /* this word MUST BE present in every document returned,
         so we can abort the search right now */
      ftb->state=INDEX_DONE;
      return;
    }
  }

  queue_fix(& ftb->queue);
}
FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
uint query_len, my_bool presort __attribute__((unused)))
{
......@@ -183,7 +199,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
return 0;
ftb->please=& _ft_vft_boolean;
ftb->ok=1;
ftb->state=UNINITIALIZED;
ftb->info=info;
ftb->keynr=keynr;
......@@ -202,9 +218,63 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
ftbe->docid=HA_POS_ERROR;
ftb->root=ftbe;
_ftb_parse_query(ftb, &query, query+query_len, ftbe, 0, 0);
ftb->state=READY;
return ftb;
}
/*
  Propagate the fact "word ftbw occurs in document curdoc" up the chain of
  enclosing expressions, starting at ftbw->up and following ->up links.

  At every expression visited, the per-document counters (cur_weight,
  yesses, nos) are reset first if the expression was last touched for a
  different document.  What happens next depends on the sign carried up
  from below (initially the word's own yesno):

    > 0  the weight is added and yesses is incremented; climbing goes on
         only if the expression is now satisfied (yesses >= ythresh and no
         nos), carrying the expression's own yesno and its accumulated
         weight scaled by the expression's weight.
    < 0  nos is incremented and climbing stops.
    = 0  if the expression is already satisfied, its cur_weight is set to
         the incoming weight and climbing goes on with that weight scaled
         by the expression's weight; otherwise the weight is added and
         climbing stops.
*/
void _ftb_climb_the_tree(FTB_WORD *ftbw, my_off_t curdoc)
{
  FTB_EXPR *node=ftbw->up;
  float carried=ftbw->weight;
  int sign=ftbw->yesno;
  int satisfied;

  while (node)
  {
    if (node->docid != curdoc)
    {
      node->cur_weight=node->yesses=node->nos=0;
      node->docid=curdoc;
    }

    if (sign<0)
    {
      /* NOTE: special sort function of queue assures that all yn<0
       *       events for every particular subexpression will
       *       "auto-magically" happen BEFORE all yn>=0 events. So no
       *       already matched expression can become not-matched again.
       */
      ++node->nos;
      break;
    }

    if (sign>0)
    {
      node->cur_weight+=carried;
      satisfied=(++node->yesses >= node->ythresh && !node->nos);
      if (!satisfied)
        break;
      sign=node->yesno;
      carried=node->cur_weight*node->weight;
    }
    else
    {
      /* sign == 0 */
      satisfied=(node->yesses >= node->ythresh && !node->nos);
      if (!satisfied)
      {
        node->cur_weight+=carried;
        break;
      }
      sign=node->yesno;
      node->cur_weight=carried;
      carried*=node->weight;
    }

    node=node->up;
  }
}
int ft_boolean_read_next(FT_INFO *ftb, char *record)
{
FTB_EXPR *ftbe, *up;
......@@ -215,6 +285,9 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
my_off_t curdoc;
int r;
if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
return -1;
/* black magic ON */
if ((int) _mi_check_index(info, ftb->keynr) < 0)
return my_errno;
......@@ -225,66 +298,21 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
if (!ftb->queue.elements)
return my_errno=HA_ERR_END_OF_FILE;
while(ftb->ok &&
while(ftb->state == INDEX_SEARCH &&
(curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid) != HA_POS_ERROR)
{
while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid)
{
float weight=ftbw->weight;
int yn=ftbw->yesno;
for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
{
if (ftbe->docid != curdoc)
{
ftbe->cur_weight=ftbe->yesses=ftbe->nos=0;
ftbe->docid=curdoc;
}
if (yn>0)
{
ftbe->cur_weight+=weight;
if (++ftbe->yesses >= ftbe->ythresh && !ftbe->nos)
{
yn=ftbe->yesno;
weight=ftbe->cur_weight*ftbe->weight;
}
else
break;
}
else
if (yn<0)
{
/* NOTE: special sort function of queue assures that all yn<0
* events for every particular subexpression will
* "auto-magically" happen BEFORE all yn>=0 events. So no
* already matched expression can become not-matched again.
*/
++ftbe->nos;
break;
}
else
/* if (yn==0) */
{
if (ftbe->yesses >= ftbe->ythresh && !ftbe->nos)
{
yn=ftbe->yesno;
ftbe->cur_weight=weight;
weight*=ftbe->weight;
}
else
{
ftbe->cur_weight+=weight;
break;
}
}
}
_ftb_climb_the_tree(ftbw, curdoc);
/* update queue */
r=_mi_search(info, keyinfo, ftbw->word, USE_WHOLE_KEY, /*ftbw->len,*/
r=_mi_search(info, keyinfo, ftbw->word, USE_WHOLE_KEY,
SEARCH_BIGGER , keyroot);
if (!r)
{
r=_mi_compare_text(default_charset_info,
info->lastkey+ftbw->trunc,ftbw->len,
ftbw->word+ftbw->trunc,ftbw->len,0);
info->lastkey+ftbw->trunc,ftbw->len-ftbw->trunc,
ftbw->word+ftbw->trunc,ftbw->len-ftbw->trunc,0);
}
if (r) /* not found */
{
......@@ -292,7 +320,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
if (ftbw->yesno>0 && ftbw->up->up==0)
{ /* this word MUST BE present in every document returned,
so we can stop the search right now */
ftb->ok=0;
ftb->state=INDEX_DONE;
}
}
else
......@@ -304,7 +332,8 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
}
ftbe=ftb->root;
if (ftbe->cur_weight>0 && ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
if (ftbe->docid==curdoc && ftbe->cur_weight>0 &&
ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
{
/* curdoc matched ! */
info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); /* why is this ? */
......@@ -321,10 +350,56 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
return my_errno=HA_ERR_END_OF_FILE;
}
float ft_boolean_find_relevance(FT_INFO *ftb,
my_off_t docid __attribute__((unused)), byte *record)
/*
  Compute the boolean-search relevance of a single row by parsing the row
  itself rather than walking the fulltext index.

  ftb     search handle; must be in state READY (first call) or SCAN
  docid   id of the row being evaluated; used to tag expression nodes
  record  row image handed to _mi_ft_parse()

  Returns the root expression's cur_weight when the row matches, 0.0 when
  it does not, and -1.0 when the handle is in any other state or when
  _mi_ft_parse() reports failure.

  NOTE(review): the unconditional "return -1.0" directly after the opening
  brace makes everything below it unreachable as the text stands; it looks
  like the old stub body left over from before this change - confirm and
  remove.
*/
float ft_boolean_find_relevance(FT_INFO *ftb, my_off_t docid, byte *record)
{
return -1.0; /* to be done via str scan */
TREE ptree;
FT_WORD word;
FTB_WORD *ftbw;
FTB_EXPR *ftbe;
uint i;
/* first call in scan mode: order the word queue once, then stay in SCAN */
if (ftb->state == READY)
{
queue_fix(& ftb->queue);
ftb->state=SCAN;
}
else if (ftb->state != SCAN)
return -1.0;
/* collect the words of this row into a fresh tree */
bzero(&ptree, sizeof(ptree));
if (_mi_ft_parse(& ptree, ftb->info, ftb->keynr, record))
return -1.0;
/* queue slots are 1-based; probe the row's word tree for every query word */
for (i=1; i<=ftb->queue.elements; i++)
{
ftbw=(FTB_WORD *)(ftb->queue.root[i]);
/* hand the word's trunc flag to the tree via custom_arg -
   presumably read by the tree's compare function; confirm */
ptree.custom_arg=(void *)(ftbw->trunc);
/* word[0] holds the length byte, so the text starts at word+1 */
word.pos=ftbw->word+1;
word.len=ftbw->len-1;
if (tree_search(& ptree, & word))
{ /* found! */
_ftb_climb_the_tree(ftbw, docid);
}
else
{ /* not found! */
if (ftbw->yesno>0 && ftbw->up->up==0)
{ /* but this word MUST BE present in every document matched,
so we can stop the search right now */
break;
}
}
}
delete_tree(& ptree);
/* the row matches only if the root was updated for this docid and is satisfied */
ftbe=ftb->root;
if (ftbe->docid==docid && ftbe->cur_weight>0 &&
ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
{ /* row matched ! */
return ftbe->cur_weight;
}
else
{ /* match failed ! */
return 0.0;
}
}
void ft_boolean_close_search(FT_INFO *ftb)
......@@ -345,6 +420,6 @@ my_off_t ft_boolean_get_docid(FT_INFO *ftb)
/*
  Re-initialize a boolean search handle.

  Writes a trace line to stderr and then delegates to
  _ftb_init_index_search(ftb).

  NOTE(review): the stderr trace looks like leftover debugging output -
  confirm whether it is meant to stay.
*/
void ft_boolean_reinit_search(FT_INFO *ftb)
{
fprintf(stderr, "ft_boolean_reinit_search called!\n");
_ftb_init_index_search(ftb);
}
......@@ -33,17 +33,16 @@ typedef struct st_ft_docstat {
double max, nsum, nsum2;
#endif /* EVAL_RUN */
MI_INFO *info;
uint keynr;
byte *keybuf;
// MI_INFO *info;
// uint keynr;
// byte *keybuf;
} FT_DOCSTAT;
static int FT_WORD_cmp(void* cmp_arg __attribute__((unused)),
FT_WORD *w1, FT_WORD *w2)
static int FT_WORD_cmp(void* cmp_arg, FT_WORD *w1, FT_WORD *w2)
{
return _mi_compare_text(default_charset_info,
(uchar*) w1->pos,w1->len,
(uchar*) w2->pos, w2->len,0);
(uchar*) w2->pos, w2->len,(my_bool)cmp_arg);
}
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
......@@ -64,7 +63,9 @@ static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
/* transforms tree of words into the array, applying normalization */
FT_WORD * ft_linearize(MI_INFO *info, uint keynr, byte *keybuf, TREE *wtree)
FT_WORD * ft_linearize(//MI_INFO *info, uint keynr,
//byte *keybuf,
TREE *wtree)
{
FT_WORD *wlist,*p;
FT_DOCSTAT docstat;
......@@ -73,9 +74,9 @@ FT_WORD * ft_linearize(MI_INFO *info, uint keynr, byte *keybuf, TREE *wtree)
if ((wlist=(FT_WORD *) my_malloc(sizeof(FT_WORD)*
(1+wtree->elements_in_tree),MYF(0))))
{
docstat.info=info;
docstat.keynr=keynr;
docstat.keybuf=keybuf;
// docstat.info=info;
// docstat.keynr=keynr;
// docstat.keybuf=keybuf;
docstat.list=wlist;
docstat.uniq=wtree->elements_in_tree;
#ifdef EVAL_RUN
......@@ -207,19 +208,6 @@ byte ft_simple_get_word(byte **start, byte *end, FT_WORD *word)
return 0;
}
/*
  Tell whether a query string looks like a boolean-mode query.

  q    query text; only the first len bytes are examined
  len  number of bytes in q

  Returns 1 if the query starts with FTB_YES or FTB_NO, or if one of those
  operators appears right after a space and right before a word character.
  Returns 0 otherwise, including for an empty query.
*/
int is_boolean(byte *q, uint len)
{
  byte *last;

  if (!len)
    return 0;
  if (*q == FTB_YES || *q == FTB_NO)
    return 1;

  /*
    An operator only counts when a word character follows it, so the final
    byte of the query can never qualify.  Stopping one byte early keeps the
    q[1] look-ahead inside the buffer.
  */
  last=q+len-1;
  for (++q; q < last; ++q)
  {
    if ((*q == FTB_YES || *q == FTB_NO) && q[-1] == ' ' && true_word_char(q[1]))
      return 1;
  }
  return 0;
}
TREE * ft_parse(TREE *wtree, byte *doc, int doclen)
{
byte *end=doc+doclen;
......
......@@ -29,17 +29,12 @@
/* parses a document i.e. calls _mi_ft_parse for every keyseg */
FT_WORD * _mi_ft_parserecord(MI_INFO *info, uint keynr, byte *keybuf,
const byte *record)
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
{
TREE *parsed, ptree;
MI_KEYSEG *keyseg;
byte *pos;
uint i;
MI_KEYSEG *keyseg=info->s->keyinfo[keynr].seg;
bzero(parsed=&ptree, sizeof(ptree));
keyseg=info->s->keyinfo[keynr].seg;
for (i=info->s->keyinfo[keynr].keysegs-FT_SEGS ; i-- ; )
{
uint len;
......@@ -62,13 +57,26 @@ FT_WORD * _mi_ft_parserecord(MI_INFO *info, uint keynr, byte *keybuf,
}
else
len=keyseg->length;
if (!(parsed=ft_parse(parsed, pos, len)))
return NULL;
if (!(ft_parse(parsed, pos, len)))
return 1;
}
/* Handle the case where all columns are NULL */
if (!is_tree_inited(parsed) && !(parsed=ft_parse(parsed, (byte*) "", 0)))
if (!is_tree_inited(parsed) && !(ft_parse(parsed, (byte*) "", 0)))
return 1;
else
return 0;
}
FT_WORD * _mi_ft_parserecord(MI_INFO *info, uint keynr, byte *keybuf,
const byte *record)
{
TREE ptree;
bzero(&ptree, sizeof(ptree));
if (_mi_ft_parse(& ptree, info, keynr, record))
return NULL;
return ft_linearize(info, keynr, keybuf, parsed);
return ft_linearize(/*info, keynr, keybuf, */ & ptree);
}
static int _mi_ft_store(MI_INFO *info, uint keynr, byte *keybuf,
......@@ -158,7 +166,7 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
}
/* update a document entry */
int _mi_ft_update(MI_INFO *info, uint keynr, byte *keybuf,
int _mi_ft_update(MI_INFO *info, uint keynr, byte *keybuf,
const byte *oldrec, const byte *newrec, my_off_t pos)
{
int error= -1;
......
......@@ -120,8 +120,8 @@ byte ft_get_word(byte **, byte *, FT_WORD *, FTB_PARAM *);
byte ft_simple_get_word(byte **, byte *, FT_WORD *);
TREE * ft_parse(TREE *, byte *, int);
FT_WORD * ft_linearize(MI_INFO *, uint, byte *, TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint , byte *, const byte *);
FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
const struct _ft_vft _ft_vft_nlq;
FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, my_bool);
......
......@@ -21,8 +21,8 @@ INSERT INTO t2 VALUES (5,2,'um copo de Vodka');
INSERT INTO t2 VALUES (6,2,'um chocolate Snickers');
INSERT INTO t2 VALUES (7,1,'Bife');
INSERT INTO t2 VALUES (8,1,'Pizza de Salmao');
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi') as x FROM t1, t2
WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi')
as x FROM t1, t2 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
q item id x
aaaaaaaaa dsaass de sushi 1 1.92378664016724
aaaaaaaaa dsaass de Bolo de Chocolate 2 0
......@@ -32,8 +32,19 @@ ssde df s fsda sad er um copo de Vodka 5 0
ssde df s fsda sad er um chocolate Snickers 6 0
aaaaaaaaa dsaass de Bife 7 0
aaaaaaaaa dsaass de Pizza de Salmao 8 0
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi') as x FROM t2, t1
WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi' IN BOOLEAN MODE)
as x FROM t1, t2 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
q item id x
aaaaaaaaa dsaass de sushi 1 1
aaaaaaaaa dsaass de Bolo de Chocolate 2 0
aaaaaaaaa dsaass de Feijoada 3 0
aaaaaaaaa dsaass de Mousse de Chocolate 4 0
ssde df s fsda sad er um copo de Vodka 5 0
ssde df s fsda sad er um chocolate Snickers 6 0
aaaaaaaaa dsaass de Bife 7 0
aaaaaaaaa dsaass de Pizza de Salmao 8 0
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi')
as x FROM t2, t1 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
q item id x
aaaaaaaaa dsaass de sushi 1 1.92378664016724
aaaaaaaaa dsaass de Bolo de Chocolate 2 0
......@@ -43,4 +54,15 @@ ssde df s fsda sad er um copo de Vodka 5 0
ssde df s fsda sad er um chocolate Snickers 6 0
aaaaaaaaa dsaass de Bife 7 0
aaaaaaaaa dsaass de Pizza de Salmao 8 0
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi' IN BOOLEAN MODE)
as x FROM t2, t1 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
q item id x
aaaaaaaaa dsaass de sushi 1 1
aaaaaaaaa dsaass de Bolo de Chocolate 2 0
aaaaaaaaa dsaass de Feijoada 3 0
aaaaaaaaa dsaass de Mousse de Chocolate 4 0
ssde df s fsda sad er um copo de Vodka 5 0
ssde df s fsda sad er um chocolate Snickers 6 0
aaaaaaaaa dsaass de Bife 7 0
aaaaaaaaa dsaass de Pizza de Salmao 8 0
drop table t1, t2;
......@@ -26,10 +26,16 @@ INSERT INTO t2 VALUES (6,2,'um chocolate Snickers');
INSERT INTO t2 VALUES (7,1,'Bife');
INSERT INTO t2 VALUES (8,1,'Pizza de Salmao');
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi') as x FROM t1, t2
WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi')
as x FROM t1, t2 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi') as x FROM t2, t1
WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi' IN BOOLEAN MODE)
as x FROM t1, t2 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi')
as x FROM t2, t1 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
SELECT t1.q, t2.item, t2.id, MATCH t2.item AGAINST ('sushi' IN BOOLEAN MODE)
as x FROM t2, t1 WHERE (t2.id2 = t1.id) ORDER BY x DESC,t2.id;
drop table t1, t2;
......@@ -124,7 +124,6 @@ byte *queue_remove(register QUEUE *queue, uint idx)
}
}
/* Fix when element on top has been replaced */
#ifndef queue_replaced
......@@ -166,3 +165,19 @@ void _downheap(register QUEUE *queue, uint idx)
}
queue->root[idx]=element;
}
/*
  Comparison callback handed to qsort2() by queue_fix().

  queue  the queue being fixed (arrives as qsort2's extra argument)
  a, b   addresses of two slots of the array being sorted; every slot
         holds a pointer to a queue element, so the slot has to be
         dereferenced to reach the element itself

  Returns whatever queue->compare returns for the keys of the two
  elements, each key located offset_to_key bytes into its element.
*/
static int queue_fix_cmp(QUEUE *queue, void *a, void *b)
{
  void *elem_a= *(void **) a;
  void *elem_b= *(void **) b;

  /* byte-wise offsetting is done on char *, arithmetic on void * is not ISO C */
  return queue->compare(queue->first_cmp_arg,
                        (void *) ((char *) elem_a + queue->offset_to_key),
                        (void *) ((char *) elem_b + queue->offset_to_key));
}
/*
  Fix heap when every element was changed.

  Re-establishes queue order after the keys of elements were modified in
  place: the used slots root[1] .. root[elements] are sorted with qsort2(),
  using queue_fix_cmp() as the callback and the queue itself as the
  callback's extra argument.
*/
void queue_fix(QUEUE *queue)
{
qsort2(queue->root+1,queue->elements, sizeof(void *),
(qsort2_cmp)queue_fix_cmp, queue);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment