Commit b820d2a3 authored by serg@serg.mylan

query expansion for fulltext search

parent df693095
......@@ -51,18 +51,18 @@ extern const char *ft_precompiled_stopwords[];
extern ulong ft_min_word_len;
extern ulong ft_max_word_len;
extern ulong ft_max_word_len_for_sort;
extern ulong ft_query_expansion_limit;
extern const char *ft_boolean_syntax;
int ft_init_stopwords(void);
void ft_free_stopwords(void);
#define FT_NL 0 /* this MUST be 0, see ft_init_search() */
#define FT_BOOL 1 /* this MUST be 1, see ft_init_search() */
#define FT_NL 0
#define FT_BOOL 1
#define FT_SORTED 2
#define FT_EXPAND 4 /* query expansion */
FT_INFO *ft_init_search(uint,void *, uint, byte *, uint);
FT_INFO *ft_init_search(uint,void *, uint, byte *, uint, byte *);
#ifdef __cplusplus
}
......
......@@ -20,8 +20,6 @@
#define FT_CORE
#include "ftdefs.h"
#include <queues.h>
#include <assert.h> /* for DBUG_ASSERT() */
/* search with boolean queries */
......@@ -340,8 +338,7 @@ static void _ftb_init_index_search(FT_INFO *ftb)
FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
uint query_len,
uint flags __attribute__((unused)))
uint query_len)
{
FTB *ftb;
FTB_EXPR *ftbe;
......
......@@ -167,17 +167,28 @@ static int walk_and_copy(FT_SUPERDOC *from,
DBUG_RETURN(0);
}
/*
  tree_walk() action used by query expansion.

  Finalizes the weight of one matched document and pushes its FT_DOC part
  onto the queue of best matches.

  from   document node visited by tree_walk()
  count  unused
  best   queue collecting the documents used for expansion

  Always returns 0.
*/
static int walk_and_push(FT_SUPERDOC *from,
                         uint32 count __attribute__((unused)), QUEUE *best)
{
  DBUG_ENTER("walk_and_push");
  from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
  /*
    Clamp the element count to limit-1 before inserting, so the queue is
    not overfilled by the insert below (assumes set_if_smaller(a,b)
    lowers a to b when a > b -- confirm against its definition).
  */
  set_if_smaller(best->elements, ft_query_expansion_limit-1);
  queue_insert(best, (byte *)& from->doc);
  DBUG_RETURN(0);
}
/*
  Comparator for FT_DOC entries, ordering by weight.

  The operands are swapped on purpose (b - a): assuming sgn() yields the
  sign of its argument, a document with a larger weight compares as
  "smaller" and therefore comes first.

  The three-argument form takes a leading context pointer that is never
  read.
*/
static int FT_DOC_cmp(FT_DOC *a, FT_DOC *b)
static int FT_DOC_cmp(void *unused __attribute__((unused)),
FT_DOC *a, FT_DOC *b)
{
return sgn(b->weight - a->weight);
}
FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
uint query_len, uint flags)
uint query_len, uint flags, byte *record)
{
TREE allocated_wtree, *wtree=&allocated_wtree;
TREE wtree;
ALL_IN_ONE aio;
FT_DOC *dptr;
FT_INFO *dlist=NULL;
......@@ -196,24 +207,47 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
aio.charset=info->s->keyinfo[keynr].seg->charset;
aio.keybuff=info->lastkey+info->s->base.max_key_length;
bzero(&allocated_wtree,sizeof(allocated_wtree));
bzero(&wtree,sizeof(wtree));
init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
NULL, NULL);
ft_parse_init(&allocated_wtree, aio.charset);
if (ft_parse(&allocated_wtree,query,query_len))
ft_parse_init(&wtree, aio.charset);
if (ft_parse(&wtree,query,query_len,0))
goto err;
if (tree_walk(wtree, (tree_walk_action)&walk_and_match, &aio,
if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
left_root_right))
goto err2;
goto err;
if (flags & FT_EXPAND && ft_query_expansion_limit)
{
QUEUE best;
init_queue(&best,ft_query_expansion_limit,0,0, &FT_DOC_cmp, 0);
tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
&best, left_root_right);
while (best.elements)
{
my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
if (!(*info->read_record)(info,docid,record))
{
info->update|= HA_STATE_AKTIV;
_mi_ft_parse(&wtree, info, keynr, record,1);
}
}
delete_queue(&best);
reset_tree(&aio.dtree);
if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
left_root_right))
goto err;
}
dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
sizeof(FT_DOC)*(aio.dtree.elements_in_tree-1),
MYF(0));
if(!dlist)
goto err2;
if (!dlist)
goto err;
dlist->please= (struct _ft_vft *) & _ft_vft_nlq;
dlist->ndocs=aio.dtree.elements_in_tree;
......@@ -225,13 +259,11 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
&dptr, left_root_right);
if (flags & FT_SORTED)
qsort(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort_cmp)&FT_DOC_cmp);
err2:
delete_tree(wtree);
delete_tree(&aio.dtree);
qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort2_cmp)&FT_DOC_cmp, 0);
err:
delete_tree(&aio.dtree);
delete_tree(&wtree);
info->lastpos=saved_lastpos;
DBUG_RETURN(dlist);
}
......
......@@ -183,7 +183,7 @@ void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
DBUG_VOID_RETURN;
}
int ft_parse(TREE *wtree, byte *doc, int doclen)
int ft_parse(TREE *wtree, byte *doc, int doclen, my_bool with_alloc)
{
byte *end=doc+doclen;
FT_WORD w;
......@@ -191,6 +191,15 @@ int ft_parse(TREE *wtree, byte *doc, int doclen)
while (ft_simple_get_word(wtree->custom_arg, &doc,end,&w))
{
if (with_alloc)
{
byte *ptr;
/* allocating the data in the tree - to avoid mallocs and frees */
DBUG_ASSERT(wtree->with_delete==0);
ptr=(byte *)alloc_root(& wtree->mem_root,w.len);
memcpy(ptr, w.pos, w.len);
w.pos=ptr;
}
if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
goto err;
}
......
......@@ -20,7 +20,7 @@
ulong ft_min_word_len=4;
ulong ft_max_word_len=HA_FT_MAXLEN;
ulong ft_max_word_len_for_sort=20;
ulong ft_query_expansion_limit=5;
const char *ft_boolean_syntax="+ -><()~*:\"\"&|";
const HA_KEYSEG ft_keysegs[FT_SEGS]={
......@@ -53,14 +53,13 @@ const struct _ft_vft _ft_vft_boolean = {
ft_boolean_get_relevance, ft_boolean_reinit_search
};
FT_INFO *(*_ft_init_vft[2])(MI_INFO *, uint, byte *, uint, uint) =
{ ft_init_nlq_search, ft_init_boolean_search };
/*
  Entry point for starting a full-text search.

  flags      FT_BOOL selects the boolean search; otherwise the
             natural-language search is started and flags are passed
             through to it.
  info       handler object, used as MI_INFO *
  keynr      number of the full-text key to search
  query      query text
  query_len  length of query in bytes
  record     buffer forwarded to the natural-language search only

  Returns the FT_INFO produced by the selected initializer. The result
  must be returned explicitly: falling off the end of a non-void function
  leaves the caller with an indeterminate pointer.
*/
FT_INFO *ft_init_search(uint flags, void *info, uint keynr,
                        byte *query, uint query_len, byte *record)
{
  if (flags & FT_BOOL)
    return ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len);
  return ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags,
                            record);
}
const char *ft_stopword_file = 0;
......
......@@ -97,7 +97,8 @@ uint _mi_ft_segiterator(register FT_SEG_ITERATOR *ftsi)
/* parses a document i.e. calls ft_parse for every keyseg */
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr,
const byte *record, my_bool with_alloc)
{
FT_SEG_ITERATOR ftsi;
DBUG_ENTER("_mi_ft_parse");
......@@ -108,7 +109,7 @@ uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
while (_mi_ft_segiterator(&ftsi))
{
if (ftsi.pos)
if (ft_parse(parsed, (byte *)ftsi.pos, ftsi.len))
if (ft_parse(parsed, (byte *)ftsi.pos, ftsi.len, with_alloc))
DBUG_RETURN(1);
}
DBUG_RETURN(0);
......@@ -120,7 +121,7 @@ FT_WORD * _mi_ft_parserecord(MI_INFO *info, uint keynr, const byte *record)
DBUG_ENTER("_mi_ft_parserecord");
bzero((char*) &ptree, sizeof(ptree));
if (_mi_ft_parse(&ptree, info, keynr, record))
if (_mi_ft_parse(&ptree, info, keynr, record,0))
DBUG_RETURN(NULL);
DBUG_RETURN(ft_linearize(&ptree));
......
......@@ -21,11 +21,15 @@
#include "fulltext.h"
#include <m_ctype.h>
#include <my_tree.h>
#include <queues.h>
#include <assert.h>
#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_')
#define misc_word_char(X) ((X)=='\'')
#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X))
#define FT_MAX_WORD_LEN_FOR_SORT 20
#define COMPILE_STOPWORDS_IN
/* Interested readers may consult SMART
......@@ -122,13 +126,15 @@ void _mi_ft_segiterator_dummy_init(const byte *, uint, FT_SEG_ITERATOR *);
uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, byte *, int);
int ft_parse(TREE *, byte *, int, my_bool);
FT_WORD * ft_linearize(TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const byte *);
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record);
uint _mi_ft_parse(TREE *, MI_INFO *, uint, const byte *, my_bool);
FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, uint, byte *);
FT_INFO *ft_init_boolean_search(MI_INFO *, uint, byte *, uint);
extern const struct _ft_vft _ft_vft_nlq;
FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, uint);
int ft_nlq_read_next(FT_INFO *, char *);
float ft_nlq_find_relevance(FT_INFO *, byte *, uint);
void ft_nlq_close_search(FT_INFO *);
......@@ -137,10 +143,10 @@ my_off_t ft_nlq_get_docid(FT_INFO *);
void ft_nlq_reinit_search(FT_INFO *);
extern const struct _ft_vft _ft_vft_boolean;
FT_INFO *ft_init_boolean_search(MI_INFO *, uint, byte *, uint, uint);
int ft_boolean_read_next(FT_INFO *, char *);
float ft_boolean_find_relevance(FT_INFO *, byte *, uint);
void ft_boolean_close_search(FT_INFO *);
float ft_boolean_get_relevance(FT_INFO *);
my_off_t ft_boolean_get_docid(FT_INFO *);
void ft_boolean_reinit_search(FT_INFO *);
......@@ -20,7 +20,6 @@
#include <m_ctype.h>
#include <stdarg.h>
#include <my_getopt.h>
#include <assert.h>
#ifdef HAVE_SYS_VADVISE_H
#include <sys/vadvise.h>
#endif
......@@ -1955,11 +1954,11 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
if (sort_param.keyinfo->flag & HA_FULLTEXT)
{
sort_info.max_records=
(ha_rows) (sort_info.filelength/ft_max_word_len_for_sort+1);
(ha_rows) (sort_info.filelength/FT_MAX_WORD_LEN_FOR_SORT+1);
sort_param.key_read=sort_ft_key_read;
sort_param.key_write=sort_ft_key_write;
sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXLEN;
sort_param.key_length+=FT_MAX_WORD_LEN_FOR_SORT-HA_FT_MAXLEN;
}
else
{
......@@ -2350,7 +2349,7 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
total_key_length+=sort_param[i].key_length;
if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
sort_param[i].key_length+=ft_max_word_len_for_sort-ft_max_word_len;
sort_param[i].key_length+=FT_MAX_WORD_LEN_FOR_SORT-ft_max_word_len;
}
sort_info.total_keys=i;
sort_param[0].master= 1;
......@@ -3875,7 +3874,7 @@ static my_bool mi_too_big_key_for_sort(MI_KEYDEF *key, ha_rows rows)
{
uint key_maxlength=key->maxlength;
if (key->flag & HA_FULLTEXT)
key_maxlength+=ft_max_word_len_for_sort-HA_FT_MAXLEN;
key_maxlength+=FT_MAX_WORD_LEN_FOR_SORT-HA_FT_MAXLEN;
return (key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT) &&
((ulonglong) rows * key_maxlength >
(ulonglong) myisam_max_temp_length));
......
......@@ -314,9 +314,6 @@ static struct my_option my_long_options[] =
{ "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", (gptr*) &ft_max_word_len,
(gptr*) &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXLEN, 10,
HA_FT_MAXLEN, 0, 1, 0},
{ "ft_max_word_len_for_sort", OPT_FT_MAX_WORD_LEN_FOR_SORT, "",
(gptr*) &ft_max_word_len_for_sort, (gptr*) &ft_max_word_len_for_sort, 0,
GET_ULONG, REQUIRED_ARG, 20, 4, HA_FT_MAXLEN, 0, 1, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
};
......
......@@ -18,11 +18,20 @@ Full-text indexes are called collections
Only MyISAM tables support collections
select * from t1 where MATCH(a,b) AGAINST ("only");
a b
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
select * from t1 where MATCH(a,b) AGAINST ("collections" WITH QUERY EXPANSION);
a b
Only MyISAM tables support collections
Full-text indexes are called collections
MySQL has now support for full-text search
select * from t1 where MATCH(a,b) AGAINST ("indexes" WITH QUERY EXPANSION);
a b
Full-text indexes are called collections
Only MyISAM tables support collections
select * from t1 where MATCH(a,b) AGAINST ("indexes collections" WITH QUERY EXPANSION);
a b
Full-text indexes are called collections
Only MyISAM tables support collections
MySQL has now support for full-text search
explain select * from t1 where MATCH(a,b) AGAINST ("collections");
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 fulltext a a 0 1 Using where
......
......@@ -3,5 +3,5 @@ Variable_name Value
ft_boolean_syntax + -><()~*:""&|
ft_min_word_len 4
ft_max_word_len 254
ft_max_word_len_for_sort 20
ft_query_expansion_limit 20
ft_stopword_file (built-in)
......@@ -20,9 +20,11 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes");
select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
select * from t1 where MATCH(a,b) AGAINST ("only");
# UNION of fulltext's
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
# query expansion
select * from t1 where MATCH(a,b) AGAINST ("collections" WITH QUERY EXPANSION);
select * from t1 where MATCH(a,b) AGAINST ("indexes" WITH QUERY EXPANSION);
select * from t1 where MATCH(a,b) AGAINST ("indexes collections" WITH QUERY EXPANSION);
# add_ft_keys() tests
......@@ -66,7 +68,6 @@ select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
select * from t1 where MATCH b AGAINST ("sear*" IN BOOLEAN MODE);
# UNION of fulltext's
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
#update/delete with fulltext index
......
......@@ -90,7 +90,7 @@ class ha_myisam: public handler
return 0;
}
/*
  Starts a full-text search on index inx by forwarding flags, the handler's
  file object, the index number and the key bytes to ft_init_search().
  The const qualifier on key is cast away for the call.
*/
FT_INFO *ft_init_ext(uint flags, uint inx,const byte *key, uint keylen)
{ return ft_init_search(flags,file,inx,(byte*) key,keylen); }
{ return ft_init_search(flags,file,inx,(byte*) key,keylen, table->record[0]); }
int ft_read(byte *buf);
int rnd_init(bool scan=1);
int rnd_next(byte *buf);
......
......@@ -3568,7 +3568,7 @@ enum options_mysqld
OPT_CONNECT_TIMEOUT, OPT_DELAYED_INSERT_TIMEOUT,
OPT_DELAYED_INSERT_LIMIT, OPT_DELAYED_QUEUE_SIZE,
OPT_FLUSH_TIME, OPT_FT_MIN_WORD_LEN,
OPT_FT_MAX_WORD_LEN, OPT_FT_MAX_WORD_LEN_FOR_SORT, OPT_FT_STOPWORD_FILE,
OPT_FT_MAX_WORD_LEN, OPT_FT_QUERY_EXPANSION_LIMIT, OPT_FT_STOPWORD_FILE,
OPT_INTERACTIVE_TIMEOUT, OPT_JOIN_BUFF_SIZE,
OPT_KEY_BUFFER_SIZE, OPT_LONG_QUERY_TIME,
OPT_LOWER_CASE_TABLE_NAMES, OPT_MAX_ALLOWED_PACKET,
......@@ -4210,10 +4210,10 @@ replicating a LOAD DATA INFILE command.",
"The maximum length of the word to be included in a FULLTEXT index. Note: FULLTEXT indexes must be rebuilt after changing this variable.",
(gptr*) &ft_max_word_len, (gptr*) &ft_max_word_len, 0, GET_ULONG,
REQUIRED_ARG, HA_FT_MAXLEN, 10, HA_FT_MAXLEN, 0, 1, 0},
{ "ft_max_word_len_for_sort", OPT_FT_MAX_WORD_LEN_FOR_SORT,
"The maximum length of the word for repair_by_sorting. Longer words are included the slow way. The lower this value, the more words will be put in one sort bucket.",
(gptr*) &ft_max_word_len_for_sort, (gptr*) &ft_max_word_len_for_sort, 0, GET_ULONG,
REQUIRED_ARG, 20, 4, HA_FT_MAXLEN, 0, 1, 0},
{ "ft_query_expansion_limit", OPT_FT_QUERY_EXPANSION_LIMIT,
"Number of best matches to use for query expansion",
(gptr*) &ft_query_expansion_limit, (gptr*) &ft_query_expansion_limit, 0, GET_ULONG,
REQUIRED_ARG, 20, 0, 1000, 0, 1, 0},
{ "ft_stopword_file", OPT_FT_STOPWORD_FILE,
"Use stopwords from this file instead of built-in list.",
(gptr*) &ft_stopword_file, (gptr*) &ft_stopword_file, 0, GET_STR,
......
......@@ -556,7 +556,7 @@ struct show_var_st init_vars[]= {
{"ft_boolean_syntax", (char*) ft_boolean_syntax, SHOW_CHAR},
{"ft_min_word_len", (char*) &ft_min_word_len, SHOW_LONG},
{"ft_max_word_len", (char*) &ft_max_word_len, SHOW_LONG},
{"ft_max_word_len_for_sort",(char*) &ft_max_word_len_for_sort, SHOW_LONG},
{"ft_query_expansion_limit",(char*) &ft_query_expansion_limit, SHOW_LONG},
{"ft_stopword_file", (char*) &ft_stopword_file, SHOW_CHAR_PTR},
{"have_bdb", (char*) &have_berkeley_db, SHOW_HAVE},
{"have_crypt", (char*) &have_crypt, SHOW_HAVE},
......
......@@ -4546,6 +4546,7 @@ keyword:
| ESCAPE_SYM {}
| EVENTS_SYM {}
| EXECUTE_SYM {}
| EXPANSION_SYM {}
| EXTENDED_SYM {}
| FAST_SYM {}
| DISABLE_SYM {}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment