ftdefs.h 6.11 KB
Newer Older
bk@work.mysql.com's avatar
bk@work.mysql.com committed
1
/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Written by Sergei A. Golubchik, who has a shared copyright to this code */

/* some definitions for full-text indices */

#include "fulltext.h"
#include <m_ctype.h>
#include <my_tree.h>
24
#include <queues.h>
25
#include <mysql/plugin.h>
bk@work.mysql.com's avatar
bk@work.mysql.com committed
26

27 28 29
/*
  Non-zero when either the ctype value has any of the _MY_U, _MY_L or
  _MY_NMR bits set, or the character is an underscore. The ctype test is
  evaluated first; the character is only compared when that test fails.
*/
#define true_word_char(ctype, character) \
  ((((ctype) & (_MY_U | _MY_L | _MY_NMR)) != 0) || ((character) == '_'))
30
/* Always expands to 0; the argument X is never evaluated. */
#define misc_word_char(X) (0)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
31

32
/*
  Word-length limit (31) used for sorting.
  NOTE(review): whether this counts bytes or characters is not visible
  in this header - confirm at the point of use.
*/
#define FT_MAX_WORD_LEN_FOR_SORT 31

/*
  Allocation size (65536 = 64 KiB) for the ftparser MEM_ROOT.
  NOTE(review): presumably the block size given when the MEM_ROOT is
  initialised - confirm against the caller.
*/
#define FTPARSER_MEMROOT_ALLOC_SIZE 65536

/* Defined without a value: only its presence can be tested (#ifdef). */
#define COMPILE_STOPWORDS_IN

38 39 40 41 42 43 44
/* Interested readers may consult SMART
   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
   for an excellent implementation of the vector space model we use.
   It also demonstrates the usage of different weighting techniques.
   This code, though, is completely original and is not based on the
   SMART code but was in some cases inspired by it.

   NORM_PIVOT was taken from the article
   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
   ACM SIGIR'96, 21-29, 1996
 */

/*
  Selection of the weighting formulas. Each macro below is an alias for
  one of the alternative formulas defined in this header, so a formula is
  switched by editing a single line here.
*/
#define LWS_FOR_QUERY   LWS_TF
#define LWS_IN_USE      LWS_LOG
#define PRENORM_IN_USE  PRENORM_AVG
#define NORM_IN_USE     NORM_PIVOT
#define GWS_IN_USE      GWS_PROB
/*==============================================================*/
/*
  LWS_* alternatives. These are object-like macros: they expand to an
  expression over a variable named `count`, which must be in scope at the
  point of use.
*/
#define LWS_TF          (count)
#define LWS_BINARY      (count>0)
#define LWS_SQUARE      (count*count)
/* log(count)+1 for a non-zero count, 0 otherwise; log(0) is never evaluated */
#define LWS_LOG         (count?(log( (double) count)+1):0)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
/*--------------------------------------------------------------*/
/*
  PRENORM_* alternatives. Object-like macros that expand to expressions
  over names which must be in scope at the point of use: a pointer `p`
  with a `weight` member, and an object `docstat` with `max`, `sum` and
  `uniq` members.
  NOTE(review): no guard against docstat.max / docstat.sum being zero is
  visible here - confirm the callers guarantee non-zero values.
*/
#define PRENORM_NONE				      (p->weight)
#define PRENORM_MAX			  (p->weight/docstat.max)
#define PRENORM_AUG		  (0.4+0.6*p->weight/docstat.max)
#define PRENORM_AVG	     (p->weight/docstat.sum*docstat.uniq)
#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
/*--------------------------------------------------------------*/
/*
  NORM_* alternatives. Object-like macros over an object `docstat` that
  must be in scope at the point of use (members read: nsum, nsum2, uniq).
*/
#define NORM_NONE					      (1)
#define NORM_SUM				   (docstat.nsum)
#define NORM_COS			    (sqrt(docstat.nsum2))

/* Constant multiplied by docstat.uniq in NORM_PIVOT */
#define PIVOT_VAL (0.0115)
#define NORM_PIVOT  (1+PIVOT_VAL*docstat.uniq)
/*---------------------------------------------------------------*/
/*
  GWS_* alternatives. Object-like macros that expand to expressions over
  names which must be in scope at the point of use: `sum`, `sum2`, `suml`,
  `doc_cnt`, and `aio` (read as aio->info->state->records).
*/
#define GWS_NORM     (1/sqrt(sum2))
#define GWS_GFIDF    (sum/doc_cnt)
/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
/*
  NOTE(review): without the cast the division is done in the operands' own
  type; if both are integers the quotient is truncated before log().
*/
#define GWS_IDF      log(aio->info->state->records/doc_cnt)
#define GWS_IDF1     log((double)aio->info->state->records/doc_cnt)
/* Yields 0 rather than calling log() when records <= doc_cnt */
#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
#define GWS_FREQ     (1.0/doc_cnt)
#define GWS_SQUARED  pow(log((double)aio->info->state->records/doc_cnt),2)
#define GWS_CUBIC    pow(log((double)aio->info->state->records/doc_cnt),3)
#define GWS_ENTROPY  (1-(suml/sum-log(sum))/log(aio->info->state->records))
/*=================================================================*/

86
/* Boolean search operators */
87 88 89 90 91 92 93 94 95
/*
  Each operator character is read from a fixed position of
  ft_boolean_syntax, which must be visible at the point of use.
  Positions 0-8 and 10-11 are used; position 9 is not referenced by any
  macro in this group.
*/
#define FTB_YES   (ft_boolean_syntax[0])
#define FTB_EGAL  (ft_boolean_syntax[1])
#define FTB_NO    (ft_boolean_syntax[2])
#define FTB_INC   (ft_boolean_syntax[3])
#define FTB_DEC   (ft_boolean_syntax[4])
#define FTB_LBR   (ft_boolean_syntax[5])
#define FTB_RBR   (ft_boolean_syntax[6])
#define FTB_NEG   (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
/* The quote characters live at positions 10 and 11 (9 is skipped) */
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])
98

bk@work.mysql.com's avatar
bk@work.mysql.com committed
99 100 101 102 103 104 105 106 107 108
/* A word reference (pointer + length) together with a weight. */
typedef struct st_ft_word {
  byte * pos;    /* NOTE(review): presumably the first byte of the word - confirm */
  uint	 len;    /* NOTE(review): presumably the word length starting at pos - confirm */
  double weight; /* weight attached to this word */
} FT_WORD;

/*
  NOTE(review): presumably returns non-zero when the len-byte `word` is a
  stopword - the return convention is not visible here, confirm.
*/
int is_stopword(char *word, uint len);

/*
  NOTE(review): parameter roles are unnamed in this prototype; presumably
  builds an index key for an FT_WORD and returns its length - confirm
  against the definition.
*/
uint _ft_make_key(MI_INFO *, uint , byte *, FT_WORD *, my_off_t);

109 110
byte ft_get_word(CHARSET_INFO *, byte **, byte *, FT_WORD *,
                 MYSQL_FTPARSER_BOOLEAN_INFO *);
111 112
byte ft_simple_get_word(CHARSET_INFO *, byte **, const byte *,
                        FT_WORD *, my_bool);
113

114 115
typedef struct _st_ft_seg_iterator {
  uint        num, len;
116
  HA_KEYSEG  *seg;
117 118 119 120 121 122 123
  const byte *rec, *pos;
} FT_SEG_ITERATOR;

/*
  Segment iterator API: two initialisers that fill an FT_SEG_ITERATOR, and
  a step function that takes the iterator.
  NOTE(review): the step function presumably returns 0 when the iteration
  is finished - confirm against the definition.
*/
void _mi_ft_segiterator_init(MI_INFO *, uint, const byte *, FT_SEG_ITERATOR *);
void _mi_ft_segiterator_dummy_init(const byte *, uint, FT_SEG_ITERATOR *);
uint _mi_ft_segiterator(FT_SEG_ITERATOR *);

124
/*
  Parsing into a TREE of words. ft_linearize and _mi_ft_parserecord return
  FT_WORD pointers and take a MEM_ROOT.
  NOTE(review): the returned array is presumably allocated from that
  MEM_ROOT - confirm ownership before freeing anything.
*/
void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, byte *, int, struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
FT_WORD * ft_linearize(TREE *, MEM_ROOT *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const byte *, MEM_ROOT *);
uint _mi_ft_parse(TREE *, MI_INFO *, uint, const byte *,
                  MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
131 132

FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, uint, byte *);
133
FT_INFO *ft_init_boolean_search(MI_INFO *, uint, byte *, uint, CHARSET_INFO *);
134

135
extern const struct _ft_vft _ft_vft_nlq;
136
int ft_nlq_read_next(FT_INFO *, char *);
137
float ft_nlq_find_relevance(FT_INFO *, byte *, uint);
138 139 140 141 142
void ft_nlq_close_search(FT_INFO *);
float ft_nlq_get_relevance(FT_INFO *);
my_off_t ft_nlq_get_docid(FT_INFO *);
void ft_nlq_reinit_search(FT_INFO *);

143
extern const struct _ft_vft _ft_vft_boolean;
144
int ft_boolean_read_next(FT_INFO *, char *);
145
float ft_boolean_find_relevance(FT_INFO *, byte *, uint);
146 147 148 149
void ft_boolean_close_search(FT_INFO *);
float ft_boolean_get_relevance(FT_INFO *);
my_off_t ft_boolean_get_docid(FT_INFO *);
void ft_boolean_reinit_search(FT_INFO *);
150
/*
  Full-text parser plugin lifecycle for a table handle.
  ftparser_call_initializer returns a MYSQL_FTPARSER_PARAM for the given
  key number and parameter number.
  NOTE(review): presumably returns NULL on failure, and the deinitializer
  presumably releases everything set up for `info` - confirm.
*/
extern MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
                                                       uint keynr,
                                                       uint paramnr);
extern void ftparser_call_deinitializer(MI_INFO *info);