/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Written by Sergei A. Golubchik, who has a shared copyright to this code */

#include "ftdefs.h"

/*
  Accumulator passed to walk_and_copy() while ft_linearize() flattens the
  word tree into an array.
*/
typedef struct st_ft_docstat {
  /* write cursor: next free slot of the output array (advanced per word) */
  FT_WORD *list;
  /* set by ft_linearize() to the number of elements in the tree */
  uint uniq;
  /* running total of the weights assigned to the copied words */
  double sum;
} FT_DOCSTAT;

27 28
typedef struct st_my_ft_parser_param
{
29 30
  TREE     *wtree;
  MEM_ROOT *mem_root;
31 32
} MY_FT_PARSER_PARAM;

33
/*
  Ordering callback for the word tree: compares the text of two FT_WORD
  entries under character set 'cs'.  The result is whatever
  mi_compare_text() returns for the two byte ranges.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *left= (uchar*) w1->pos;
  uchar *right= (uchar*) w2->pos;

  return mi_compare_text(cs, left, w1->len, right, w2->len, 0, 0);
}

/*
  tree_walk() action used by ft_linearize(): gives the word its local
  weight, adds that weight to docstat->sum and appends a copy of the word
  to the output array, advancing the write cursor.  Always returns 0.

  NOTE(review): LWS_IN_USE is defined outside this file and presumably
  expands to an expression over 'count' (and possibly 'word'); confirm
  before renaming the parameters.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  /* take the current slot, then move the cursor to the next one */
  slot= docstat->list++;
  memcpy_fixed(slot, word, sizeof(FT_WORD));
  return 0;
}

/* transforms tree of words into the array, applying normalization */

49
FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
unknown's avatar
unknown committed
50 51 52
{
  FT_WORD *wlist,*p;
  FT_DOCSTAT docstat;
unknown's avatar
unknown committed
53
  DBUG_ENTER("ft_linearize");
unknown's avatar
unknown committed
54

55 56
  if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
                                    (1+wtree->elements_in_tree))))
unknown's avatar
unknown committed
57 58 59 60 61 62 63
  {
    docstat.list=wlist;
    docstat.uniq=wtree->elements_in_tree;
    docstat.sum=0;
    tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
  }
  delete_tree(wtree);
64
  if (!wlist)
unknown's avatar
unknown committed
65
    DBUG_RETURN(NULL);
unknown's avatar
unknown committed
66 67 68

  docstat.list->pos=NULL;

69
  for (p=wlist;p->pos;p++)
unknown's avatar
unknown committed
70 71 72 73
  {
    p->weight=PRENORM_IN_USE;
  }

74
  for (p=wlist;p->pos;p++)
unknown's avatar
unknown committed
75 76 77 78
  {
    p->weight/=NORM_IN_USE;
  }

unknown's avatar
unknown committed
79
  DBUG_RETURN(wlist);
unknown's avatar
unknown committed
80 81
}

unknown's avatar
unknown committed
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
/*
  Validates a candidate replacement for the boolean search syntax string.

  The string is rejected when it is NULL, when its length differs from
  that of ft_boolean_syntax, when neither of its first two characters is
  a space, when it contains a non-7-bit or alphanumeric character, or
  when a character occurs twice (the pair at positions 10 and 11 is the
  one permitted repetition).

  RETURN VALUE
    0 - string is acceptable
    1 - string is rejected
*/
my_bool ft_boolean_check_syntax_string(const byte *str)
{
  uint pos, prev;

  if (!str)
    return 1;
  if (strlen(str)+1 != sizeof(ft_boolean_syntax))
    return 1;
  if (str[0] != ' ' && str[1] != ' ')
    return 1;

  for (pos= 0; pos < sizeof(ft_boolean_syntax); pos++)
  {
    /* limiting to 7-bit ascii only */
    if ((unsigned char)(str[pos]) > 127)
      return 1;
    if (my_isalnum(default_charset_info, str[pos]))
      return 1;

    /* look for an earlier occurrence of the same character */
    for (prev= 0; prev < pos; prev++)
    {
      if (str[pos] != str[prev])
        continue;
      if (pos == 11 && prev == 10)
        continue;                       /* the one allowed duplicate */
      return 1;
    }
  }
  return 0;
}

unknown's avatar
unknown committed
102 103 104 105 106 107 108 109
/*
  RETURN VALUE
  0 - eof
  1 - word found
  2 - left bracket
  3 - right bracket
  4 - stopword found
*/
110
byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
111
                 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
unknown's avatar
unknown committed
112
{
113
  byte *doc=*start;
114
  int ctype;
unknown's avatar
unknown committed
115
  uint mwc, length, mbl;
116

117
  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
118 119
  param->weight_adjust= param->wasign= 0;
  param->type= FT_TOKEN_EOF;
unknown's avatar
unknown committed
120

121
  while (doc<end)
unknown's avatar
unknown committed
122
  {
123
    for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
124
    {
125 126 127
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
unknown's avatar
unknown committed
128 129
      if (*doc == FTB_RQUOT && param->quot)
      {
unknown's avatar
unknown committed
130
        param->quot=doc;
unknown's avatar
unknown committed
131
        *start=doc+1;
132 133
        param->type= FT_TOKEN_RIGHT_PAREN;
        goto ret;
unknown's avatar
unknown committed
134
      }
unknown's avatar
unknown committed
135
      if (!param->quot)
136
      {
unknown's avatar
unknown committed
137 138 139 140 141
        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
        {
          /* param->prev=' '; */
          *start=doc+1;
          if (*doc == FTB_LQUOT) param->quot=*start;
142 143
          param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
          goto ret;
unknown's avatar
unknown committed
144 145 146 147 148 149
        }
        if (param->prev == ' ')
        {
          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
150 151 152
          if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
          if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
          if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
unknown's avatar
unknown committed
153
        }
154 155
      }
      param->prev=*doc;
156
      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
157
      param->weight_adjust= param->wasign= 0;
158 159
    }

160
    mwc=length=0;
161 162 163 164
    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
165
        mwc=0;
unknown's avatar
unknown committed
166
      else if (!misc_word_char(*doc) || mwc)
167
        break;
unknown's avatar
unknown committed
168 169
      else
        mwc++;
170
    }
unknown's avatar
unknown committed
171
    param->prev='A'; /* be sure *prev is true_word_char */
172
    word->len= (uint)(doc-word->pos) - mwc;
173
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
174 175
      doc++;

176 177
    if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len))
         || param->trunc) && length < ft_max_word_len)
178 179
    {
      *start=doc;
180 181
      param->type= FT_TOKEN_WORD;
      goto ret;
182
    }
unknown's avatar
unknown committed
183
    else if (length) /* make sure length > 0 (if start contains spaces only) */
184 185
    {
      *start= doc;
186 187
      param->type= FT_TOKEN_STOPWORD;
      goto ret;
188
    }
unknown's avatar
unknown committed
189
  }
unknown's avatar
unknown committed
190 191 192
  if (param->quot)
  {
    param->quot=*start=doc;
193 194
    param->type= 3; /* FT_RBR */
    goto ret;
unknown's avatar
unknown committed
195
  }
196 197
ret:
  return param->type;
198 199
}

200 201
/*
  Extracts the next word from plain text (no boolean operators).

  SYNOPSIS
    cs              character set whose ctype() handler classifies the
                    characters
    start           in:  where scanning begins
                    out: first byte after the returned word (left
                         unchanged when 0 is returned)
    end             first byte past the text
    word            out: start and byte length of the word
    skip_stopwords  FALSE: every word found is returned;
                    otherwise only words that are at least ft_min_word_len
                    and less than ft_max_word_len characters long and are
                    not stopwords are returned, others are skipped

  NOTES
    A word may contain a single misc_word_char between true word
    characters; such a character at the very end of the word is not
    included in word->len.  'length' counts characters, word->len bytes.

  RETURN VALUE
    1 - word found
    0 - end of text reached
*/
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
                        FT_WORD *word, my_bool skip_stopwords)
{
  byte *doc= *start;
  uint mwc, length;
  /*
    Signed on purpose: the step expressions below test 'mbl > 0'.  Held in
    an unsigned variable, a negative ctype() result would wrap around and
    become a huge forward step instead of the intended one-byte step.
    NOTE(review): assumes ctype() returns a signed int - confirm against
    the charset handler declaration.
  */
  int mbl;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* skip everything that cannot start a word */
    for (;; doc+= (mbl > 0 ? mbl : 1))
    {
      if (doc >= end)
        DBUG_RETURN(0);
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
    }

    /* collect the word itself */
    mwc= length= 0;
    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        mwc= 0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
    }

    word->len= (uint)(doc-word->pos) - mwc;

    if (skip_stopwords == FALSE ||
        (length >= ft_min_word_len && length < ft_max_word_len &&
         !is_stopword(word->pos, word->len)))
    {
      *start= doc;
      DBUG_RETURN(1);
    }
  } while (doc < end);
  DBUG_RETURN(0);
}

244 245
/*
  Prepares 'wtree' for collecting FT_WORD elements ordered by
  FT_WORD_cmp().  'cs' is handed to init_tree() as its last argument;
  other functions in this file read the character set back from
  wtree->custom_arg.  A tree that is already initialized is left alone.
*/
void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");
  if (is_tree_inited(wtree))
    DBUG_VOID_RETURN;
  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2)&FT_WORD_cmp, 0, NULL,
            cs);
  DBUG_VOID_RETURN;
}

252

253 254
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       char *word, int word_len,
255
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
256
{
257
  TREE *wtree;
258
  FT_WORD w;
259
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
260
  DBUG_ENTER("ft_add_word");
261
  wtree= ft_param->wtree;
262
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
263
  {
264 265
    byte *ptr;
    DBUG_ASSERT(wtree->with_delete == 0);
266
    ptr= (byte *)alloc_root(ft_param->mem_root, word_len);
267 268 269 270 271 272 273 274 275 276
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree);
    DBUG_RETURN(1);
277
  }
unknown's avatar
unknown committed
278
  DBUG_RETURN(0);
279
}
280

281

282 283
/*
  mysql_parse callback installed by ft_parse(): splits 'doc' into words
  with ft_simple_get_word() (stopword skipping enabled) and passes each of
  them to param->mysql_add_word().

  RETURN VALUE
    0 - the whole text was processed
    1 - mysql_add_word() reported a failure; parsing stopped there
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             byte *doc, int doc_len)
{
  MY_FT_PARSER_PARAM *state= param->mysql_ftparam;
  TREE *words= state->wtree;
  byte *doc_end= doc + doc_len;
  FT_WORD token;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    if (!ft_simple_get_word(words->custom_arg, &doc, doc_end, &token, TRUE))
      break;
    if (param->mysql_add_word(param, token.pos, token.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
296

297

298
int ft_parse(TREE *wtree, byte *doc, int doclen,
299
                    struct st_mysql_ftparser *parser,
300
                    MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
301 302 303 304
{
  MY_FT_PARSER_PARAM my_param;
  DBUG_ENTER("ft_parse");
  DBUG_ASSERT(parser);
305

306
  my_param.wtree= wtree;
307
  my_param.mem_root= mem_root;
308

309 310 311 312 313 314 315
  param->mysql_parse= ft_parse_internal;
  param->mysql_add_word= ft_add_word;
  param->mysql_ftparam= &my_param;
  param->cs= wtree->custom_arg;
  param->doc= doc;
  param->length= doclen;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
316
  DBUG_RETURN(parser->parse(param));
317 318
}

319 320 321
#define MAX_PARAM_NR 2
/*
  Returns the MYSQL_FTPARSER_PARAM slot for key 'keynr' and nesting level
  'paramnr', allocating the slot array and calling the parser's init()
  hook on first use.

  keynr == NO_SUCH_KEY selects slot group 0 and ft_default_parser.
  paramnr must be less than MAX_PARAM_NR.

  RETURN VALUE
    pointer to the slot, or 0 if the array could not be allocated or the
    parser's init() hook returned non-zero
*/
MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
                                                uint keynr, uint paramnr)
{
  uint32 ftparser_nr;
  struct st_mysql_ftparser *parser;
  MYSQL_FTPARSER_PARAM *slot;

  if (! info->ftparser_param)
  {
    /* info->ftparser_param can not be zero after the initialization,
       because it always includes built-in fulltext parser. And built-in
       parser can be called even if the table has no fulltext indexes and
       no varchar/text fields. */
    if (! info->s->ftparsers)
    {
      /* It's ok that modification to shared structure is done w/o mutex
         locks, because all threads would set the same variables to the
         same values. */
      uint key, earlier, keys= info->s->state.header.keys, ftparsers= 1;
      for (key= 0; key < keys; key++)
      {
        MI_KEYDEF *keyinfo= &info->s->keyinfo[key];
        if (!(keyinfo->flag & HA_FULLTEXT))
          continue;
        /* reuse the number of an earlier fulltext key with the same parser */
        for (earlier= 0; earlier < key; earlier++)
        {
          if ((info->s->keyinfo[earlier].flag & HA_FULLTEXT) &&
              keyinfo->parser == info->s->keyinfo[earlier].parser)
            break;
        }
        if (earlier == key)
          keyinfo->ftparser_nr= ftparsers++;
        else
          keyinfo->ftparser_nr= info->s->keyinfo[earlier].ftparser_nr;
      }
      info->s->ftparsers= ftparsers;
    }
    /*
      We have to allocate two MYSQL_FTPARSER_PARAM structures per plugin
      because in a boolean search a parser is called recursively
      ftb_find_relevance* calls ftb_check_phrase*
      (MAX_PARAM_NR=2)
    */
    info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
      my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
                info->s->ftparsers, MYF(MY_WME|MY_ZEROFILL));
    init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
    if (! info->ftparser_param)
      return 0;
  }

  if (keynr != NO_SUCH_KEY)
  {
    ftparser_nr= info->s->keyinfo[keynr].ftparser_nr;
    parser= info->s->keyinfo[keynr].parser;
  }
  else
  {
    ftparser_nr= 0;
    parser= &ft_default_parser;
  }

  DBUG_ASSERT(paramnr < MAX_PARAM_NR);
  ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
  slot= &info->ftparser_param[ftparser_nr];
  if (! slot->mysql_add_word)
  {
    /* Note, that mysql_add_word is used here as a flag:
       mysql_add_word == 0 - parser is not initialized
       mysql_add_word != 0 - parser is initialized, or no
                             initialization needed.
       The flag is set before init() runs and is not cleared when init()
       fails. */
    slot->mysql_add_word= (void *)1;
    if (parser->init && parser->init(slot))
      return 0;
  }
  return slot;
}

/*
  Releases the parser memory root and calls the deinit() hook of every
  fulltext key's parser for each slot that ftparser_call_initializer()
  marked as initialized, clearing the mark afterwards.

  The memory root is freed even when no slot array was ever allocated.
  For each key the slots are visited in order and the scan stops at the
  first slot that is not marked.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint key, nr, keys= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (! info->ftparser_param)
    return;

  for (key= 0; key < keys; key++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[key];

    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;

    for (nr= 0; nr < MAX_PARAM_NR; nr++)
    {
      MYSQL_FTPARSER_PARAM *ftparser_param=
        &info->ftparser_param[keyinfo->ftparser_nr*MAX_PARAM_NR + nr];

      if (! ftparser_param->mysql_add_word)
        break;
      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(ftparser_param);
      ftparser_param->mysql_add_word= 0;
    }
  }
}
422