ft_parser.c 5.4 KB
Newer Older
bk@work.mysql.com's avatar
bk@work.mysql.com committed
1
/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2

bk@work.mysql.com's avatar
bk@work.mysql.com committed
3 4 5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
7

bk@work.mysql.com's avatar
bk@work.mysql.com committed
8 9 10 11
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
12

bk@work.mysql.com's avatar
bk@work.mysql.com committed
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Written by Sergei A. Golubchik, who has a shared copyright to this code */

#include "ftdefs.h"

#ifdef EVAL_RUN
#ifdef PIVOT_STAT
ulong collstat=0;
#endif
#endif /* EVAL_RUN */

typedef struct st_ft_docstat {
  FT_WORD *list;
  uint uniq;
  double sum;
#ifdef EVAL_RUN
  uint words, totlen;
  double max, nsum, nsum2;
#endif /* EVAL_RUN */

} FT_DOCSTAT;

38
static int FT_WORD_cmp(void* cmp_arg, FT_WORD *w1, FT_WORD *w2)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
39 40 41
{
  return _mi_compare_text(default_charset_info,
			  (uchar*) w1->pos,w1->len,
42
			  (uchar*) w2->pos, w2->len,(my_bool)cmp_arg);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
}

static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
    word->weight=LWS_IN_USE;

#ifdef EVAL_RUN
    word->cnt= (uchar) count;
    if(docstat->max < word->weight) docstat->max=word->weight;
    docstat->words+=count;
    docstat->totlen+=word->len;
#endif /* EVAL_RUN */
    docstat->sum+=word->weight;

    memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD));
    return 0;
}

/* transforms tree of words into the array, applying normalization */

63
FT_WORD * ft_linearize(TREE *wtree)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
64 65 66
{
  FT_WORD *wlist,*p;
  FT_DOCSTAT docstat;
serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
67
  DBUG_ENTER("ft_linearize");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
68 69 70 71 72 73 74 75 76 77 78 79 80

  if ((wlist=(FT_WORD *) my_malloc(sizeof(FT_WORD)*
				   (1+wtree->elements_in_tree),MYF(0))))
  {
    docstat.list=wlist;
    docstat.uniq=wtree->elements_in_tree;
#ifdef EVAL_RUN
    docstat.nsum=docstat.nsum2=docstat.max=docstat.words=docstat.totlen=
#endif /* EVAL_RUN */
    docstat.sum=0;
    tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
  }
  delete_tree(wtree);
81
  if (!wlist)
serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
82
    DBUG_RETURN(NULL);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
83 84 85

  docstat.list->pos=NULL;

86
  for (p=wlist;p->pos;p++)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
87 88 89 90 91 92 93 94 95 96 97 98 99 100
  {
    p->weight=PRENORM_IN_USE;
#ifdef EVAL_RUN
    docstat.nsum+=p->weight;
    docstat.nsum2+=p->weight*p->weight;
#endif /* EVAL_RUN */
  }

#ifdef EVAL_RUN
#ifdef PIVOT_STAT
  collstat+=PIVOT_STAT;
#endif
#endif /* EVAL_RUN */

101
  for (p=wlist;p->pos;p++)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
102 103 104 105
  {
    p->weight/=NORM_IN_USE;
  }

serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
106
  DBUG_RETURN(wlist);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
107 108
}

109
#define true_word_char(X)	(isalnum(X) || (X)=='_')
bk@work.mysql.com's avatar
bk@work.mysql.com committed
110
#ifdef HYPHEN_IS_DELIM
serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
111
#define misc_word_char(X)	((X)=='\'')
bk@work.mysql.com's avatar
bk@work.mysql.com committed
112
#else
serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
113
#define misc_word_char(X)	((X)=='\'' || (X)=='-')
bk@work.mysql.com's avatar
bk@work.mysql.com committed
114
#endif
115
#define word_char(X)		(true_word_char(X) || misc_word_char(X))
bk@work.mysql.com's avatar
bk@work.mysql.com committed
116

117 118 119 120 121 122 123

/* returns:
 * 0 - eof
 * 1 - word found
 * 2 - left bracket
 * 3 - right bracket
 */
124
byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
125
{
126 127 128
  byte *doc=*start;
  int mwc;

129 130
  param->yesno=(FTB_YES==' ')?1:0;
  param->plusminus=param->pmsign=0;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
131

132
  while (doc<end)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
133
  {
134 135 136 137 138
    for (;doc<end;doc++)
    {
      if (true_word_char(*doc)) break;
      if (*doc == FTB_LBR || *doc == FTB_RBR)
      {
139
        /* param->prev=' '; */
140
        *start=doc+1;
141
        return (*doc == FTB_RBR)+2;
142 143 144
      }
      if (param->prev == ' ')
      {
145 146 147 148 149 150
        if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
        if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
        if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
        if (*doc == FTB_INC ) { param->plusminus++; continue; } else
        if (*doc == FTB_DEC ) { param->plusminus--; continue; } else
        if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
151 152 153 154 155 156 157 158 159 160 161 162
      }
      param->prev=*doc;
      param->yesno=param->plusminus=param->pmsign=0;
    }

    mwc=0;
    for (word->pos=doc; doc<end; doc++)
      if (true_word_char(*doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc++)
        break;

163
    param->prev='A'; /* be sure *prev is true_word_char */
164
    word->len= (uint)(doc-word->pos) - mwc;
165
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
166 167
      doc++;

168 169
    if (((word->len >= ft_min_word_len && !is_stopword(word->pos, word->len))
         || param->trunc) && word->len < ft_max_word_len)
170 171 172 173
    {
      *start=doc;
      return 1;
    }
bk@work.mysql.com's avatar
bk@work.mysql.com committed
174
  }
175 176 177 178 179 180 181
  return 0;
}

byte ft_simple_get_word(byte **start, byte *end, FT_WORD *word)
{
  byte *doc=*start;
  int mwc;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
182

183
  while (doc<end)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
184
  {
185
    for (;doc<end;doc++)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
186
    {
187 188 189 190 191 192 193 194 195 196 197 198
      if (true_word_char(*doc)) break;
    }

    mwc=0;
    for(word->pos=doc; doc<end; doc++)
      if (true_word_char(*doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc++)
        break;

    word->len= (uint)(doc-word->pos) - mwc;

serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
199
    if (word->len >= ft_min_word_len && word->len < ft_max_word_len &&
200 201 202 203
        !is_stopword(word->pos, word->len))
    {
      *start=doc;
      return 1;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
204 205
    }
  }
206 207 208
  return 0;
}

209
int ft_parse(TREE *wtree, byte *doc, int doclen)
210 211 212 213 214 215
{
  byte   *end=doc+doclen;
  FT_WORD w;

  if (!is_tree_inited(wtree))
  {
serg@serg.mysql.com's avatar
serg@serg.mysql.com committed
216
    init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, NULL);
217 218
  }

219
  while (ft_simple_get_word(&doc,end,&w))
220 221 222 223
  {
    if (!tree_insert(wtree, &w, 0))
      goto err;
  }
224
  return 0;
225 226 227

err:
  delete_tree(wtree);
228
  return 1;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
229
}
230