/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Written by Sergei A. Golubchik, who has a shared copyright to this code.
   Support for long options (my_getopt) added 22.5.2002 by Jani Tolonen. */

#include "ftdefs.h"
#include <my_getopt.h>

static void usage();
24
static void complain(int val);
unknown's avatar
unknown committed
25
static my_bool get_one_option(int, const struct my_option *, char *);
26

27 28
static int count=0, stats=0, dump=0, lstats=0;
static my_bool verbose;
29
static char *query=NULL;
30
static uint lengths[256];
31

32
/* Size in bytes of each word buffer declared in main(). */
#define MAX_LEN (HA_FT_MAXBYTELEN+10)
/* In verbose mode a progress counter is printed every this many entries. */
#define HOW_OFTEN_TO_WRITE 10000

static struct my_option my_long_options[] =
{
37
  {"dump", 'd', "Dump index (incl. data offsets and word weights).",
38
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
39
  {"stats", 's', "Report global stats.",
40
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
41
  {"verbose", 'v', "Be verbose.",
42
   (gptr*) &verbose, (gptr*) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
43
  {"count", 'c', "Calculate per-word stats (counts and global weights).",
44
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
45
  {"length", 'l', "Report length distribution.",
46
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
47
  {"help", 'h', "Display help and exit.",
48
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
49
  {"help", '?', "Synonym for -h.",
50 51 52 53 54
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};


55 56
int main(int argc,char *argv[])
{
57
  int error=0, subkeys;
unknown's avatar
unknown committed
58
  uint keylen, keylen2=0, inx, doc_cnt=0;
59
  float weight= 1.0;
unknown's avatar
unknown committed
60
  double gws, min_gws=0, avg_gws=0;
61
  MI_INFO *info;
62
  char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
63 64 65 66
  ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0;
  struct { MI_INFO *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */

  MY_INIT(argv[0]);
67
  if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
unknown's avatar
unknown committed
68
    exit(error);
69 70
  if (count || dump)
    verbose=0;
71
  if (!count && !dump && !lstats && !query)
72 73 74 75 76
    stats=1;

  if (verbose)
    setbuf(stdout,NULL);

unknown's avatar
unknown committed
77
  if (argc < 2)
78
    usage();
79

unknown's avatar
unknown committed
80 81
  {
    char *end;
82
    inx= (uint) strtoll(argv[1], &end, 10);
unknown's avatar
unknown committed
83 84 85 86
    if (*end)
      usage();
  }

unknown's avatar
unknown committed
87
  init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0);
unknown's avatar
unknown committed
88

89
  if (!(info=mi_open(argv[0],2,HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
unknown's avatar
unknown committed
90 91
  {
    error=my_errno;
92
    goto err;
unknown's avatar
unknown committed
93
  }
94 95 96 97

  *buf2=0;
  aio->info=info;

unknown's avatar
unknown committed
98 99
  if ((inx >= info->s->base.keys) ||
      !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
100 101 102 103
  {
    printf("Key %d in table %s is not a FULLTEXT key\n", inx, info->filename);
    goto err;
  }
unknown's avatar
unknown committed
104

unknown's avatar
unknown committed
105 106
  mi_lock_database(info, F_EXTRA_LCK);

unknown's avatar
unknown committed
107 108
  info->lastpos= HA_OFFSET_ERROR;
  info->update|= HA_STATE_PREV_FOUND;
109

unknown's avatar
unknown committed
110
  while (!(error=mi_rnext(info,NULL,inx)))
111
  {
unknown's avatar
unknown committed
112
    keylen=*(info->lastkey);
113

unknown's avatar
unknown committed
114 115 116
    subkeys=ft_sintXkorr(info->lastkey+keylen+1);
    if (subkeys >= 0)
      weight=*(float*)&subkeys;
117

unknown's avatar
unknown committed
118
#ifdef HAVE_SNPRINTF
unknown's avatar
unknown committed
119
    snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1);
unknown's avatar
unknown committed
120
#else
unknown's avatar
unknown committed
121
    sprintf(buf,"%.*s",(int) keylen,info->lastkey+1);
unknown's avatar
unknown committed
122
#endif
unknown's avatar
unknown committed
123 124 125
    my_casedn_str(default_charset_info,buf);
    total++;
    lengths[keylen]++;
126

unknown's avatar
unknown committed
127 128 129 130
    if (count || stats)
    {
      doc_cnt++;
      if (strcmp(buf, buf2))
131
      {
unknown's avatar
unknown committed
132
        if (*buf2)
133
        {
unknown's avatar
unknown committed
134 135 136 137 138
          uniq++;
          avg_gws+=gws=GWS_IN_USE;
          if (count)
            printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
          if (maxlen<keylen2)
139
          {
unknown's avatar
unknown committed
140 141 142 143 144 145 146 147
            maxlen=keylen2;
            strmov(buf_maxlen, buf2);
          }
          if (max_doc_cnt < doc_cnt)
          {
            max_doc_cnt=doc_cnt;
            strmov(buf_min_gws, buf2);
            min_gws=gws;
148 149
          }
        }
unknown's avatar
unknown committed
150 151 152
        strmov(buf2, buf);
        keylen2=keylen;
        doc_cnt=0;
153 154
      }
    }
unknown's avatar
unknown committed
155 156 157 158 159 160 161 162 163 164 165
    if (dump)
    {
      if (subkeys>=0)
        printf("%9lx %20.7f %s\n", (long) info->lastpos,weight,buf);
      else
        printf("%9lx => %17d %s\n",(long) info->lastpos,-subkeys,buf);
    }
    if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
      printf("%10ld\r",total);
  }
  mi_lock_database(info, F_UNLCK);
166

unknown's avatar
unknown committed
167 168 169 170
  if (count || stats)
  {
    doc_cnt++;
    if (*buf2)
171
    {
unknown's avatar
unknown committed
172 173 174 175 176
      uniq++;
      avg_gws+=gws=GWS_IN_USE;
      if (count)
        printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
      if (maxlen<keylen2)
177
      {
unknown's avatar
unknown committed
178 179
        maxlen=keylen2;
        strmov(buf_maxlen, buf2);
180
      }
unknown's avatar
unknown committed
181
      if (max_doc_cnt < doc_cnt)
182
      {
unknown's avatar
unknown committed
183 184 185
        max_doc_cnt=doc_cnt;
        strmov(buf_min_gws, buf2);
        min_gws=gws;
186 187
      }
    }
188 189
  }

unknown's avatar
unknown committed
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
  if (stats)
  {
    count=0;
    for (inx=0;inx<256;inx++)
    {
      count+=lengths[inx];
      if ((ulong) count >= total/2)
        break;
    }
    printf("Total rows: %lu\nTotal words: %lu\n"
           "Unique words: %lu\nLongest word: %lu chars (%s)\n"
           "Median length: %u\n"
           "Average global weight: %f\n"
           "Most common word: %lu times, weight: %f (%s)\n",
           (long) info->state->records, total, uniq, maxlen, buf_maxlen,
           inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws);
  }
  if (lstats)
  {
    count=0;
    for (inx=0; inx<256; inx++)
    {
      count+=lengths[inx];
      if (count && lengths[inx])
        printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
               (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count,
               100.0*count/total);
    }
  }

220 221 222 223 224 225 226 227
err:
  if (error && error != HA_ERR_END_OF_FILE)
    printf("got error %d\n",my_errno);
  if (info)
    mi_close(info);
  return 0;
}

228 229 230 231 232 233 234

static my_bool
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
	       char *argument __attribute__((unused)))
{
  switch(optid) {
  case 'd':
235
    dump=1;
236 237
    complain(count || query);
    break;
238 239
  case 's':
    stats=1;
240 241
    complain(query!=0);
    break;
242
  case 'c':
243 244 245
    count= 1;
    complain(dump || query);
    break;
246
  case 'l':
247 248 249 250 251 252 253 254 255
    lstats=1;
    complain(query!=0);
    break;
  case '?':
  case 'h':
    usage();
  }
  return 0;
}
256

unknown's avatar
unknown committed
257
#include <help_start.h>

/*
  Print the one-line synopsis followed by the option help and the option
  variables from my_long_options, then terminate the program with exit
  status 1.  Never returns.
*/
static void usage(void)
{
  printf("Use: myisam_ftdump <table_name> <index_num>\n");
  my_print_help(my_long_options);
  my_print_variables(my_long_options);
  NETWARE_SET_SCREEN_MODE(1);
  exit(1);
}

#include <help_end.h>

/*
  Poor man's assertion for option conflicts.

  val  zero: do nothing and return;
       non-zero: print a message to stdout and exit with status 1.
*/
static void complain(int val)
{
  if (!val)
    return;

  fputs("You cannot use these options together!\n", stdout);
  exit(1);
}