Commit 768146f8 authored by Jim Fulton

Fixed bug in parsing code that led to buffer overflow and corrupted

memory.  Also made some minor optimizations in the same code.
parent a3ed0b73
...@@ -151,40 +151,47 @@ check_synstop(WordSequence *self, PyObject *word) ...@@ -151,40 +151,47 @@ check_synstop(WordSequence *self, PyObject *word)
return value; /* Which must be None! */ return value; /* Which must be None! */
} }
#define MAX_WORD 256
static PyObject * static PyObject *
next_word(WordSequence *self) next_word(WordSequence *self)
{ {
char wbuf[256]; char wbuf[MAX_WORD];
char *tmp_ptr; char *p, *end, *here, *b;
int size = PyString_Size(self->wordletters), i = 0; int size = PyString_Size(self->wordletters), i = 0;
PyObject *pyword, *res; PyObject *pyword, *res;
while (self->here < self->end) here=self->here;
end=self->end;
b=wbuf;
while (here < end)
{ {
/* skip hyphens */ /* skip hyphens */
if ((i > 0) && (*self->here == '-')) if ((i > 0) && (*here == '-'))
{ {
tmp_ptr = self->here; while ((*++here <= ' ') && (here < end));
while ((*(++tmp_ptr) <= ' ') && (tmp_ptr < self->end)); continue;
if ((tmp_ptr < self->end) && (tmp_ptr - self->here) > 1)
{
self->here = tmp_ptr;
}
} }
/* Check to see if this character is part of a word */ /* Check to see if this character is part of a word */
if (memchr(self->cwordletters, *self->here, size)) if (memchr(self->cwordletters, *here, size))
{ {
wbuf[i++] = *self->here; if(i++ < MAX_WORD) *b++ = *here;
} }
else if (i != 0) else if (i != 0)
{ {
if(i >= MAX_WORD)
{
/* Ridiculously long word! */
i=0;
b=wbuf;
here++;
continue;
}
/* We've found the end of a word */ /* Check for and skip a phone number (I know it's lame :) */
if(i == 8 &&
/* Check for a phone number (I know it's lame :) */ wbuf[0] >= '0' && wbuf[0] <= '9' &&
if(wbuf[0] >= '0' && wbuf[0] <= '9' &&
wbuf[1] >= '0' && wbuf[1] <= '9' && wbuf[1] >= '0' && wbuf[1] <= '9' &&
wbuf[2] >= '0' && wbuf[2] <= '9' && wbuf[2] >= '0' && wbuf[2] <= '9' &&
wbuf[3] == '-' && wbuf[3] == '-' &&
...@@ -194,23 +201,29 @@ next_word(WordSequence *self) ...@@ -194,23 +201,29 @@ next_word(WordSequence *self)
wbuf[7] >= '0' && wbuf[7] <= '9') wbuf[7] >= '0' && wbuf[7] <= '9')
{ {
i=0; i=0;
self->here++; b=wbuf;
here++;
continue; continue;
} }
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) UNLESS(pyword = PyString_FromStringAndSize(wbuf, i))
{ {
self->here=here;
return NULL; return NULL;
} }
/* We've found the end of a word */
UNLESS(res = check_synstop(self, pyword)) UNLESS(res = check_synstop(self, pyword))
{ {
self->here=here;
Py_DECREF(pyword); Py_DECREF(pyword);
return NULL; return NULL;
} }
if (res != Py_None) if (res != Py_None)
{ {
self->here=here;
Py_DECREF(pyword); Py_DECREF(pyword);
return res; return res;
} }
...@@ -220,11 +233,14 @@ next_word(WordSequence *self) ...@@ -220,11 +233,14 @@ next_word(WordSequence *self)
Py_DECREF(res); Py_DECREF(res);
Py_DECREF(pyword); Py_DECREF(pyword);
i = 0; i = 0;
b=wbuf;
} }
self->here++; here++;
} }
self->here=here;
/* We've reached the end of the string */ /* We've reached the end of the string */
if (i == 0) if (i == 0)
...@@ -538,7 +554,7 @@ static char WordSequence_module_documentation[] = ...@@ -538,7 +554,7 @@ static char WordSequence_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: WordSequence.c,v 1.8 1997/07/17 14:44:45 jim Exp $\n" "$Id: WordSequence.c,v 1.9 1997/07/31 22:31:58 jim Exp $\n"
; ;
void void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment