Treat fullwidth space characters defined in Unicode as valid whitespace.

Patch by Manabu TERADA.
parent db3e078e
...@@ -94,6 +94,11 @@ _tokenizer_regex = re.compile(r""" ...@@ -94,6 +94,11 @@ _tokenizer_regex = re.compile(r"""
) )
""", re.VERBOSE) """, re.VERBOSE)
# Unicode-aware twin of _tokenizer_regex: same pattern and flags, with
# re.UNICODE added, so that fullwidth space characters defined in Unicode
# are treated as valid whitespace.
_tokenizer_unicode_regex = re.compile(
    _tokenizer_regex.pattern,
    re.UNICODE | _tokenizer_regex.flags)
class QueryParser: class QueryParser:
implements(IQueryParser) implements(IQueryParser)
...@@ -109,6 +114,12 @@ class QueryParser: ...@@ -109,6 +114,12 @@ class QueryParser:
def parseQuery(self, query): def parseQuery(self, query):
# Lexical analysis. # Lexical analysis.
try:
# Try to use unicode and treat fullwidth whitespace as valid whitespace.
if not isinstance(query, unicode):
query = query.decode('utf-8')
tokens = _tokenizer_unicode_regex.findall(query)
except UnicodeDecodeError:
tokens = _tokenizer_regex.findall(query) tokens = _tokenizer_regex.findall(query)
self._tokens = tokens self._tokens = tokens
# classify tokens # classify tokens
......
...@@ -210,6 +210,18 @@ class TestQueryParser(TestQueryParserBase): ...@@ -210,6 +210,18 @@ class TestQueryParser(TestQueryParserBase):
self.expect("foo* bar", AndNode([GlobNode("foo*"), self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")])) AtomNode("bar")]))
def test024(self):
    """A UTF-8 encoded fullwidth space splits the query into two atoms.

    The byte sequence E3 80 80 is the UTF-8 encoding of U+3000; the
    query is expected to parse as an AndNode of the atoms "foo" and
    "bar".
    """
    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode
    query = "foo\xe3\x80\x80bar"
    expected = AndNode([AtomNode("foo"), AtomNode("bar")])
    self.expect(query, expected)
def test025(self):
    """A Unicode fullwidth space (U+3000) splits the query into two atoms.

    The query is passed as a unicode string and is expected to parse
    as an AndNode of the unicode atoms u"foo" and u"bar".
    """
    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode
    query = u"foo\u3000bar"
    expected = AndNode([AtomNode(u"foo"), AtomNode(u"bar")])
    self.expect(query, expected)
def test101(self): def test101(self):
self.failure("") self.failure("")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment