Commit acc6b48e authored by Casey Duncan

Collector #597: Added locale support to the ZCTextIndex splitters. Added a unit test.

Added ZPL header to stopper test module
parent 2a75dd01
......@@ -21,14 +21,15 @@ class HTMLWordSplitter:
__implements__ = ISplitter
def process(self, text, wordpat=r"\w+"):
def process(self, text, wordpat=r"(?L)\w+"):
splat = []
for t in text:
splat += self._split(t, wordpat)
return splat
def processGlob(self, text):
return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
# see Lexicon.globToWordIds()
return self.process(text, r"(?L)\w+[\w*?]*")
def _split(self, text, wordpat):
text = text.lower()
......
......@@ -156,8 +156,8 @@ def _text2list(text):
class Splitter:
import re
rx = re.compile(r"\w+")
rxGlob = re.compile(r"\w+[\w*?]*") # See globToWordIds() above
rx = re.compile(r"(?L)\w+")
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
def process(self, lst):
result = []
......
......@@ -12,6 +12,7 @@
#
##############################################################################
import sys
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Lexicon import Lexicon
......@@ -112,7 +113,24 @@ class Test(TestCase):
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [2])
def testSplitterLocaleAwareness(self):
    """Check that the splitters treat German umlauts as word characters.

    The process-wide locale is switched to German for the duration of
    the test and is always restored afterwards, even when an assertion
    fails or the German locale cannot be set.

    NOTE(review): ``Splitter`` and ``sys`` are expected to be available
    at module level in the test module -- confirm against the imports.
    """
    from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
    import locale
    loc = locale.setlocale(locale.LC_ALL)  # get current locale
    try:
        # set German locale
        if sys.platform != 'win32':
            locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
        else:
            locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
        # The umlauts are spelled as Latin-1 hex escapes (\xfc = u-umlaut,
        # \xe4 = a-umlaut, \xf6 = o-umlaut) so the fixture does not depend
        # on the encoding of this source file.  Without the non-ASCII
        # letters the test would pass regardless of locale support.
        expected = [
            'm\xfclltonne', 'waschb\xe4r', 'beh\xf6rde', '\xfcberflieger']
        words = [' '.join(expected)]
        words = Splitter().process(words)
        self.assertEqual(words, expected)
        # Feeding the already-split words back in must leave them intact.
        words = HTMLWordSplitter().process(words)
        self.assertEqual(words, expected)
    finally:
        # restore saved locale so later tests are not affected
        locale.setlocale(locale.LC_ALL, loc)
def test_suite():
    """Build and return the unittest suite made from the ``Test`` class."""
    suite = makeSuite(Test)
    return suite
......
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Tests for the C version of the StopWordRemover."""
import unittest
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment