diff --git a/lib/python/Products/ZCTextIndex/HTMLSplitter.py b/lib/python/Products/ZCTextIndex/HTMLSplitter.py
index 67913c8f6ccdcddb0f5de077644a611f8f1de3b8..81b5d40e51f0091f9fcc9c9bcf6370a8ef6f987f 100644
--- a/lib/python/Products/ZCTextIndex/HTMLSplitter.py
+++ b/lib/python/Products/ZCTextIndex/HTMLSplitter.py
@@ -21,14 +21,15 @@ class HTMLWordSplitter:
 
     __implements__ = ISplitter
 
-    def process(self, text, wordpat=r"\w+"):
+    def process(self, text, wordpat=r"(?L)\w+"):
         splat = []
         for t in text:
             splat += self._split(t, wordpat)
         return splat
 
     def processGlob(self, text):
-        return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
+        # see Lexicon.globToWordIds()
+        return self.process(text, r"(?L)\w+[\w*?]*")
 
     def _split(self, text, wordpat):
         text = text.lower()
diff --git a/lib/python/Products/ZCTextIndex/Lexicon.py b/lib/python/Products/ZCTextIndex/Lexicon.py
index cb2a9243e4cf179295b81b5c5e85ecaacaf1bb3a..fc9dd90bb1bb8bd3906f8545bf83bc2ee8def139 100644
--- a/lib/python/Products/ZCTextIndex/Lexicon.py
+++ b/lib/python/Products/ZCTextIndex/Lexicon.py
@@ -156,8 +156,8 @@ def _text2list(text):
 class Splitter:
 
     import re
-    rx = re.compile(r"\w+")
-    rxGlob = re.compile(r"\w+[\w*?]*") # See globToWordIds() above
+    rx = re.compile(r"(?L)\w+")
+    rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
 
     def process(self, lst):
         result = []
diff --git a/lib/python/Products/ZCTextIndex/tests/testLexicon.py b/lib/python/Products/ZCTextIndex/tests/testLexicon.py
index 75a8c347d8c9cdb29e1e92abdfb13b8ae13943b5..18ec114c335398b86496f157a68a0138a4099245 100644
--- a/lib/python/Products/ZCTextIndex/tests/testLexicon.py
+++ b/lib/python/Products/ZCTextIndex/tests/testLexicon.py
@@ -12,6 +12,7 @@
 #
 ##############################################################################
 
+import sys
 from unittest import TestCase, TestSuite, main, makeSuite
 
 from Products.ZCTextIndex.Lexicon import Lexicon
@@ -112,7 +113,30 @@ class Test(TestCase):
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('hsif')
         self.assertEqual(wids, [2])
-        
+
+    def testSplitterLocaleAwareness(self):
+        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
+        import locale
+        loc = locale.setlocale(locale.LC_ALL) # get current locale
+        try:
+            # set German locale
+            try:
+                if sys.platform != 'win32':
+                    locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
+                else:
+                    locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
+            except locale.Error:
+                return # German locale is not installed; nothing to check
+            # Byte escapes (Latin-1 values) keep this independent of the
+            # encoding the test file itself is stored in.
+            expected = ['m\xfclltonne', 'waschb\xe4r',
+                        'beh\xf6rde', '\xfcberflieger']
+            words = [' '.join(expected)]
+            words = Splitter().process(words)
+            self.assertEqual(words, expected)
+            words = HTMLWordSplitter().process(words)
+            self.assertEqual(words, expected)
+        finally:
+            # Always undo the process-wide locale change, even when an
+            # assertion above fails, so later tests are not affected.
+            locale.setlocale(locale.LC_ALL, loc) # restore saved locale
 
 def test_suite():
     return makeSuite(Test)
diff --git a/lib/python/Products/ZCTextIndex/tests/testStopper.py b/lib/python/Products/ZCTextIndex/tests/testStopper.py
index 991dde3593fc0d9cd2c9b21cfb485c8f643f283e..4ccb516265667e40cdecc6d7b0159a9514f3c7a8 100644
--- a/lib/python/Products/ZCTextIndex/tests/testStopper.py
+++ b/lib/python/Products/ZCTextIndex/tests/testStopper.py
@@ -1,3 +1,16 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
 """Tests for the C version of the StopWordRemover."""
 
 import unittest