Enhance charset replacement with regular expression.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34372 20353a03-c40f-0410-a6d1-a30d3c3de9de

Enhance charset replacement with regular expression.
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34372 20353a03-c40f-0410-a6d1-a30d3c3de9de
3b7afff8 · Nicolas Delaby · dde241cd · 3b7afff8 · 3b7afff8
Commit 3b7afff8 authored Apr 08, 2010 by Nicolas Delaby
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 5 deletions

product/ERP5/Document/Document.py product/ERP5/Document/Document.py +1 -1

product/ERP5/Document/TextDocument.py product/ERP5/Document/TextDocument.py +11 -4

No files found.
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -490,7 +490,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
  href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
  body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
  title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
-  charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
+  charset_parser = re.compile('(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
  # Declarative security
  security = ClassSecurityInfo()

--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -230,9 +230,7 @@ class TextDocument(Document, TextContent):
            mime_type = 'text/x-html-safe'
            if charset is None:
              # find charset
-              charset_list = self.charset_parser.findall(text_content)
+              charset = self.charset_parser.search(text_content).group('charset')
-              if charset_list:
-                charset = charset_list[0]
            if charset and charset not in ('utf-8', 'UTF-8'):
              try:
                text_content = text_content.decode(charset).encode('utf-8')
@@ -241,7 +239,16 @@ class TextDocument(Document, TextContent):
              else:
                charset = 'utf-8' # Override charset if convertion succeeds
                # change charset value in html_document as well
-                self.charset_parser.sub('utf-8', text_content)
+                def subCharset(matchobj):
+                  keyword = matchobj.group('keyword')
+                  charset = matchobj.group('charset')
+                  if not (keyword or charset):
+                    # no match, return same string
+                    return matchobj.group(0)
+                  elif keyword:
+                    # if keyword is present, replace charset just after
+                    return keyword + 'utf-8'
+                text_content = self.charset_parser.sub(subCharset, text_content)
          result = portal_transforms.convertToData(mime_type, text_content,
                                                   object=self, context=self,
                                                   filename=filename,