Commit 3b7afff8 authored by Nicolas Delaby

Enhance charset replacement with regular expression.


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34372 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent dde241cd
...@@ -490,7 +490,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S ...@@ -490,7 +490,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL) body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL) title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE) charset_parser = re.compile('(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
# Declarative security # Declarative security
security = ClassSecurityInfo() security = ClassSecurityInfo()
......
...@@ -230,9 +230,7 @@ class TextDocument(Document, TextContent): ...@@ -230,9 +230,7 @@ class TextDocument(Document, TextContent):
mime_type = 'text/x-html-safe' mime_type = 'text/x-html-safe'
if charset is None: if charset is None:
# find charset # find charset
charset_list = self.charset_parser.findall(text_content) charset = self.charset_parser.search(text_content).group('charset')
if charset_list:
charset = charset_list[0]
if charset and charset not in ('utf-8', 'UTF-8'): if charset and charset not in ('utf-8', 'UTF-8'):
try: try:
text_content = text_content.decode(charset).encode('utf-8') text_content = text_content.decode(charset).encode('utf-8')
...@@ -241,7 +239,16 @@ class TextDocument(Document, TextContent): ...@@ -241,7 +239,16 @@ class TextDocument(Document, TextContent):
else: else:
charset = 'utf-8' # Override charset if conversion succeeds charset = 'utf-8' # Override charset if conversion succeeds
# change charset value in html_document as well # change charset value in html_document as well
self.charset_parser.sub('utf-8', text_content) def subCharset(matchobj):
keyword = matchobj.group('keyword')
charset = matchobj.group('charset')
if not (keyword or charset):
# no match, return same string
return matchobj.group(0)
elif keyword:
# if keyword is present, replace charset just after
return keyword + 'utf-8'
text_content = self.charset_parser.sub(subCharset, text_content)
result = portal_transforms.convertToData(mime_type, text_content, result = portal_transforms.convertToData(mime_type, text_content,
object=self, context=self, object=self, context=self,
filename=filename, filename=filename,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment