From e4b0e224bd82679afff7fe34776da3b7266f7658 Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Thu, 8 Apr 2010 08:58:29 +0000
Subject: [PATCH] Output always safe html content.   * _safeHTML is removed   *
 The stripping is done inside convert method   * Conversion Cache is handled
 correctly

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34360 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/Document.py     | 64 ++-------------------------
 product/ERP5/Document/TextDocument.py | 41 ++++++++++++-----
 2 files changed, 35 insertions(+), 70 deletions(-)

diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index 1d4c52782a..e79580c190 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -490,7 +490,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
   href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
   body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
   title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
-  base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
   charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
 
   # Declarative security
@@ -1151,14 +1150,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
     """
     if not self.hasBaseData():
       raise ConversionError('This document has not been processed yet.')
-    try:
-      # FIXME: no substitution may occur in this case.
-      mime, data = self.getConversion(format='base-html')
-      return data
-    except KeyError:
-      kw['format'] = 'html'
-      mime, html = self.convert(**kw)
-      return html
+    kw['format'] = 'html'
+    mime, html = self.convert(**kw)
+    return html
 
   security.declareProtected(Permissions.View, 'asStrippedHTML')
   def asStrippedHTML(self, **kw):
@@ -1167,16 +1161,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
       (without html and body tags, etc.) which can be used to inline
       a preview of the document.
     """
-    if not self.hasBaseData():
-      return ''
-    try:
-      # FIXME: no substitution may occur in this case.
-      mime, data = self.getConversion(format='stripped-html')
-      return data
-    except KeyError:
-      kw['format'] = 'html'
-      mime, html = self.convert(**kw)
-      return self._stripHTML(str(html))
+    return self._stripHTML(self._asHTML(**kw))
 
   def _guessEncoding(self, string):
     """
@@ -1199,49 +1184,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
       stripped_html = body_list[0]
     else:
       stripped_html = html
-    # find charset and convert to utf-8
-    charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
-                                         # is datastream instance but hard to do better
-    if charset and not charset_list:
-      # Use optional parameter is we can not find encoding in HTML
-      charset_list = [charset]
-    if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
-      try:
-        stripped_html = unicode(str(stripped_html),
-                                charset_list[0]).encode('utf-8')
-      except (UnicodeDecodeError, LookupError):
-        return str(stripped_html)
     return stripped_html
 
-  def _safeHTML(self, html, format='text/x-html-safe', charset=None):
-    """
-      A private method to strip HTML content in safe mode,
-      w/o emmbed javascript, forms and any external plugins imports.
-      This should be used when we do not trust the user (Anonymous)
-      who push data into database.
-      - html: content to strip
-      - format: destination format
-      - charset: charset used to encode string. Take precedence
-      on charset values found in html string
-    """
-    portal = self.getPortalObject()
-    if charset is None:
-      # find charset
-      charset_list = self.charset_parser.findall(html)
-      if charset_list:
-        charset = charset_list[0]
-    if charset and charset not in ('utf-8', 'UTF-8'):
-      try:
-        safe_html_string = html.decode(charset).encode('utf-8')
-      except (UnicodeDecodeError, LookupError):
-        pass
-      else:
-        charset = 'utf-8' # Override charset if convertion succeeds
-    transform_tool = getToolByName(portal, 'portal_transforms')
-    safe_html_string = transform_tool.convertToData(format, html,
-                                                    encoding=charset)
-    return safe_html_string
-
   security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
   def getContentInformation(self):
     """
diff --git a/product/ERP5/Document/TextDocument.py b/product/ERP5/Document/TextDocument.py
index cf408b49bc..766bc96202 100644
--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -202,7 +202,8 @@ class TextDocument(Document, TextContent):
                                          **substitution_method_parameter_dict)
 
     security.declareProtected(Permissions.AccessContentsInformation, 'convert')
-    def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
+    def convert(self, format, substitution_method_parameter_dict=None,
+                safe_substitute=True, charset=None, text_content=None, **kw):
       """
         Convert text using portal_transforms or oood
       """
@@ -212,35 +213,55 @@ class TextDocument(Document, TextContent):
       if format == 'raw':
         return 'text/plain', self.getTextContent()
       portal = self.getPortalObject()
-      mime_type = getToolByName(portal, 'mimetypes_registry').lookupExtension('name.%s' % format)
-      mime_type = str(mime_type)
+      mime_type = getToolByName(portal, 'mimetypes_registry').\
+                                            lookupExtension('name.%s' % format)
+      original_mime_type = mime_type = str(mime_type)
       src_mimetype = self.getTextFormat(DEFAULT_TEXT_FORMAT)
       if not src_mimetype.startswith('text/'):
         src_mimetype = 'text/%s' % src_mimetype
-      # check if document has set text_content and convert if necessary
-      text_content = self.getTextContent()
+      if text_content is None:
+        # check if document has set text_content and convert if necessary
+        text_content = self.getTextContent()
       if text_content:
         if not self.hasConversion(format=format):
           portal_transforms = getToolByName(portal, 'portal_transforms')
           filename = self.getSourceReference(self.getTitleOrId())
+          if mime_type == 'text/html':
+            mime_type = 'text/x-html-safe'
+            if charset is None:
+              # find charset
+              charset_list = self.charset_parser.findall(text_content)
+              if charset_list:
+                charset = charset_list[0]
+            if charset and charset not in ('utf-8', 'UTF-8'):
+              try:
+                text_content = text_content.decode(charset).encode('utf-8')
+              except (UnicodeDecodeError, LookupError):
+                pass
+              else:
+                charset = 'utf-8' # Override charset if convertion succeeds
+                # change charset value in html_document as well
+                self.charset_parser.sub('utf-8', text_content)
           result = portal_transforms.convertToData(mime_type, text_content,
                                                    object=self, context=self,
                                                    filename=filename,
-                                                   mimetype=src_mimetype)
+                                                   mimetype=src_mimetype,
+                                                   encoding=charset)
           if result is None:
             raise ConversionError('TextDocument conversion error. '
-                                  'portal_transforms failed to convert to %s: %r' % (mime_type, self))
-          self.setConversion(result, mime_type, format=format)
+                                  'portal_transforms failed to convert'\
+                                  'to %s: %r' % (mime_type, self))
+          self.setConversion(result, original_mime_type, format=format)
         else:
           mime_type, result = self.getConversion(format=format)
         if substitution_method_parameter_dict is None:
           substitution_method_parameter_dict = {}
         result = self._substituteTextContent(result, safe_substitute=safe_substitute,
                                              **substitution_method_parameter_dict)
-        return mime_type, result
+        return original_mime_type, result
       else:
         # text_content is not set, return empty string instead of None
-        return mime_type, ''
+        return original_mime_type, ''
 
     def __call__(self):
       _setCacheHeaders(_ViewEmulator().__of__(self), {})
-- 
2.30.9