From e4b0e224bd82679afff7fe34776da3b7266f7658 Mon Sep 17 00:00:00 2001 From: Nicolas Delaby <nicolas@nexedi.com> Date: Thu, 8 Apr 2010 08:58:29 +0000 Subject: [PATCH] Output always safe html content. * _safeHTML is removed * The stripping is done inside convert method * Conversion Cache is handled correctly git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34360 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5/Document/Document.py | 64 ++------------------------- product/ERP5/Document/TextDocument.py | 41 ++++++++++++----- 2 files changed, 35 insertions(+), 70 deletions(-) diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py index 1d4c52782a..e79580c190 100644 --- a/product/ERP5/Document/Document.py +++ b/product/ERP5/Document/Document.py @@ -490,7 +490,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL) title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL) - base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL) charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE) # Declarative security @@ -1151,14 +1150,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S """ if not self.hasBaseData(): raise ConversionError('This document has not been processed yet.') - try: - # FIXME: no substitution may occur in this case. 
- mime, data = self.getConversion(format='base-html') - return data - except KeyError: - kw['format'] = 'html' - mime, html = self.convert(**kw) - return html + kw['format'] = 'html' + mime, html = self.convert(**kw) + return html security.declareProtected(Permissions.View, 'asStrippedHTML') def asStrippedHTML(self, **kw): @@ -1167,16 +1161,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S (without html and body tags, etc.) which can be used to inline a preview of the document. """ - if not self.hasBaseData(): - return '' - try: - # FIXME: no substitution may occur in this case. - mime, data = self.getConversion(format='stripped-html') - return data - except KeyError: - kw['format'] = 'html' - mime, html = self.convert(**kw) - return self._stripHTML(str(html)) + return self._stripHTML(self._asHTML(**kw)) def _guessEncoding(self, string): """ @@ -1199,49 +1184,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S stripped_html = body_list[0] else: stripped_html = html - # find charset and convert to utf-8 - charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this - # is datastream instance but hard to do better - if charset and not charset_list: - # Use optional parameter is we can not find encoding in HTML - charset_list = [charset] - if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'): - try: - stripped_html = unicode(str(stripped_html), - charset_list[0]).encode('utf-8') - except (UnicodeDecodeError, LookupError): - return str(stripped_html) return stripped_html - def _safeHTML(self, html, format='text/x-html-safe', charset=None): - """ - A private method to strip HTML content in safe mode, - w/o emmbed javascript, forms and any external plugins imports. - This should be used when we do not trust the user (Anonymous) - who push data into database. - - html: content to strip - - format: destination format - - charset: charset used to encode string. 
Take precedence - on charset values found in html string - """ - portal = self.getPortalObject() - if charset is None: - # find charset - charset_list = self.charset_parser.findall(html) - if charset_list: - charset = charset_list[0] - if charset and charset not in ('utf-8', 'UTF-8'): - try: - safe_html_string = html.decode(charset).encode('utf-8') - except (UnicodeDecodeError, LookupError): - pass - else: - charset = 'utf-8' # Override charset if convertion succeeds - transform_tool = getToolByName(portal, 'portal_transforms') - safe_html_string = transform_tool.convertToData(format, html, - encoding=charset) - return safe_html_string - security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation') def getContentInformation(self): """ diff --git a/product/ERP5/Document/TextDocument.py b/product/ERP5/Document/TextDocument.py index cf408b49bc..766bc96202 100644 --- a/product/ERP5/Document/TextDocument.py +++ b/product/ERP5/Document/TextDocument.py @@ -202,7 +202,8 @@ class TextDocument(Document, TextContent): **substitution_method_parameter_dict) security.declareProtected(Permissions.AccessContentsInformation, 'convert') - def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw): + def convert(self, format, substitution_method_parameter_dict=None, + safe_substitute=True, charset=None, text_content=None, **kw): """ Convert text using portal_transforms or oood """ @@ -212,35 +213,55 @@ class TextDocument(Document, TextContent): if format == 'raw': return 'text/plain', self.getTextContent() portal = self.getPortalObject() - mime_type = getToolByName(portal, 'mimetypes_registry').lookupExtension('name.%s' % format) - mime_type = str(mime_type) + mime_type = getToolByName(portal, 'mimetypes_registry').\ + lookupExtension('name.%s' % format) + original_mime_type = mime_type = str(mime_type) src_mimetype = self.getTextFormat(DEFAULT_TEXT_FORMAT) if not src_mimetype.startswith('text/'): src_mimetype = 
'text/%s' % src_mimetype - # check if document has set text_content and convert if necessary - text_content = self.getTextContent() + if text_content is None: + # check if document has set text_content and convert if necessary + text_content = self.getTextContent() if text_content: if not self.hasConversion(format=format): portal_transforms = getToolByName(portal, 'portal_transforms') filename = self.getSourceReference(self.getTitleOrId()) + if mime_type == 'text/html': + mime_type = 'text/x-html-safe' + if charset is None: + # find charset + charset_list = self.charset_parser.findall(text_content) + if charset_list: + charset = charset_list[0] + if charset and charset not in ('utf-8', 'UTF-8'): + try: + text_content = text_content.decode(charset).encode('utf-8') + except (UnicodeDecodeError, LookupError): + pass + else: + charset = 'utf-8' # Override charset if convertion succeeds + # change charset value in html_document as well + self.charset_parser.sub('utf-8', text_content) result = portal_transforms.convertToData(mime_type, text_content, object=self, context=self, filename=filename, - mimetype=src_mimetype) + mimetype=src_mimetype, + encoding=charset) if result is None: raise ConversionError('TextDocument conversion error. 
' - 'portal_transforms failed to convert to %s: %r' % (mime_type, self)) - self.setConversion(result, mime_type, format=format) + 'portal_transforms failed to convert'\ + 'to %s: %r' % (mime_type, self)) + self.setConversion(result, original_mime_type, format=format) else: mime_type, result = self.getConversion(format=format) if substitution_method_parameter_dict is None: substitution_method_parameter_dict = {} result = self._substituteTextContent(result, safe_substitute=safe_substitute, **substitution_method_parameter_dict) - return mime_type, result + return original_mime_type, result else: # text_content is not set, return empty string instead of None - return mime_type, '' + return original_mime_type, '' def __call__(self): _setCacheHeaders(_ViewEmulator().__of__(self), {}) -- 2.30.9