From a38c59c98518e5be735d7db275ea5d70121c22b4 Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Thu, 4 Mar 2010 17:50:37 +0000
Subject: [PATCH] Make safe_html transforms more robust against dirty HTML
 documents.

- If HTMLParser fails, let lxml take over the broken HTML and recover it,
  then feed the repaired document back to HTMLParser exactly once.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33407 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 .../PortalTransforms/transforms/safe_html.py | 52 ++++++++++++++-----
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/product/PortalTransforms/transforms/safe_html.py b/product/PortalTransforms/transforms/safe_html.py
index 5e24393594..1d5332c9ab 100644
--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import logging
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 import re
 from cgi import escape
 from zope.interface import implements
@@ -14,6 +14,9 @@ from Products.CMFDefault.utils import VALID_TAGS
 from Products.CMFDefault.utils import NASTY_TAGS
 from Products.PortalTransforms.utils import safeToInt
 
+from lxml import etree
+from lxml.etree import HTMLParser as LHTMLParser
+
 # tag mapping: tag -> short or long tag
 VALID_TAGS = VALID_TAGS.copy()
 NASTY_TAGS = NASTY_TAGS.copy()
@@ -256,17 +259,42 @@ class SafeHTML:
             data.setData(orig)
             return data
 
-        try:
-            safe = scrubHTML(
-                bodyfinder(orig),
-                valid=self.config.get('valid_tags', {}),
-                nasty=self.config.get('nasty_tags', {}),
-                remove_javascript=self.config.get('remove_javascript', True),
-                raise_error=False)
-        except IllegalHTML, inst:
-            data.setData(msg_pat % ("Error", str(inst)))
-        else:
-            data.setData(safe)
+        html_string = orig
+        already_repaired = False
+        while True:
+            try:
+                safe = scrubHTML(
+                    bodyfinder(html_string),
+                    valid=self.config.get('valid_tags', {}),
+                    nasty=self.config.get('nasty_tags', {}),
+                    remove_javascript=self.config.get('remove_javascript', True),
+                    raise_error=False)
+            except IllegalHTML, inst:
+                data.setData(msg_pat % ("Error", str(inst)))
+                break
+            except HTMLParseError:
+                # HTMLParser cannot cope with very dirty HTML strings,
+                # so try to repair the broken markup with the help of
+                # lxml before giving up.
+                if already_repaired:
+                    raise
+                already_repaired = True
+                encoding = kwargs.get('encoding')
+                # The recover parameter already defaults to True in the
+                # lxml API; it is passed explicitly here to make the
+                # intent of the code below obvious.
+                try:
+                    lparser = LHTMLParser(encoding=encoding, recover=True)
+                except LookupError:
+                    # The provided encoding is unknown to the parser, so discard it.
+                    lparser = LHTMLParser(recover=True)
+                repaired_html_tree = etree.HTML(orig, parser=lparser)
+                html_string = etree.tostring(repaired_html_tree)
+                # Do not break here: loop once more so the repaired
+                # HTML goes through HTMLParser again.
+            else:
+                data.setData(safe)
+                break
         return data
 
 def register():
-- 
2.30.9
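
The sketch below is a minimal, standalone illustration of the retry pattern the patch introduces: parse with the strict Python 2 HTMLParser, and on HTMLParseError rebuild a well-formed tree with lxml's recovering parser and retry exactly once. It assumes Python 2 and lxml are installed; scrub(), safe_convert() and the sample input are hypothetical stand-ins, not the scrubHTML/convert machinery from safe_html.py.

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2, matching the patch) of "repair once with lxml,
# then retry".  scrub() and safe_convert() are illustrative stand-ins only.
from HTMLParser import HTMLParser, HTMLParseError
from lxml import etree
from lxml.etree import HTMLParser as LHTMLParser


def scrub(html_string):
    # Stand-in for scrubHTML(): run the strict HTMLParser so that markup
    # it cannot parse raises HTMLParseError.
    parser = HTMLParser()
    parser.feed(html_string)
    parser.close()
    return html_string


def safe_convert(orig, encoding=None):
    html_string = orig
    already_repaired = False
    while True:
        try:
            return scrub(html_string)
        except HTMLParseError:
            if already_repaired:
                raise  # lxml could not help either; give up
            already_repaired = True
            try:
                lparser = LHTMLParser(encoding=encoding, recover=True)
            except LookupError:
                # Unknown encoding name: let lxml detect it instead.
                lparser = LHTMLParser(recover=True)
            # recover=True makes lxml rebuild a well-formed tree from the
            # broken input; serialize it and loop once more.
            html_string = etree.tostring(etree.HTML(orig, parser=lparser))


if __name__ == '__main__':
    # Clean input passes through unchanged; dirty input gets one lxml
    # repair pass before the strict parser sees it again.
    print safe_convert('<p>some <b>markup</b></p>')

The key design point mirrored from the patch is the already_repaired flag: the loop never repairs twice, so genuinely unparseable input still surfaces the original HTMLParseError instead of spinning forever.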