Commit f6a8f1d5 authored by Julien Muchembled

safe_html: fix repairing with BeautifulSoup (+ some refactoring)

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41722 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4b5b7cb2
@@ -205,15 +205,14 @@ class StrippingParser(HTMLParser):
                 elif remove_script and hasScript(v):
                     if not self.raise_error: continue
                     else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
-                elif tag.lower() == 'meta' and k.lower() == 'content' and\
-                     self.default_encoding and self.default_encoding not in v:
-                    match = charset_parser.search(v)
-                    if match is not None:
-                        self.original_charset = match.group('charset')
-                    self.result.append(' %s="%s"' % (k,
-                        charset_parser.sub(CharsetReplacer(self.default_encoding), v)
-                        ,))
                 else:
+                    if tag.lower() == 'meta' and k.lower() == 'content' and \
+                       self.default_encoding and self.default_encoding not in v:
+                        match = charset_parser.search(v)
+                        if match is not None:
+                            self.original_charset = match.group('charset')
+                        v = charset_parser.sub(
+                            CharsetReplacer(self.default_encoding), v)
                     self.result.append(' %s="%s"' % (k, v))
 
             #UNUSED endTag = '</%s>' % tag
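Note on the hunk above: the refactoring moves the meta/content charset handling under the common else: branch, so the rewritten value is stored back into v and emitted by the single self.result.append() call. A minimal standalone sketch of that rewriting step; the regex and the CharsetReplacer body below are simplified assumptions, not the definitions from safe_html.py:

import re

# assumed pattern; safe_html.py defines its own charset_parser with a 'charset' group
charset_parser = re.compile(r'charset=(?P<charset>[\w-]+)', re.IGNORECASE)

class CharsetReplacer(object):
    # simplified stand-in for the module's CharsetReplacer
    def __init__(self, default_encoding):
        self.default_encoding = default_encoding

    def __call__(self, match):
        # keep the "charset=" prefix, swap only the encoding name
        return 'charset=%s' % self.default_encoding

def rewrite_meta_content(v, default_encoding):
    original_charset = None
    match = charset_parser.search(v)
    if match is not None:
        original_charset = match.group('charset')
    # the patch stores the substitution back into v instead of
    # formatting it inline into self.result
    v = charset_parser.sub(CharsetReplacer(default_encoding), v)
    return v, original_charset

print(rewrite_meta_content('text/html; charset=iso-8859-1', 'utf-8'))
# -> ('text/html; charset=utf-8', 'iso-8859-1')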
@@ -351,13 +350,11 @@ class SafeHTML:
             data.setData(orig)
             return data
 
-        html_string = orig
-        already_repaired = False
-        one_more_bullet_with_beautifulsoup = soupfromstring is not None
+        repaired = 0
         while True:
             try:
-                safe = scrubHTML(
-                    html_string,
+                orig = scrubHTML(
+                    orig,
                     valid=self.config.get('valid_tags', {}),
                     nasty=self.config.get('nasty_tags', {}),
                     remove_javascript=self.config.get('remove_javascript', True),
@@ -368,23 +365,9 @@ class SafeHTML:
                 break
             except HTMLParseError:
                 # ouch !
-                # HTMLParser is not able to parse very dirty HTML string,
-                # try to repair any broken html with help of lxml
-                if already_repaired and not one_more_bullet_with_beautifulsoup:
-                    # Even lxml nor BeautifulSoup doesn't perform miracles
-                    # so Give up !
-                    raise
-                elif already_repaired and one_more_bullet_with_beautifulsoup:
-                    # Is BeautifulSoup can perform miracles ?
-                    one_more_bullet_with_beautifulsoup = False
-                    # This function can raise the exception HTMLParseError.
-                    # So consider this parsing as last chance
-                    # to get parsable html.
-                    repaired_html_tree = soupfromstring(html_string)
-                    html_string = tostring(repaired_html_tree,
-                                           include_meta_content_type=True,
-                                           method='xml')
-                already_repaired = True
-                encoding = kwargs.get('encoding')
-                # recover parameter is equal to True by default
-                # in lxml API. I pass the argument to improve readability
+                # HTMLParser is not able to parse very dirty HTML string
+                if not repaired:
+                    # try to repair any broken html with help of lxml
+                    encoding = kwargs.get('encoding')
+                    # recover parameter is equal to True by default
+                    # in lxml API. I pass the argument to improve readability
@@ -393,17 +376,27 @@ class SafeHTML:
                         lparser = LHTMLParser(encoding=encoding, recover=True,
                                               remove_comments=True)
                     except LookupError:
-                        # Provided encoding is not known by parser, so discard it
+                        # Provided encoding is not known by parser so discard it
                         lparser = LHTMLParser(recover=True,
                                               remove_comments=True)
                     repaired_html_tree = etree.HTML(orig, parser=lparser)
-                html_string = tostring(repaired_html_tree,
-                                       include_meta_content_type=True,
-                                       method='xml')
+                elif repaired > (soupfromstring is not None):
+                    # Neither lxml nor BeautifulSoup worked so give up !
+                    raise
+                else:
+                    # Can BeautifulSoup perform miracles ?
+                    # This function may raise HTMLParseError.
+                    # So consider this parsing as last chance
+                    # to get parsable html.
+                    repaired_html_tree = soupfromstring(orig)
+                orig = tostring(repaired_html_tree,
+                                include_meta_content_type=True,
+                                method='xml')
+                repaired += 1
                 # avoid breaking now.
                 # continue into the loop with repaired html
             else:
-                data.setData(safe)
+                data.setData(orig)
                 break
         return data
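Note on the SafeHTML.convert() hunks: the three state variables (html_string, already_repaired, one_more_bullet_with_beautifulsoup) are replaced by a single repaired counter, and the lxml and BeautifulSoup repairs become mutually exclusive branches, so the BeautifulSoup output is no longer overwritten by the unconditional lxml pass that followed it in the old code. The test repaired > (soupfromstring is not None) relies on booleans counting as integers: give up after one failed repair when BeautifulSoup is unavailable, after two when it is. A standalone sketch of the new control flow, with scrub, lxml_repair and soup_repair as hypothetical stand-ins for scrubHTML, the lxml recover-parse plus tostring step, and soupfromstring plus tostring (so it runs without either library):

class HTMLParseError(Exception):
    pass

def convert(orig, scrub, lxml_repair, soup_repair=None):
    # mirrors only the retry logic introduced by this commit
    repaired = 0
    while True:
        try:
            orig = scrub(orig)
        except HTMLParseError:
            if not repaired:
                # first failure: let lxml's recovering parser rewrite the input
                orig = lxml_repair(orig)
            elif repaired > (soup_repair is not None):
                # bool counts as int: give up after 1 attempt without
                # BeautifulSoup, after 2 attempts with it
                raise
            else:
                # second failure: last chance, try BeautifulSoup
                orig = soup_repair(orig)
            repaired += 1
            # loop again and re-scrub the repaired markup
        else:
            return orig

# trivial demo: a scrubber that always succeeds returns the input unchanged
print(convert('<p>ok</p>', scrub=lambda s: s, lxml_repair=lambda s: s))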