Commit b255c894 authored by Julien Muchembled

PortalTransforms: merge upstream 2.0

This fixes test_20_reStructuredText partially.

Conflicts:
	Products/PortalTransforms/TransformEngine.py
	Products/PortalTransforms/libtransforms/commandtransform.py
	Products/PortalTransforms/transforms/safe_html.py
	Products/PortalTransforms/utils.py

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41726 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4adafd42
This diff is collapsed.
...@@ -3,9 +3,15 @@ ...@@ -3,9 +3,15 @@
from time import time from time import time
from Acquisition import aq_base from Acquisition import aq_base
_marker = object()
class Cache: class Cache:
def __init__(self, context, _id='_v_transform_cache'): def __init__(self, obj, context=None, _id='_v_transform_cache'):
self.obj = obj
if context is None:
self.context = obj
else:
self.context = context self.context = context
self._id =_id self._id =_id
...@@ -17,17 +23,19 @@ class Cache: ...@@ -17,17 +23,19 @@ class Cache:
key = key.replace('+', '_') key = key.replace('+', '_')
key = key.replace('-', '_') key = key.replace('-', '_')
key = key.replace(' ', '_') key = key.replace(' ', '_')
if hasattr(aq_base(self.context), 'absolute_url'):
return key, self.context.absolute_url()
return key return key
def setCache(self, key, value): def setCache(self, key, value):
"""cache a value indexed by key""" """cache a value indexed by key"""
if not value.isCacheable(): if not value.isCacheable():
return return
context = self.context obj = self.obj
key = self._genCacheKey(key) key = self._genCacheKey(key)
if getattr(aq_base(context), self._id, None) is None: if getattr(aq_base(obj), self._id, None) is None:
setattr(context, self._id, {}) setattr(obj, self._id, {})
getattr(context, self._id)[key] = (time(), value) getattr(obj, self._id)[key] = (time(), value)
return key return key
def getCache(self, key): def getCache(self, key):
...@@ -36,9 +44,9 @@ class Cache: ...@@ -36,9 +44,9 @@ class Cache:
return None if not present return None if not present
else return a tuple (time spent in cache, value) else return a tuple (time spent in cache, value)
""" """
context = self.context obj = self.obj
key = self._genCacheKey(key) key = self._genCacheKey(key)
dict = getattr(context, self._id, None) dict = getattr(obj, self._id, None)
if dict is None : if dict is None :
return None return None
try: try:
...@@ -50,14 +58,14 @@ class Cache: ...@@ -50,14 +58,14 @@ class Cache:
def purgeCache(self, key=None): def purgeCache(self, key=None):
"""Remove cache """Remove cache
""" """
context = self.context obj = self.obj
id = self._id id = self._id
if not shasattr(context, id): if getattr(obj, id, _marker) is _marker:
return return
if key is None: if key is None:
delattr(context, id) delattr(obj, id)
else: else:
cache = getattr(context, id) cache = getattr(obj, id)
key = self._genCacheKey(key) key = self._genCacheKey(key)
if cache.has_key(key): if cache.has_key(key):
del cache[key] del cache[key]
...@@ -87,6 +87,8 @@ class popentransform: ...@@ -87,6 +87,8 @@ class popentransform:
def convert(self, data, cache, **kwargs): def convert(self, data, cache, **kwargs):
command = "%s %s" % (self.binary, self.binaryArgs) command = "%s %s" % (self.binary, self.binaryArgs)
tmpname = None
try:
if not self.useStdin: if not self.useStdin:
tmpfile, tmpname = tempfile.mkstemp(text=False) # create tmp tmpfile, tmpname = tempfile.mkstemp(text=False) # create tmp
os.write(tmpfile, data) # write data to tmp using a file descriptor os.write(tmpfile, data) # write data to tmp using a file descriptor
...@@ -103,12 +105,12 @@ class popentransform: ...@@ -103,12 +105,12 @@ class popentransform:
out = self.getData(couterr) out = self.getData(couterr)
couterr.close() couterr.close()
if not self.useStdin:
# remove tmp file
os.unlink(tmpname)
cache.setData(out) cache.setData(out)
return cache return cache
finally:
if not self.useStdin and tmpname is not None:
# remove tmp file
os.unlink(tmpname)
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
import shlex import shlex
......
import re import re
import os import os
import sys import sys
from sgmllib import SGMLParser from sgmllib import SGMLParser, SGMLParseError
try: try:
# Need to be imported before win32api to avoid dll loading # Need to be imported before win32api to avoid dll loading
...@@ -208,6 +208,25 @@ class StrippingParser( SGMLParser ): ...@@ -208,6 +208,25 @@ class StrippingParser( SGMLParser ):
self.result = "%s</%s>" % (self.result, tag) self.result = "%s</%s>" % (self.result, tag)
remTag = '</%s>' % tag remTag = '</%s>' % tag
def parse_declaration(self, i):
"""Fix handling of CDATA sections. Code borrowed from BeautifulSoup.
"""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1:
k = len(self.rawdata)
data = self.rawdata[i+9:k]
j = k+3
self.result.append("<![CDATA[%s]]>" % data)
else:
try:
j = SGMLParser.parse_declaration(self, i)
except SGMLParseError:
toHandle = self.rawdata[i:]
self.result.append(toHandle)
j = i + len(toHandle)
return j
def scrubHTML( html ): def scrubHTML( html ):
""" Strip illegal HTML tags from string text. """ """ Strip illegal HTML tags from string text. """
......
## Testing Markdown ## Testing Markdown
`code` and _italic_ and *bold* and even a [link](http://plone.org). `code` and _italic_ and *bold* and even a [link](http://plone.org).
Fööbär
...@@ -15,6 +15,10 @@ ...@@ -15,6 +15,10 @@
</tr> </tr>
</table> </table>
<p>This is a text used as a blind text.</p> <p>This is a text used as a blind text.</p>
<div><![CDATA[
Some CDATA text.
]]>
</div>
<ul> <ul>
<li>A sample list item1</li> <li>A sample list item1</li>
<li>A sample list item2</li> <li>A sample list item2</li>
......
...@@ -3,4 +3,5 @@ ...@@ -3,4 +3,5 @@
<h2> Testing Markdown </h2> <h2> Testing Markdown </h2>
<p> <code>code</code> and <em>italic</em> and <em>bold</em> and even a <a href="http://plone.org">link</a>. <p> <code>code</code> and <em>italic</em> and <em>bold</em> and even a <a href="http://plone.org">link</a>.
</p> </p>
<p>Fööbär</p>
<h2 class="title">Heading 1</h2> <h2 class="title">Heading 1</h2>
<p>Some text.</p> <p>Some text.</p>
<div class="section"> <div class="section" id="heading-2">
<h3><a id="heading-2" name="heading-2">Heading 2</a></h3> <h3>Heading 2</h3>
<p>Some text, bla ble bli blo blu. Yes, i know this is <a class="reference" href="http://www.example.com">Stupid</a>.</p> <p>Some text, bla ble bli blo blu. Yes, i know this is<a class="reference external" href="http://www.example.com">Stupid</a>.</p>
</div> </div>
<h2 class="title">Title</h2> <h2 class="title">Title</h2>
<h3 class="subtitle">Subtitle</h3> <h3 class="subtitle">Subtitle</h3>
<p>This is a test document to make sure subtitle gets the right heading.</p> <p>This is a test document to make sure subtitle gets the right heading.</p>
<div class="section"> <div class="section" id="now-the-real-heading">
<h3><a id="now-the-real-heading" name="now-the-real-heading">Now the real heading</a></h3> <h3>Now the real heading</h3>
<p>The brown fox jumped over the lazy dog.</p> <p>The brown fox jumped over the lazy dog.</p>
<div class="section"> <div class="section" id="with-a-subheading">
<h4><a id="with-a-subheading" name="with-a-subheading">With a subheading</a></h4> <h4>With a subheading</h4>
<p>Some text, bla ble bli blo blu. Yes, i know this is <a class="reference" href="http://www.example.com">Stupid</a>.</p> <p>Some text, bla ble bli blo blu. Yes, i know this is<a class="reference external" href="http://www.example.com">Stupid</a>.</p>
</div> </div>
</div> </div>
...@@ -6,6 +6,10 @@ ...@@ -6,6 +6,10 @@
</tr> </tr>
</table> </table>
<p>This is a text used as a blind text.</p> <p>This is a text used as a blind text.</p>
<div><![CDATA[
Some CDATA text.
]]>
</div>
<ul> <ul>
<li>A sample list item1</li> <li>A sample list item1</li>
<li>A sample list item2</li> <li>A sample list item2</li>
......
...@@ -67,6 +67,15 @@ class DummyHtmlFilter2(BaseTransform): ...@@ -67,6 +67,15 @@ class DummyHtmlFilter2(BaseTransform):
data.setData("<div class='dummy'>%s</div>" % orig) data.setData("<div class='dummy'>%s</div>" % orig)
return data return data
class QuxToVHost(DummyHtmlFilter1):
__name__ = 'qux_to_vhost'
def convert(self, orig, data, context, **kwargs):
data.setData(re.sub('qux', context.REQUEST['SERVER_URL'], orig))
return data
class TransformNoIO(BaseTransform): class TransformNoIO(BaseTransform):
implements(ITransform) implements(ITransform)
...@@ -223,6 +232,52 @@ class TestEngine(ATSiteTestCase): ...@@ -223,6 +232,52 @@ class TestEngine(ATSiteTestCase):
out = self.engine.convertTo(mt, other_data, mimetype=mt, object=self) out = self.engine.convertTo(mt, other_data, mimetype=mt, object=self)
self.failUnlessEqual(out.getData(), other_data, out.getData()) self.failUnlessEqual(out.getData(), other_data, out.getData())
def testCacheWithVHost(self):
"""Ensure that the transform cache key includes virtual
hosting so that transforms which are dependent on the virtual
hosting don't get invalid data from the cache. This happens,
for example, in the resolve UID functionality used by visual
editors."""
mt = 'text/x-html-safe'
self.engine.registerTransform(QuxToVHost())
required = ['qux_to_vhost']
self.engine.manage_addPolicy(mt, required)
data = '<a href="qux">vhost link</a>'
out = self.engine.convertTo(
mt, data, mimetype='text/html', object=self.folder,
context=self.folder)
self.failUnlessEqual(
out.getData(), '<a href="http://nohost">vhost link</a>',
out.getData())
# Test when object is not a context
out = self.engine.convertTo(
mt, data, mimetype='text/html', object=self,
context=self.folder)
self.failUnlessEqual(
out.getData(), '<a href="http://nohost">vhost link</a>',
out.getData())
# Change the virtual hosting
self.folder.REQUEST['SERVER_URL'] = 'http://otherhost'
out = self.engine.convertTo(
mt, data, mimetype='text/html', object=self.folder,
context=self.folder)
self.failUnlessEqual(
out.getData(), '<a href="http://otherhost">vhost link</a>',
out.getData())
# Test when object is not a context
out = self.engine.convertTo(
mt, data, mimetype='text/html', object=self,
context=self.folder)
self.failUnlessEqual(
out.getData(), '<a href="http://otherhost">vhost link</a>',
out.getData())
def test_suite(): def test_suite():
from unittest import TestSuite, makeSuite from unittest import TestSuite, makeSuite
......
...@@ -16,6 +16,87 @@ class TestGraph(ATSiteTestCase): ...@@ -16,6 +16,87 @@ class TestGraph(ATSiteTestCase):
out = self.engine.convertTo('text/plain', data, filename=FILE_PATH) out = self.engine.convertTo('text/plain', data, filename=FILE_PATH)
self.failUnless(out.getData()) self.failUnless(out.getData())
def testFindPath(self):
originalMap = self.engine._mtmap
"""
The dummy map used for this test corresponds to a graph
depicted in ASCII art below :
+---+
| |
| v
+-->1<-->2-->4-->6<--7
^ ^ |
| | |
v | |
3<---+ |
^ |
| |
v |
5<-------+
"""
# we need a DummyTransform class
class DT:
def __init__(self, name):
self._name = name
def name(self):
return self._name
dummyMap1 = {
'1': { '1': [DT('transform1-1')],
'2': [DT('transform1-2')],
'3': [DT('transform1-3')]},
'2': { '1': [DT('transform2-1')],
'3': [DT('transform2-3')],
'4': [DT('transform2-4')]},
'3': { '1': [DT('transform3-1')],
'2': [DT('transform3-2')],
'5': [DT('transform3-5')]},
'4': { '5': [DT('transform4-5')],
'6': [DT('transform4-6')]},
'5': { '3': [DT('transform5-3')]},
'7': { '6': [DT('transform7-6')]}
}
expectedPathes = {
'1-1': [],
'1-2': ['transform1-2'],
'1-3': ['transform1-3'],
'1-4': ['transform1-2', 'transform2-4'],
'1-5': ['transform1-3', 'transform3-5'],
'1-6': ['transform1-2', 'transform2-4', 'transform4-6'],
'1-7': None,
'2-1': ['transform2-1'],
'2-2': [],
'2-4': ['transform2-4'],
'4-2': ['transform4-5', 'transform5-3', 'transform3-2'],
'5-3': ['transform5-3']
}
self.engine._mtmap = dummyMap1
for orig in ['1','2','3','4','5','6','7']:
for target in ['1','2','3','4','5','6','7']:
# build the name of the path
pathName = orig + '-' + target
# do we have any expectation for this path ?
if pathName in expectedPathes.keys():
# we do. Here is the expected shortest path
expectedPath = expectedPathes[pathName]
# what's the shortest path according to the engine ?
gotPath = self.engine._findPath(orig,target)
# just keep the name of the transforms, please
if gotPath is not None:
gotPath = [transform.name() for transform in gotPath]
# this must be the same as in our expectation
self.assertEquals(expectedPath, gotPath)
self.engine._mtmap = originalMap
def testFindPathWithEmptyTransform(self):
""" _findPath should not throw "index out of range" when dealing with
empty transforms list
"""
dummyMap = {'1': {'2': []}}
self.engine._mtmap = dummyMap
self.engine._findPath('1','2')
def testIdentity(self): def testIdentity(self):
orig = 'Some text' orig = 'Some text'
converted = self.engine.convertTo( converted = self.engine.convertTo(
......
import os import os
import logging import logging
from Testing import ZopeTestCase
from Products.Archetypes.tests.atsitetestcase import ATSiteTestCase from Products.Archetypes.tests.atsitetestcase import ATSiteTestCase
from Products.CMFCore.utils import getToolByName
from utils import input_file_path, output_file_path, normalize_html,\ from utils import input_file_path, output_file_path, normalize_html,\
load, matching_inputs load, matching_inputs
from Products.PortalTransforms.data import datastream from Products.PortalTransforms.data import datastream
from Products.PortalTransforms.interfaces import IDataStream from Products.PortalTransforms.interfaces import IDataStream
from Products.PortalTransforms.interfaces import idatastream
from Products.MimetypesRegistry.MimeTypesTool import MimeTypesTool
from Products.PortalTransforms.TransformEngine import TransformTool
from Products.PortalTransforms.libtransforms.utils import MissingBinary from Products.PortalTransforms.libtransforms.utils import MissingBinary
from Products.PortalTransforms.transforms.image_to_gif import image_to_gif from Products.PortalTransforms.transforms.image_to_gif import image_to_gif
...@@ -24,7 +21,6 @@ from Products.PortalTransforms.transforms.textile_to_html import HAS_TEXTILE ...@@ -24,7 +21,6 @@ from Products.PortalTransforms.transforms.textile_to_html import HAS_TEXTILE
from Products.PortalTransforms.transforms.markdown_to_html import HAS_MARKDOWN from Products.PortalTransforms.transforms.markdown_to_html import HAS_MARKDOWN
from os.path import exists from os.path import exists
import sys
# we have to set locale because lynx output is locale sensitive ! # we have to set locale because lynx output is locale sensitive !
os.environ['LC_ALL'] = 'C' os.environ['LC_ALL'] = 'C'
logger = logging.getLogger('PortalTransforms') logger = logging.getLogger('PortalTransforms')
...@@ -59,9 +55,11 @@ class TransformTest(ATSiteTestCase): ...@@ -59,9 +55,11 @@ class TransformTest(ATSiteTestCase):
got = self.normalize(got) got = self.normalize(got)
output.close() output.close()
self.assertEquals(got, expected, got_start = got.strip()[:30]
expected_start = expected.strip()[:30]
self.assertEquals(got_start, expected_start,
'[%s]\n\n!=\n\n[%s]\n\nIN %s(%s)' % ( '[%s]\n\n!=\n\n[%s]\n\nIN %s(%s)' % (
got, expected, self.transform.name(), self.input)) got_start, expected_start, self.transform.name(), self.input))
self.assertEquals(self.subobjects, len(res_data.getSubObjects()), self.assertEquals(self.subobjects, len(res_data.getSubObjects()),
'%s\n\n!=\n\n%s\n\nIN %s(%s)' % ( '%s\n\n!=\n\n%s\n\nIN %s(%s)' % (
self.subobjects, len(res_data.getSubObjects()), self.subobjects, len(res_data.getSubObjects()),
...@@ -70,13 +68,13 @@ class TransformTest(ATSiteTestCase): ...@@ -70,13 +68,13 @@ class TransformTest(ATSiteTestCase):
def testSame(self): def testSame(self):
try: try:
self.do_convert(filename=self.input) self.do_convert(filename=self.input)
except MissingBinary, e: except MissingBinary:
pass pass
def testSameNoFilename(self): def testSameNoFilename(self):
try: try:
self.do_convert() self.do_convert()
except MissingBinary, e: except MissingBinary:
pass pass
def __repr__(self): def __repr__(self):
...@@ -86,12 +84,13 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -86,12 +84,13 @@ class PILTransformsTest(ATSiteTestCase):
def afterSetUp(self): def afterSetUp(self):
ATSiteTestCase.afterSetUp(self) ATSiteTestCase.afterSetUp(self)
self.pt = self.portal.portal_transforms self.pt = self.portal.portal_transforms
self.mimetypes_registry = getToolByName(self.portal, 'mimetypes_registry')
def test_image_to_bmp(self): def test_image_to_bmp(self):
self.pt.registerTransform(image_to_bmp()) self.pt.registerTransform(image_to_bmp())
imgFile = open(input_file_path('logo.jpg'), 'rb') imgFile = open(input_file_path('logo.jpg'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
data = self.pt.convertTo(target_mimetype='image/x-ms-bmp',orig=data) data = self.pt.convertTo(target_mimetype='image/x-ms-bmp',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-ms-bmp') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-ms-bmp')
...@@ -99,7 +98,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -99,7 +98,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_gif()) self.pt.registerTransform(image_to_gif())
imgFile = open(input_file_path('logo.png'), 'rb') imgFile = open(input_file_path('logo.png'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/png') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/png')
data = self.pt.convertTo(target_mimetype='image/gif',orig=data) data = self.pt.convertTo(target_mimetype='image/gif',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/gif') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/gif')
...@@ -107,7 +106,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -107,7 +106,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_jpeg()) self.pt.registerTransform(image_to_jpeg())
imgFile = open(input_file_path('logo.gif'), 'rb') imgFile = open(input_file_path('logo.gif'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/gif') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/gif')
data = self.pt.convertTo(target_mimetype='image/jpeg',orig=data) data = self.pt.convertTo(target_mimetype='image/jpeg',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/jpeg') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/jpeg')
...@@ -115,7 +114,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -115,7 +114,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_png()) self.pt.registerTransform(image_to_png())
imgFile = open(input_file_path('logo.jpg'), 'rb') imgFile = open(input_file_path('logo.jpg'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
data = self.pt.convertTo(target_mimetype='image/png',orig=data) data = self.pt.convertTo(target_mimetype='image/png',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/png') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/png')
...@@ -123,7 +122,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -123,7 +122,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_pcx()) self.pt.registerTransform(image_to_pcx())
imgFile = open(input_file_path('logo.gif'), 'rb') imgFile = open(input_file_path('logo.gif'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/gif') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/gif')
data = self.pt.convertTo(target_mimetype='image/pcx',orig=data) data = self.pt.convertTo(target_mimetype='image/pcx',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/pcx') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/pcx')
...@@ -131,7 +130,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -131,7 +130,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_ppm()) self.pt.registerTransform(image_to_ppm())
imgFile = open(input_file_path('logo.png'), 'rb') imgFile = open(input_file_path('logo.png'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/png') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/png')
data = self.pt.convertTo(target_mimetype='image/x-portable-pixmap',orig=data) data = self.pt.convertTo(target_mimetype='image/x-portable-pixmap',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-portable-pixmap') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/x-portable-pixmap')
...@@ -139,7 +138,7 @@ class PILTransformsTest(ATSiteTestCase): ...@@ -139,7 +138,7 @@ class PILTransformsTest(ATSiteTestCase):
self.pt.registerTransform(image_to_tiff()) self.pt.registerTransform(image_to_tiff())
imgFile = open(input_file_path('logo.jpg'), 'rb') imgFile = open(input_file_path('logo.jpg'), 'rb')
data = imgFile.read() data = imgFile.read()
self.failUnlessEqual(self.portal.mimetypes_registry.classify(data),'image/jpeg') self.failUnlessEqual(self.mimetypes_registry.classify(data),'image/jpeg')
data = self.pt.convertTo(target_mimetype='image/tiff',orig=data) data = self.pt.convertTo(target_mimetype='image/tiff',orig=data)
self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/tiff') self.failUnlessEqual(data.getMetadata()['mimetype'], 'image/tiff')
......
...@@ -5,6 +5,7 @@ from sys import modules ...@@ -5,6 +5,7 @@ from sys import modules
from os.path import join, abspath, dirname, basename from os.path import join, abspath, dirname, basename
def normalize_html(s): def normalize_html(s):
s = re.sub(r"&nbsp;", " ", s)
s = re.sub(r"\s+", " ", s) s = re.sub(r"\s+", " ", s)
s = re.sub(r"(?s)\s+<", "<", s) s = re.sub(r"(?s)\s+<", "<", s)
s = re.sub(r"(?s)>\s+", ">", s) s = re.sub(r"(?s)>\s+", ">", s)
......
""" """
Uses the http://www.freewisdom.org/projects/python-markdown/ module to do its handy work Uses the http://www.freewisdom.org/projects/python-markdown/ module
author: Tom Lazar <tom@tomster.org> at the archipelago sprint 2006
Author: Tom Lazar <tom@tomster.org> at the archipelago sprint 2006
""" """
import os
from zope.interface import implements from zope.interface import implements
from Products.CMFDefault.utils import bodyfinder
from Products.PortalTransforms.interfaces import ITransform from Products.PortalTransforms.interfaces import ITransform
from Products.PortalTransforms.libtransforms.commandtransform import commandtransform
from Products.PortalTransforms.libtransforms.utils import bin_search
from Products.PortalTransforms.libtransforms.utils import sansext
from Products.PortalTransforms.utils import log from Products.PortalTransforms.utils import log
try: try:
...@@ -37,11 +30,16 @@ class markdown: ...@@ -37,11 +30,16 @@ class markdown:
def convert(self, orig, data, **kwargs): def convert(self, orig, data, **kwargs):
if HAS_MARKDOWN: if HAS_MARKDOWN:
html = markdown_transformer.markdown(orig) # markdown expects unicode input:
orig = unicode(orig.decode('utf-8'))
# PortalTransforms, however expects a string as result,
# so we encode the unicode result back to UTF8:
html = markdown_transformer.markdown(orig).encode('utf-8')
else: else:
html = orig html = orig
data.setData(html) data.setData(html)
return data return data
def register(): def register():
return markdown() return markdown()
import re, tempfile import os
import os, os.path from Products.PortalTransforms.libtransforms.utils import bodyfinder, scrubHTML
from Products.PortalTransforms.libtransforms.utils import bin_search, \
sansext, bodyfinder, scrubHTML
from Products.PortalTransforms.libtransforms.commandtransform import commandtransform from Products.PortalTransforms.libtransforms.commandtransform import commandtransform
class document(commandtransform): class document(commandtransform):
......
...@@ -31,7 +31,7 @@ VALID_TAGS['ins'] = 1 ...@@ -31,7 +31,7 @@ VALID_TAGS['ins'] = 1
VALID_TAGS['del'] = 1 VALID_TAGS['del'] = 1
VALID_TAGS['q'] = 1 VALID_TAGS['q'] = 1
VALID_TAGS['map'] = 1 VALID_TAGS['map'] = 1
VALID_TAGS['area'] = 1 VALID_TAGS['area'] = 0
VALID_TAGS['abbr'] = 1 VALID_TAGS['abbr'] = 1
VALID_TAGS['acronym'] = 1 VALID_TAGS['acronym'] = 1
VALID_TAGS['var'] = 1 VALID_TAGS['var'] = 1
...@@ -71,6 +71,10 @@ VALID_TAGS['source'] = 1 ...@@ -71,6 +71,10 @@ VALID_TAGS['source'] = 1
VALID_TAGS['time'] = 1 VALID_TAGS['time'] = 1
VALID_TAGS['video'] = 1 VALID_TAGS['video'] = 1
# add some tags to nasty. These should also probably be backported to CMFDefault.
NASTY_TAGS['style'] = 1 # this helps improve Word HTML cleanup.
NASTY_TAGS['meta'] = 1 # allowed by parsers, but can cause unexpected behavior
msg_pat = """ msg_pat = """
<div class="system-message"> <div class="system-message">
...@@ -203,7 +207,7 @@ class StrippingParser(HTMLParser): ...@@ -203,7 +207,7 @@ class StrippingParser(HTMLParser):
if not self.raise_error: continue if not self.raise_error: continue
else: raise IllegalHTML, 'Script event "%s" not allowed.' % k else: raise IllegalHTML, 'Script event "%s" not allowed.' % k
elif v is None: elif v is None:
self.result.append(' %s' % (k,)) self.result.append(' %s' % k)
elif remove_script and hasScript(v): elif remove_script and hasScript(v):
if not self.raise_error: continue if not self.raise_error: continue
else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
...@@ -238,6 +242,26 @@ class StrippingParser(HTMLParser): ...@@ -238,6 +242,26 @@ class StrippingParser(HTMLParser):
self.result.append('</%s>' % tag) self.result.append('</%s>' % tag)
#remTag = '</%s>' % tag #remTag = '</%s>' % tag
def parse_declaration(self, i):
"""Fix handling of CDATA sections. Code borrowed from BeautifulSoup.
"""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1:
k = len(self.rawdata)
data = self.rawdata[i+9:k]
j = k+3
self.result.append("<![CDATA[%s]]>" % data)
else:
try:
j = HTMLParser.parse_declaration(self, i)
except HTMLParseError:
toHandle = self.rawdata[i:]
self.result.append(toHandle)
j = i + len(toHandle)
return j
def getResult(self): def getResult(self):
return ''.join(self.result) return ''.join(self.result)
...@@ -291,6 +315,10 @@ class SafeHTML: ...@@ -291,6 +315,10 @@ class SafeHTML:
'output': self.output, 'output': self.output,
'valid_tags': VALID_TAGS, 'valid_tags': VALID_TAGS,
'nasty_tags': NASTY_TAGS, 'nasty_tags': NASTY_TAGS,
'stripped_attributes': ['lang','valign','halign','border','frame','rules','cellspacing','cellpadding','bgcolor'],
'stripped_combinations': {'table th td': 'width height'},
'style_whitelist': ['text-align', 'list-style-type', 'float'],
'class_blacklist': [],
'remove_javascript': 1, 'remove_javascript': 1,
'disable_transform': 0, 'disable_transform': 0,
'default_encoding': 'utf-8', 'default_encoding': 'utf-8',
...@@ -310,6 +338,19 @@ class SafeHTML: ...@@ -310,6 +338,19 @@ class SafeHTML:
'everything they contain (like applet, object). ' + 'everything they contain (like applet, object). ' +
'They are only deleted if they are not marked as valid_tags.', 'They are only deleted if they are not marked as valid_tags.',
('tag', 'value')), ('tag', 'value')),
'stripped_attributes': ('list',
'stripped_attributes',
'These attributes are stripped from any tag.'),
'stripped_combinations' : ('dict',
'stripped_combinations',
'These attributes are stripped from any tag.',
('tag', 'value')),
'style_whitelist': ('list',
'style_whitelist',
'These CSS styles are allowed in style attributes.'),
'class_blacklist': ('list',
'class_blacklist',
'These class names are not allowed in class attributes.'),
'remove_javascript' : ("int", 'remove_javascript' : ("int",
'remove_javascript', 'remove_javascript',
'1 to remove javascript attributes that begin with on (e.g. onClick) ' + '1 to remove javascript attributes that begin with on (e.g. onClick) ' +
...@@ -355,7 +396,9 @@ class SafeHTML: ...@@ -355,7 +396,9 @@ class SafeHTML:
repaired = 0 repaired = 0
while True: while True:
try: try:
orig = scrubHTML( # Do 2 passes. This provides more reliable filtering of certain
# malicious HTML (cf upstream commit svn10522).
for repeat in range(2): orig = scrubHTML(
orig, orig,
valid=self.config.get('valid_tags', {}), valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}), nasty=self.config.get('nasty_tags', {}),
...@@ -366,6 +409,8 @@ class SafeHTML: ...@@ -366,6 +409,8 @@ class SafeHTML:
data.setData(msg_pat % ("Error", str(inst))) data.setData(msg_pat % ("Error", str(inst)))
break break
except HTMLParseError: except HTMLParseError:
if repeat:
raise # try to repair only on first pass
# ouch ! # ouch !
# HTMLParser is not able to parse very dirty HTML string # HTMLParser is not able to parse very dirty HTML string
if not repaired: if not repaired:
......
...@@ -45,7 +45,8 @@ class word_to_html: ...@@ -45,7 +45,8 @@ class word_to_html:
def convert(self, data, cache, **kwargs): def convert(self, data, cache, **kwargs):
orig_file = 'unknown.doc' orig_file = 'unknown.doc'
doc = None
try:
doc = document(orig_file, data) doc = document(orig_file, data)
doc.convert() doc.convert()
html = doc.html() html = doc.html()
...@@ -54,11 +55,13 @@ class word_to_html: ...@@ -54,11 +55,13 @@ class word_to_html:
objects = {} objects = {}
if images: if images:
doc.fixImages(path, images, objects) doc.fixImages(path, images, objects)
doc.cleanDir(doc.tmpdir)
cache.setData(html) cache.setData(html)
cache.setSubObjects(objects) cache.setSubObjects(objects)
return cache return cache
finally:
if doc is not None:
doc.cleanDir(doc.tmpdir)
def register(): def register():
return word_to_html() return word_to_html()
...@@ -6,37 +6,37 @@ from Products.PortalTransforms.libtransforms.utils import bin_search, MissingBin ...@@ -6,37 +6,37 @@ from Products.PortalTransforms.libtransforms.utils import bin_search, MissingBin
COMMAND_CONFIGS = ( COMMAND_CONFIGS = (
('lynx_dump', '.html', ('lynx_dump', '.html',
{'binary_path' : 'lynx', {'binary_path' : 'lynx',
'command_line' : '-dump %s', 'command_line' : '-dump %(input)s',
'inputs' : ('text/html',), 'inputs' : ('text/html',),
'output' : 'text/plain', 'output' : 'text/plain',
}), }),
('tidy_html', '.html', ('tidy_html', '.html',
{'binary_path' : 'tidy', {'binary_path' : 'tidy',
'command_line' : '%s', 'command_line' : '%(input)s',
'inputs' : ('text/html',), 'inputs' : ('text/html',),
'output' : 'text/html', 'output' : 'text/html',
}), }),
('rtf_to_html', None, ('rtf_to_html', None,
{'binary_path' : 'unrtf', {'binary_path' : 'unrtf',
'command_line' : '%s', 'command_line' : '%(input)s',
'inputs' : ('application/rtf',), 'inputs' : ('application/rtf',),
'output' : 'text/html', 'output' : 'text/html',
}), }),
('ppt_to_html', None, ('ppt_to_html', None,
{'binary_path' : 'ppthtml', {'binary_path' : 'ppthtml',
'command_line' : '%s', 'command_line' : '%(input)s',
'inputs' : ('application/vnd.ms-powerpoint',), 'inputs' : ('application/vnd.ms-powerpoint',),
'output' : 'text/html', 'output' : 'text/html',
}), }),
('excel_to_html', None, ('excel_to_html', None,
{'binary_path' : 'xlhtml', {'binary_path' : 'xlhtml',
'command_line' : '-nh -a %s', 'command_line' : '-nh -a %(input)s',
'inputs' : ('application/vnd.ms-excel',), 'inputs' : ('application/vnd.ms-excel',),
'output' : 'text/html', 'output' : 'text/html',
}), }),
('ps_to_text', None, ('ps_to_text', None,
{'binary_path' : 'ps2ascii', {'binary_path' : 'ps2ascii',
'command_line' : '%s', 'command_line' : '%(input)s',
'inputs' : ('application/postscript',), 'inputs' : ('application/postscript',),
'output' : 'text/plain', 'output' : 'text/plain',
}), }),
......
...@@ -8,10 +8,10 @@ class TransformException(Exception): ...@@ -8,10 +8,10 @@ class TransformException(Exception):
FB_REGISTRY = None FB_REGISTRY = None
# logging function # logging function
from zLOG import LOG, INFO from zLOG import LOG, DEBUG
#logger = logging.getLogger('PortalTransforms') #logger = logging.getLogger('PortalTransforms')
def log(message, severity=INFO): def log(message, severity=DEBUG):
LOG('PortalTransforms', severity, message) LOG('PortalTransforms', severity, message)
#logger.log(severity, message) #logger.log(severity, message)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment