# -*- coding: utf-8 -*-
from Products.PortalTransforms.interfaces import itransform
from zope.interface import implements
from oood_commandtransform import OOOdCommandTransform, OOoDocumentDataStream
from  oood_commandtransform import includeMetaContentType
from zLOG import LOG
from lxml import etree, html
from lxml.etree import Element, SubElement

# Module-level HTML parser reused by HTMLToOdt.convert for every conversion.
# It is configured to drop blank text nodes and to read input as UTF-8.
html_parser = etree.HTMLParser(remove_blank_text=True, encoding='utf-8')

class HTMLToOdt:
  """Transforms HTML to odt by using oood

  The incoming HTML is re-parsed with the module-level ``html_parser`` and
  re-serialised as UTF-8 before it is handed to ``OOOdCommandTransform``,
  which performs the actual conversion to ``odt``.
  """

  implements(itransform)

  __name__ = 'html_to_odt'
  inputs   = ('text/html',)
  output = 'application/vnd.oasis.opendocument.text'

  # NOTE(review): the attribute name is spelled "tranform" (sic). It is kept
  # unchanged because code outside this file may look it up by that name.
  tranform_engine = OOOdCommandTransform.__module__

  def name(self):
    """Return the name of this transform (the class-level ``__name__``)."""
    return self.__name__

  def __getattr__(self, attr):
    """Fallback lookup of ``inputs`` / ``output`` in ``self.config``.

    Python only calls this hook when regular attribute lookup fails. Both
    names are defined as class attributes above, so this path is only
    reached if they are missing. ``config`` is not set anywhere in this
    class -- presumably it is provided from outside; verify against callers.

    Raises AttributeError for every other attribute name.
    """
    if attr == 'inputs':
      return self.config['inputs']
    if attr == 'output':
      return self.config['output']
    raise AttributeError(attr)

  def convert(self, orig, data, cache=None, filename=None, context=None, **kwargs):
    """Convert the HTML content ``orig`` to odt.

    orig     -- HTML source to convert.
    data     -- not used by this implementation.
    cache    -- optional object exposing ``setData``; when given, it receives
                the converted document and is returned.
    filename -- passed through to ``OOOdCommandTransform``.
    context  -- passed through to ``OOOdCommandTransform``.
    kwargs   -- accepted and ignored.

    Returns ``cache`` when it is not None, otherwise a new
    ``OOoDocumentDataStream``; in both cases the odt data has been stored on
    the returned object with ``setData``.
    """
    # Try to recover broken HTML documents, specially regarding encoding used
    html_node = etree.XML(orig, parser=html_parser)
    # NOTE(review): helper defined in oood_commandtransform; presumably it
    # makes sure the tree carries a content-type meta element -- confirm.
    includeMetaContentType(html_node)
    orig = html.tostring(html_node, encoding='utf-8',
                         include_meta_content_type=True)

    doc = OOOdCommandTransform(context, filename, orig, self.inputs[0])
    odt = doc.convertTo('odt')

    # Store the result on the caller-supplied cache if there is one,
    # otherwise on a freshly created data stream.
    stream = cache
    if stream is None:
      stream = OOoDocumentDataStream()
    stream.setData(odt)
    return stream

def register():
  """Create and return a new HTMLToOdt transform instance."""
  transform = HTMLToOdt()
  return transform