Commit f5a85c42 authored by Nicolas Delaby

Replace 4Suite by lxml


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@28642 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent bbce6c36
# -*- coding: utf-8 -*-
############################################################################## ##############################################################################
# #
# Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved. # Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved.
...@@ -44,19 +45,13 @@ except ImportError: ...@@ -44,19 +45,13 @@ except ImportError:
import imghdr import imghdr
import random import random
from Products.ERP5Type import Permissions from Products.ERP5Type import Permissions
from zLOG import LOG, INFO from zLOG import LOG, INFO, PROBLEM
from zLOG import PROBLEM
from OFS.Image import Pdata from OFS.Image import Pdata
try: from lxml import etree
from Ft.Xml import Parse from lxml.etree import Element, XMLSyntaxError
except ImportError: from copy import deepcopy
LOG('OOoUtils', INFO, "Can't import Parse")
class Parse:
def __init__(self, *args, **kw):
raise ImportError, "Sorry, it was not possible to import Ft library, python2.4-4Suite-XML is not installed"
class CorruptedOOoFile(Exception): pass class CorruptedOOoFile(Exception): pass
...@@ -142,7 +137,7 @@ class OOoBuilder(Implicit): ...@@ -142,7 +137,7 @@ class OOoBuilder(Implicit):
def getMimeType(self): def getMimeType(self):
return self.extract('mimetype') return self.extract('mimetype')
def prepareContentXml(self, ooo_xml_file_id, xsl_content=None): def prepareContentXml(self, ooo_xml_file_id):
""" """
extracts content.xml text and prepare it : extracts content.xml text and prepare it :
- add tal namespace - add tal namespace
...@@ -150,44 +145,21 @@ class OOoBuilder(Implicit): ...@@ -150,44 +145,21 @@ class OOoBuilder(Implicit):
""" """
content_xml = self.extract(ooo_xml_file_id) content_xml = self.extract(ooo_xml_file_id)
output = StringIO() output = StringIO()
try: content_doc = etree.XML(content_xml)
from lxml import etree root = content_doc.getroot()
from lxml.etree import Element #Declare zope namespaces
from copy import deepcopy NSMAP = {'tal': 'http://xml.zope.org/namespaces/tal',
content_doc = etree.XML(content_xml) 'i18n': 'http://xml.zope.org/namespaces/i18n',
if xsl_content is not None: 'metal': 'http://xml.zope.org/namespaces/metal'}
stylesheet_doc = etree.XML(xsl_content) NSMAP.update(root.nsmap)
stylesheet = etree.XSLT(stylesheet_doc) new_root = Element(root.tag, nsmap=NSMAP)
content_doc = stylesheet(content_doc) new_root.attrib.update(dict(root.attrib))
root = content_doc.getroot() new_root.attrib.update({'{%s}attributes' % NSMAP.get('tal'): 'dummy python:request.RESPONSE.setHeader(\'Content-Type\', \'text/html;; charset=utf-8\')'})
#Declare zope namespaces for child in root.getchildren():
NSMAP = {'tal': 'http://xml.zope.org/namespaces/tal', new_root.append(deepcopy(child))
'i18n': 'http://xml.zope.org/namespaces/i18n', return etree.tostring(new_root, encoding='utf-8', xml_declaration=True,
'metal': 'http://xml.zope.org/namespaces/metal'} pretty_print=True)
NSMAP.update(root.nsmap)
new_root = Element(root.tag, nsmap=NSMAP)
new_root.attrib.update(dict(root.attrib))
new_root.attrib.update({'{%s}attributes' % NSMAP.get('tal'): 'dummy python:request.RESPONSE.setHeader(\'Content-Type\', \'text/html;; charset=utf-8\')'})
for child in root.getchildren():
new_root.append(deepcopy(child))
return etree.tostring(new_root, encoding='utf-8', xml_declaration=True,
pretty_print=True)
except ImportError:
document = Parse(content_xml)
document_element = document.documentElement
tal = document.createAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:tal')
tal.value = u'http://xml.zope.org/namespaces/tal'
i18n = document.createAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:i18n')
i18n.value = u'http://xml.zope.org/namespaces/i18n'
metal = document.createAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:metal')
metal.value = u'http://xml.zope.org/namespaces/metal'
document_element.setAttributeNodeNS(tal)
document_element.setAttributeNodeNS(i18n)
document_element.setAttributeNodeNS(metal)
document_element.setAttributeNS(None, 'tal:attributes', 'dummy python:request.RESPONSE.setHeader("Content-Type", "text/html;; charset=utf-8")')
from xml.dom.ext import PrettyPrint
PrettyPrint(document_element, output)
return output.getvalue()
def addFileEntry(self, full_path, media_type, content=None): def addFileEntry(self, full_path, media_type, content=None):
""" Add a file entry to the manifest and possibly is content """ """ Add a file entry to the manifest and possibly is content """
...@@ -250,7 +222,6 @@ class OOoParser(Implicit): ...@@ -250,7 +222,6 @@ class OOoParser(Implicit):
self.oo_styles_dom = None self.oo_styles_dom = None
self.oo_files = {} self.oo_files = {}
self.pictures = {} self.pictures = {}
self.ns = {}
self.filename = None self.filename = None
def openFromString(self, text_content): def openFromString(self, text_content):
...@@ -279,17 +250,8 @@ class OOoParser(Implicit): ...@@ -279,17 +250,8 @@ class OOoParser(Implicit):
oo_unzipped.close() oo_unzipped.close()
# Get the main content and style definitions # Get the main content and style definitions
self.oo_content_dom = Parse(self.oo_files["content.xml"]) self.oo_content_dom = etree.XML(self.oo_files["content.xml"])
self.oo_styles_dom = Parse(self.oo_files["styles.xml"]) self.oo_styles_dom = etree.XML(self.oo_files["styles.xml"])
# Create a namespace table
xpath = './/*[name() = "office:document-styles"]'
doc_ns = self.oo_styles_dom.xpath(xpath)
for i in range(doc_ns[0].attributes.length)[1:]:
if doc_ns[0].attributes.item(i).nodeType == Node.ATTRIBUTE_NODE:
name = doc_ns[0].attributes.item(i).name
if name[:5] == "xmlns":
self.ns[name[6:]] = doc_ns[0].attributes.item(i).value
def getFilename(self): def getFilename(self):
""" """
...@@ -340,11 +302,8 @@ class OOoParser(Implicit): ...@@ -340,11 +302,8 @@ class OOoParser(Implicit):
""" """
Retrieve every spreadsheets from the document and get they DOM tree Retrieve every spreadsheets from the document and get they DOM tree
""" """
spreadsheets = [] find_path = './/{%s}table' % self.oo_content_dom.nsmap['table']
# List all spreadsheets return self.oo_content_dom.findall(find_path)
for table in self.oo_content_dom.xpath('.//*[name() = "table:table"]'):
spreadsheets.append(table)
return spreadsheets
def getPlainSpreadsheetsMapping(self, no_empty_lines=False, normalize=True): def getPlainSpreadsheetsMapping(self, no_empty_lines=False, normalize=True):
""" """
...@@ -363,22 +322,22 @@ class OOoParser(Implicit): ...@@ -363,22 +322,22 @@ class OOoParser(Implicit):
""" """
spreadsheets = [] spreadsheets = []
# List all embedded spreadsheets # List all embedded spreadsheets
emb_objects = self.oo_content_dom.xpath('.//*[name() = "draw:object"]') find_path = './/{%s}object' % self.oo_content_dom.nsmap['draw']
emb_objects = self.oo_content_dom.findall(find_path)
for embedded in emb_objects: for embedded in emb_objects:
document = embedded.getAttributeNS(self.ns["xlink"], "href") document = embedded.get('{%s}href' % embedded.nsmap['xlink'])
if document: if document:
try: try:
object_content = etree.XML(self.oo_files[document[3:] + '/content.xml'])
object_content = Parse(self.oo_files[document[3:] + '/content.xml']) find_path = './/{%s}table' % self.oo_content_dom.nsmap['table']
xpath = './/*[name() = "table:table"]' table_list = self.oo_content_dom.findall(find_path)
tables = self.oo_content_dom.xpath(xpath) if table_list:
if tables: for table in table_list:
for table in tables: spreadsheets.append(table)
spreadsheets.append(table) else: # XXX: insert the link to OLE document ?
else: # XXX: insert the link to OLE document ? pass
pass except XMLSyntaxError:
except: pass
pass
return spreadsheets return spreadsheets
def getEmbeddedSpreadsheetsMapping(self, no_empty_lines=False, normalize=True): def getEmbeddedSpreadsheetsMapping(self, no_empty_lines=False, normalize=True):
...@@ -397,20 +356,22 @@ class OOoParser(Implicit): ...@@ -397,20 +356,22 @@ class OOoParser(Implicit):
This method convert an OpenOffice spreadsheet to a simple table. This method convert an OpenOffice spreadsheet to a simple table.
This code is based on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt). This code is based on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt).
""" """
if spreadsheet == None or spreadsheet.nodeName != 'table:table': if spreadsheet is None or \
spreadsheet.tag != '{%s}table' % spreadsheet.nsmap['table']:
return None return None
table = [] table = []
# Get the table name # Get the table name
table_name = spreadsheet.getAttributeNS(self.ns["table"], "name") table_name = spreadsheet.get('{%s}name' % spreadsheet.nsmap["table"])
# Scan table and store usable informations # Scan table and store usable informations
for line in spreadsheet.xpath('.//*[name() = "table:table-row"]'): find_path = './/{%s}table-row' % spreadsheet.nsmap['table']
for line in spreadsheet.findall(find_path):
# TODO : to the same as cell about abusive repeated lines # TODO : to the same as cell about abusive repeated lines
line_group_found = line.getAttributeNS(self.ns["table"], "number-rows-repeated") line_group_found = line.get('{%s}number-rows-repeated' % line.nsmap["table"])
if not line_group_found: if not line_group_found:
lines_to_repeat = 1 lines_to_repeat = 1
else: else:
...@@ -420,7 +381,8 @@ class OOoParser(Implicit): ...@@ -420,7 +381,8 @@ class OOoParser(Implicit):
table_line = [] table_line = []
# Get all cells # Get all cells
cells = line.xpath('.//*[name() = "table:table-cell"]') find_path = './/{%s}table-cell' % line.nsmap['table']
cells = line.findall(find_path)
cell_index_range = range(len(cells)) cell_index_range = range(len(cells))
for cell_index in cell_index_range: for cell_index in cell_index_range:
...@@ -434,11 +396,11 @@ class OOoParser(Implicit): ...@@ -434,11 +396,11 @@ class OOoParser(Implicit):
# can be found in OOo documents : <table:table-cell table:number-columns-repeated='246'/> # can be found in OOo documents : <table:table-cell table:number-columns-repeated='246'/>
# This is bad because it create too much irrevelent content that slow down the process # This is bad because it create too much irrevelent content that slow down the process
# So it's a good idea to break the loop in this case # So it's a good idea to break the loop in this case
if len(cell.childNodes) == 0 and cell_index == cell_index_range[-1]: if len(cell) == 0 and cell_index == cell_index_range[-1]:
break break
# Handle cells group # Handle cells group
cell_group_found = cell.getAttributeNS(self.ns["table"], "number-columns-repeated") cell_group_found = cell.get('{%s}number-columns-repeated' % cell.nsmap['table'])
if not cell_group_found: if not cell_group_found:
cells_to_repeat = 1 cells_to_repeat = 1
else: else:
...@@ -448,21 +410,19 @@ class OOoParser(Implicit): ...@@ -448,21 +410,19 @@ class OOoParser(Implicit):
for j in range(cells_to_repeat): for j in range(cells_to_repeat):
# Get the cell content # Get the cell content
cell_data = None cell_data = None
attribute_type_mapping = {'date': 'date-value',
value_type = None 'time': 'time-value',
# value-type and value attributes can be in table or office 'float': 'value',
# namespaces, so we use local-name 'percentage': 'value',
value_type_attribute_list = cell.xpath('./@*[local-name()="value-type"]') 'currency': 'value'}
if value_type_attribute_list: # Depending of odf version, value-type and value attributes can be in
value_type = value_type_attribute_list[0].value # table or office namespaces, so we use local-name.
if value_type == 'date': value_type = str(cell.xpath('string(@*[local-name()="value-type"])'))
cell_data = cell.xpath('./@*[local-name()="date-value"]')[0].value if value_type in attribute_type_mapping:
elif value_type == 'time': xpath = '@*[local-name()="%s"]' % attribute_type_mapping[value_type]
cell_data = cell.xpath('./@*[local-name()="time-value"]')[0].value cell_data = str(cell.xpath(xpath)[0])
elif value_type in ('float', 'percentage', 'currency'): else: # read text nodes
cell_data = cell.xpath('./@*[local-name()="value"]')[0].value text_tags = cell.findall('./{%s}p' % cell.nsmap['text'])
else:
text_tags = cell.xpath('./*[name() = "text:p"]')
if len(text_tags): if len(text_tags):
cell_data = ''.join([text.xpath('string(.)') cell_data = ''.join([text.xpath('string(.)')
for text in text_tags]) for text in text_tags])
...@@ -474,13 +434,13 @@ class OOoParser(Implicit): ...@@ -474,13 +434,13 @@ class OOoParser(Implicit):
if no_empty_lines: if no_empty_lines:
empty_cell = 0 empty_cell = 0
for table_cell in table_line: for table_cell in table_line:
if table_cell == None: if table_cell is None:
empty_cell += 1 empty_cell += 1
if empty_cell == len(table_line): if empty_cell == len(table_line):
table_line = None table_line = None
# Add the line to the table # Add the line to the table
if table_line != None: if table_line is not None:
table.append(table_line) table.append(table_line)
else: else:
# If the line is empty here, the repeated line will also be empty, so # If the line is empty here, the repeated line will also be empty, so
...@@ -493,9 +453,9 @@ class OOoParser(Implicit): ...@@ -493,9 +453,9 @@ class OOoParser(Implicit):
# Get a homogenized table # Get a homogenized table
if normalize: if normalize:
table_size = self._getTableSizeDict(new_table) table_size = self._getTableSizeDict(new_table)
new_table = self._getNormalizedBoundsTable( table = new_table new_table = self._getNormalizedBoundsTable( table=new_table
, width = table_size['width'] , width=table_size['width']
, height = table_size['height'] , height=table_size['height']
) )
return {table_name: new_table} return {table_name: new_table}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment