############################################################################## # # Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved. # Kevin DELDYCKE <kevin@nexedi.com> # Guillaume MICHON <guillaume@nexedi.com> # # WARNING: This program as such is intended to be used by professional # programmers who take the whole responsability of assessing all potential # consequences resulting from its eventual inadequacies and bugs # End users who are looking for a ready-to-use solution with commercial # garantees and support are strongly adviced to contract a Free Software # Service Company # # This program is Free Software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # ############################################################################## from Products.PythonScripts.Utility import allow_class from ZPublisher.HTTPRequest import FileUpload from xml.dom.ext.reader import PyExpat from xml.dom import Node from AccessControl import ClassSecurityInfo from Globals import InitializeClass, get_request from zipfile import ZipFile, ZIP_DEFLATED try: from cStringIO import StringIO except ImportError: from StringIO import StringIO from zLOG import LOG import imghdr import random class CorruptedOOoFile(Exception): pass OOo_mimeType_dict = { 'sxw' : 'application/vnd.sun.xml.writer', 'stw' : 'application/vnd.sun.xml.writer.template', 'sxg' : 'application/vnd.sun.xml.writer.global', 'sxc' : 'application/vnd.sun.xml.calc', 'stc' : 'application/vnd.sun.xml.calc.template', 'sxi' : 'application/vnd.sun.xml.impress', 'sti' : 'application/vnd.sun.xml.impress.template', 'sxd' : 'application/vnd.sun.xml.draw', 'std' : 'application/vnd.sun.xml.draw.template', 'sxm' : 'application/vnd.sun.xml.math', } class OOoBuilder: """ Tool that allows to reinject new files in a ZODB OOo document. """ # Declarative security security = ClassSecurityInfo() security.declarePrivate('__init__') def __init__(self, document): if hasattr(document, 'data') : self._document = StringIO(document.data) elif hasattr(document, 'read') : self._document = document else : self._document = StringIO(document) self._image_count = 0 self._manifest_additions_list = [] security.declarePublic('replace') def replace(self, filename, stream): """ Replaces the content of filename by stream in the archive. Creates a new file if filename was not already there. """ try: zf = ZipFile(self._document, mode='a', compression=ZIP_DEFLATED) except RuntimeError: zf = ZipFile(self._document, mode='a') try: # remove the file first if it exists fi = zf.getinfo(filename) zf.filelist.remove( fi ) except KeyError: # This is a new file pass zf.writestr(filename, stream) zf.close() security.declarePublic('extract') def extract(self, filename): """ Extracts a file from the archive """ try: zf = ZipFile(self._document, mode='r', compression=ZIP_DEFLATED) except RuntimeError: zf = ZipFile(self._document, mode='r') return zf.read(filename) security.declarePublic('getNameList') def getNameList(self): try: zf = ZipFile(self._document, mode='r', compression=ZIP_DEFLATED) except RuntimeError: zf = ZipFile(self._document, mode='r') li = zf.namelist() zf.close() return li security.declarePublic('getMimeType') def getMimeType(self): return self.extract('mimetype') security.declarePublic('prepareContentXml') def prepareContentXml(self) : """ extracts content.xml text and prepare it : - add tal namespace - indent the xml """ import pprint content_xml = self.extract('content.xml') reader = PyExpat.Reader() document = reader.fromString(content_xml) document_element = document.documentElement from xml.dom.ext import PrettyPrint output = StringIO() PrettyPrint(document_element, output) return output.getvalue().replace( "office:version='1.0'", """ xmlns:tal='http://xml.zope.org/namespaces/tal' xmlns:i18n='http://xml.zope.org/namespaces/i18n' xmlns:metal='http://xml.zope.org/namespaces/metal' tal:attributes='dummy python:request.RESPONSE.setHeader("Content-Type", "text/html;; charset=utf-8")' office:version='1.0'""") """ """ security.declarePublic('addFileEntry') def addFileEntry(self, full_path, media_type, content=None): """ Add a file entry to the manifest and possibly is content """ self.addManifest(full_path, media_type) if content: self.replace(full_path, content) security.declarePublic('addManifest') def addManifest(self, full_path, media_type): """ Add a path to the manifest """ li = '<manifest:file-entry manifest:media-type="%s" manifest:full-path="%s"/>'%(media_type, full_path) self._manifest_additions_list.append(li) security.declarePublic('updateManifest') def updateManifest(self): """ Add a path to the manifest """ MANIFEST_FILENAME = 'META-INF/manifest.xml' meta_infos = self.extract(MANIFEST_FILENAME) # prevent some duplicates for meta_line in meta_infos.split('\n'): for new_meta_line in self._manifest_additions_list: if meta_line.strip() == new_meta_line: self._manifest_additions_list.remove(new_meta_line) # add the new lines self._manifest_additions_list.append('</manifest:manifest>') meta_infos = meta_infos.replace( self._manifest_additions_list[-1], '\n'.join(self._manifest_additions_list) ) self.replace(MANIFEST_FILENAME, meta_infos) self._manifest_additions_list = [] security.declarePublic('addImage') def addImage(self, image, format='png'): """ Add an image to the current document and return its id """ count = self._image_count self._image_count += 1 name = "Picture/%s.%s" % (count, format) self.replace(name, image) return "#%s" % name security.declarePublic('render') def render(self, name='', extension='sxw'): """ returns the OOo document """ request = get_request() request.response.setHeader('Content-type', OOo_mimeType_dict.get(extension, 'application/vnd.sun.xml.writer')) if name: request.response.setHeader('Content-Disposition', 'attachment; filename=%s.%s' % (name, extension)) self._document.seek(0) return self._document.read() InitializeClass(OOoBuilder) allow_class(OOoBuilder) class OOoParser: """ General purpose tools to parse and handle OpenOffice v1.x documents. """ # Declarative security security = ClassSecurityInfo() security.declarePrivate('__init__') def __init__(self): # Create the PyExpat reader self.reader = PyExpat.Reader() self.oo_content_dom = None self.oo_styles_dom = None self.oo_files = {} self.pictures = {} self.ns = {} self.filename = None security.declareProtected('Import/Export objects', 'openFile') def openFile(self, file_descriptor): """ Load all files in the zipped OpenOffice document """ # Try to unzip the Open Office doc try: oo_unzipped = ZipFile(file_descriptor, mode="r") except: raise CorruptedOOoFile # Test the integrity of the file if oo_unzipped.testzip() != None: raise CorruptedOOoFile # Get the filename self.filename = file_descriptor.filename # List and load the content of the zip file for name in oo_unzipped.namelist(): self.oo_files[name] = oo_unzipped.read(name) oo_unzipped.close() # Get the main content and style definitions self.oo_content_dom = self.reader.fromString(self.oo_files["content.xml"]) self.oo_styles_dom = self.reader.fromString(self.oo_files["styles.xml"]) # Create a namespace table doc_ns = self.oo_styles_dom.getElementsByTagName("office:document-styles") for i in range(doc_ns[0].attributes.length): if doc_ns[0].attributes.item(i).nodeType == Node.ATTRIBUTE_NODE: name = doc_ns[0].attributes.item(i).name if name[:5] == "xmlns": self.ns[name[6:]] = doc_ns[0].attributes.item(i).value security.declarePublic('getFilename') def getFilename(self): """ Return the name of the OpenOffice file """ return self.filename security.declarePublic('getPicturesMapping') def getPicturesMapping(self): """ Return a dictionnary of all pictures in the document """ if len(self.pictures) <= 0: for file_name in self.oo_files: raw_data = self.oo_files[file_name] pict_type = imghdr.what(None, raw_data) if pict_type != None: self.pictures[file_name] = raw_data return self.pictures security.declarePublic('getContentDom') def getContentDom(self): """ Return the DOM tree of the main OpenOffice content """ return self.oo_content_dom security.declarePublic('getSpreadsheetsDom') def getSpreadsheetsDom(self, include_embedded=False): """ Return a list of DOM tree spreadsheets (optionnaly included embedded ones) """ spreadsheets = [] spreadsheets = self.getPlainSpreadsheetsDom() if include_embedded == True: spreadsheets += self.getEmbeddedSpreadsheetsDom() return spreadsheets security.declarePublic('getSpreadsheetsMapping') def getSpreadsheetsMapping(self, include_embedded=False, no_empty_lines=False, normalize=True): """ Return a list of table-like spreadsheets (optionnaly included embedded ones) """ tables = {} tables = self.getPlainSpreadsheetsMapping(no_empty_lines, normalize) if include_embedded == True: embedded_tables = self.getEmbeddedSpreadsheetsMapping(no_empty_lines, normalize) tables = self._getTableListUnion(tables, embedded_tables) return tables security.declarePublic('getPlainSpreadsheetsDom') def getPlainSpreadsheetsDom(self): """ Retrieve every spreadsheets from the document and get they DOM tree """ spreadsheets = [] # List all spreadsheets for table in self.oo_content_dom.getElementsByTagName("table:table"): spreadsheets.append(table) return spreadsheets security.declarePublic('getPlainSpreadsheetsMapping') def getPlainSpreadsheetsMapping(self, no_empty_lines=False, normalize=True): """ Return a list of plain spreadsheets from the document and transform them as table """ tables = {} for spreadsheet in self.getPlainSpreadsheetsDom(): new_table = self.getSpreadsheetMapping(spreadsheet, no_empty_lines, normalize) if new_table != None: tables = self._getTableListUnion(tables, new_table) return tables security.declarePublic('getEmbeddedSpreadsheetsDom') def getEmbeddedSpreadsheetsDom(self): """ Return a list of existing embedded spreadsheets in the file as DOM tree """ spreadsheets = [] # List all embedded spreadsheets emb_objects = self.oo_content_dom.getElementsByTagName("draw:object") for embedded in emb_objects: document = embedded.getAttributeNS(self.ns["xlink"], "href") if document: try: object_content = self.reader.fromString(self.oo_files[document[3:] + '/content.xml']) tables = object_content.getElementsByTagName("table:table") if tables: for table in tables: spreadsheets.append(table) else: # XXX: insert the link to OLE document ? pass except: pass return spreadsheets security.declarePublic('getEmbeddedSpreadsheetsMapping') def getEmbeddedSpreadsheetsMapping(self, no_empty_lines=False, normalize=True): """ Return a list of embedded spreadsheets in the document as table """ tables = {} for spreadsheet in self.getEmbeddedSpreadsheetsDom(): new_table = self.getSpreadsheetMapping(spreadsheet, no_empty_lines, normalize) if new_table != None: tables = self._getTableListUnion(tables, new_table) return tables security.declarePublic('getSpreadsheetMapping') def getSpreadsheetMapping(self, spreadsheet=None, no_empty_lines=False, normalize=True): """ This method convert an OpenOffice spreadsheet to a simple table. This code is based on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt). """ if spreadsheet == None or spreadsheet.nodeName != 'table:table': return None table = [] # Get the table name table_name = spreadsheet.getAttributeNS(self.ns["table"], "name") # Scan table and store usable informations for line in spreadsheet.getElementsByTagName("table:table-row"): # TODO : to the same as cell about abusive repeated lines line_group_found = line.getAttributeNS(self.ns["table"], "number-rows-repeated") if not line_group_found: lines_to_repeat = 1 else: lines_to_repeat = int(line_group_found) for i in range(lines_to_repeat): table_line = [] # Get all cells cells = line.getElementsByTagName("table:table-cell") cell_index_range = range(len(cells)) for cell_index in cell_index_range: cell = cells[cell_index] # If the cell as no child, cells have no content # And if the cell is the last of the row, we don't need to add it to the line # So we can go to the next line (= exit this cells loop) # # I must do this test because sometimes the following cell group # can be found in OOo documents : <table:table-cell table:number-columns-repeated='246'/> # This is bad because it create too much irrevelent content that slow down the process # So it's a good idea to break the loop in this case if cell.childNodes.length == 0 and cell_index == cell_index_range[-1]: break # Handle cells group cell_group_found = cell.getAttributeNS(self.ns["table"], "number-columns-repeated") if not cell_group_found: cells_to_repeat = 1 else: cells_to_repeat = int(cell_group_found) # Ungroup repeated cells for j in range(cells_to_repeat): # Get the cell content cell_text = None text_tags = cell.getElementsByTagName("text:p") for text in text_tags: for k in range(text.childNodes.length): child = text.childNodes[k] if child.nodeType == Node.TEXT_NODE: if cell_text == None: cell_text = '' cell_text += child.nodeValue # Add the cell to the line table_line.append(cell_text) # Delete empty lines if needed if no_empty_lines: empty_cell = 0 for table_cell in table_line: if table_cell == None: empty_cell += 1 if empty_cell == len(table_line): table_line = None # Add the line to the table if table_line != None: table.append(table_line) # Reduce the table to the minimum new_table = self._getReducedTable(table) # Get a homogenized table if normalize: table_size = self._getTableSizeDict(new_table) new_table = self._getNormalizedBoundsTable( table = new_table , width = table_size['width'] , height = table_size['height'] ) return {table_name: new_table} security.declarePrivate('_getReducedTable') def _getReducedTable(self, table): """ Reduce the table to its minimum size """ empty_lines = 0 no_more_empty_lines = 0 # Eliminate all empty cells at the ends of lines and columns # Browse the table starting from the bottom for easy empty lines count for line in range(len(table)-1, -1, -1): empty_cells = 0 line_content = table[line] for cell in range(len(line_content)-1, -1, -1): if line_content[cell] in ('', None): empty_cells += 1 else: break if (not no_more_empty_lines) and (empty_cells == len(line_content)): empty_lines += 1 else: line_size = len(line_content) - empty_cells table[line] = line_content[:line_size] no_more_empty_lines = 1 table_height = len(table) - empty_lines return table[:table_height] security.declarePrivate('_getTableSizeDict') def _getTableSizeDict(self, table): """ Get table dimension as dictionnary contain both height and width """ max_cols = 0 for line_index in range(len(table)): line = table[line_index] if len(line) > max_cols: max_cols = len(line) return { 'width' : max_cols , 'height': len(table) } security.declarePrivate('_getNormalizedBoundsTable') def _getNormalizedBoundsTable(self, table, width=0, height=0): """ Add necessary cells and lines to obtain given bounds """ while height > len(table): table.append([]) for line in range(height): while width > len(table[line]): table[line].append(None) return table security.declarePrivate('_getTableListUnion') def _getTableListUnion(self, list1, list2): """ Coerce two dict containing tables structures. We need to use this method because a OpenOffice document can hold several embedded spreadsheets with the same id. This explain the use of random suffix in such extreme case. """ for list2_key in list2.keys(): # Generate a new table ID if needed new_key = list2_key while new_key in list1.keys(): new_key = list2_key + '_' + str(random.randint(1000,9999)) list1[new_key] = list2[list2_key] return list1 InitializeClass(OOoParser) allow_class(OOoParser)