OOoUtils.py 11.6 KB
Newer Older
Kevin Deldycke's avatar
Kevin Deldycke committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
##############################################################################
#
# Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved.
#                         Kevin DELDYCKE    <kevin@nexedi.com>
#                         Guillaume MICHON  <guillaume@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from Products.PythonScripts.Utility import allow_class
from ZPublisher.HTTPRequest import FileUpload
from xml.dom.ext.reader import PyExpat
from xml.dom import Node
from AccessControl import ClassSecurityInfo
from Globals import InitializeClass
from zipfile import ZipFile
from zLOG import LOG
import imghdr
39
import random
Kevin Deldycke's avatar
Kevin Deldycke committed
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68



class CorruptedOOoFile(Exception):
  """
    Raised when a file cannot be used as a zipped OpenOffice document.
  """



class OOoParser:
  """
    General purpose tools to parse and handle OpenOffice v1.x documents.
  """


  # Declarative security
  security = ClassSecurityInfo()


  security.declarePrivate('__init__')
  def __init__(self):
    """
      Set up a parser with no document loaded.
    """
    # XML reader used to turn raw XML strings into DOM trees
    self.reader = PyExpat.Reader()

    # DOM trees of content.xml and styles.xml, set by openFile()
    self.oo_content_dom = None
    self.oo_styles_dom = None

    # Raw data of every member of the archive, keyed by member name
    self.oo_files = {}
    # Cache filled on demand by getPictures()
    self.pictures = {}
    # Namespace table (prefix -> URI), filled by openFile()
    self.ns = {}


  security.declarePublic('openFile')
69
  def openFile(self, file_descriptor):
    """
      Load all files in the zipped OpenOffice document.

      file_descriptor -- the document, in any form accepted by
                         zipfile.ZipFile (it is handed over unchanged).

      Once the archive passed its integrity check, the parser is reset,
      every member of the archive is read in memory (self.oo_files),
      content.xml and styles.xml are parsed, and the namespace table
      (self.ns) is built from the xmlns declarations carried by the
      office:document-styles element of styles.xml.

      Raise CorruptedOOoFile if the archive cannot be opened, fails its
      integrity check, or does not contain content.xml and styles.xml.
    """
    # Try to unzip the Open Office doc
    try:
      oo_unzipped = ZipFile(file_descriptor, mode="r")
    except Exception:
      raise CorruptedOOoFile

    try:
      # Test the integrity of the file. A damaged archive can make the
      # check itself fail, which means the same thing for the caller.
      try:
        first_bad_member = oo_unzipped.testzip()
      except Exception:
        raise CorruptedOOoFile
      if first_bad_member is not None:
        raise CorruptedOOoFile

      # Initialize internal variables
      self.__init__()

      # List and load the content of the zip file
      for name in oo_unzipped.namelist():
        self.oo_files[name] = oo_unzipped.read(name)
    finally:
      # Everything needed is in memory at this point: release the archive
      oo_unzipped.close()

    # Get the main content and style definitions. A readable zip without
    # these two members is not an OpenOffice document.
    try:
      content_xml = self.oo_files["content.xml"]
      styles_xml = self.oo_files["styles.xml"]
    except KeyError:
      raise CorruptedOOoFile
    self.oo_content_dom = self.reader.fromString(content_xml)
    self.oo_styles_dom = self.reader.fromString(styles_xml)

    # Create a namespace table
    doc_ns = self.oo_styles_dom.getElementsByTagName("office:document-styles")
    attributes = doc_ns[0].attributes
    for i in range(attributes.length):
      attribute = attributes.item(i)
      if attribute.nodeType == Node.ATTRIBUTE_NODE:
        name = attribute.name
        if name[:5] == "xmlns":
          # "xmlns:prefix" -> "prefix"
          self.ns[name[6:]] = attribute.value


  security.declarePublic('getPictures')
  def getPictures(self):
    """
      Return a dictionnary of all pictures in the document
    """
    if len(self.pictures) <= 0:
      for file_name in self.oo_files:
        raw_data = self.oo_files[file_name]
        pict_type = imghdr.what(None, raw_data)
        if pict_type != None:
          self.pictures[file_name] = raw_data
    return self.pictures


  security.declarePublic('getContentAsDom')
  def getContentAsDom(self):
    """
      Return the DOM tree of the main OpenOffice content
    """
    return self.oo_content_dom


124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
  security.declarePublic('getSpreadsheetsAsDom')
  def getSpreadsheetsAsDom(self, include_embedded=False):
    """
      Return a list of DOM tree spreadsheets (optionnaly included embedded ones)
    """
    spreadsheets = []
    spreadsheets = self.getPlainSpreadsheetsAsDom()
    if include_embedded == True:
      spreadsheets += self.getEmbeddedSpreadsheetsAsDom()
    return spreadsheets


  security.declarePublic('getSpreadsheetsAsTable')
  def getSpreadsheetsAsTable(self, include_embedded=False, no_empty_lines=False):
    """
      Return a list of table-like spreadsheets (optionnaly included embedded ones)
    """
141 142
    tables = {}
    tables = self.getPlainSpreadsheetsAsTable(no_empty_lines)
143
    if include_embedded == True:
144 145 146
      embedded_tables = self.getEmbeddedSpreadsheetsAsTable(no_empty_lines)
      tables = self._getTableListUnion(tables, embedded_tables)
    return tables
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165


  security.declarePublic('getPlainSpreadsheetsAsDom')
  def getPlainSpreadsheetsAsDom(self):
    """
      Retrieve every spreadsheets from the document and get they DOM tree
    """
    spreadsheets = []
    # List all spreadsheets
    for table in self.oo_content_dom.getElementsByTagName("table:table"):
      spreadsheets.append(table)
    return spreadsheets


  security.declarePublic('getPlainSpreadsheetsAsTable')
  def getPlainSpreadsheetsAsTable(self, no_empty_lines=False):
    """
      Return a list of plain spreadsheets from the document and transform them as table
    """
166
    tables = {}
167 168 169
    for spreadsheet in self.getPlainSpreadsheetsAsDom():
      new_table = self.getSpreadsheetAsTable(spreadsheet, no_empty_lines)
      if new_table != None:
170
        tables = self._getTableListUnion(tables, new_table)
171 172 173
    return tables


Kevin Deldycke's avatar
Kevin Deldycke committed
174 175 176 177 178 179 180 181 182 183 184 185 186
  security.declarePublic('getEmbeddedSpreadsheetsAsDom')
  def getEmbeddedSpreadsheetsAsDom(self):
    """
      Return a list of existing embedded spreadsheets in the file as DOM tree
    """
    spreadsheets = []
    # List all embedded spreadsheets
    emb_objects = self.oo_content_dom.getElementsByTagName("draw:object")
    for embedded in emb_objects:
      document = embedded.getAttributeNS(self.ns["xlink"], "href")
      if document:
        try:
          object_content = self.reader.fromString(self.oo_files[document[3:] + '/content.xml'])
187 188
          for table in object_content.getElementsByTagName("table:table"):
            spreadsheets.append(table)
Kevin Deldycke's avatar
Kevin Deldycke committed
189 190 191 192 193 194
        except:
          pass
    return spreadsheets


  security.declarePublic('getEmbeddedSpreadsheetsAsTable')
195
  def getEmbeddedSpreadsheetsAsTable(self, no_empty_lines=False):
Kevin Deldycke's avatar
Kevin Deldycke committed
196
    """
197
      Return a list of embedded spreadsheets in the document as table
Kevin Deldycke's avatar
Kevin Deldycke committed
198
    """
199
    tables = {}
Kevin Deldycke's avatar
Kevin Deldycke committed
200
    for spreadsheet in self.getEmbeddedSpreadsheetsAsDom():
201
      new_table = self.getSpreadsheetAsTable(spreadsheet, no_empty_lines)
Kevin Deldycke's avatar
Kevin Deldycke committed
202
      if new_table != None:
203
        tables = self._getTableListUnion(tables, new_table)
Kevin Deldycke's avatar
Kevin Deldycke committed
204 205 206 207
    return tables


  security.declarePublic('getSpreadsheetAsTable')
208
  def getSpreadsheetAsTable(self, spreadsheet=None, no_empty_lines=False):
Kevin Deldycke's avatar
Kevin Deldycke committed
209 210 211 212
    """
      This method convert an OpenOffice spreadsheet to a simple table.
      This code is base on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt).
    """
213
    if spreadsheet == None or spreadsheet.nodeName != 'table:table':
Kevin Deldycke's avatar
Kevin Deldycke committed
214 215
      return None

216
    table = []
Kevin Deldycke's avatar
Kevin Deldycke committed
217

218 219 220
    # Get the table name
    table_name = spreadsheet.getAttributeNS(self.ns["table"], "name")

221 222 223 224
    # Store informations on column widths
    line_number = 0
    for column in spreadsheet.getElementsByTagName("table:table-column"):
      repeated = column.getAttributeNS(self.ns["table"], "number-columns-repeated")
Kevin Deldycke's avatar
Kevin Deldycke committed
225

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
    # Scan table and store usable informations
    for line in spreadsheet.getElementsByTagName("table:table-row"):
      repeated_lines = line.getAttributeNS(self.ns["table"], "number-rows-repeated")
      if not repeated_lines:
        repeated_lines = 1
      else:
        repeated_lines = int(repeated_lines)

      for i in range(repeated_lines):
        table_line = []
        col_number = 0

        for cell in line.getElementsByTagName("table:table-cell"):
          repeated_cells = cell.getAttributeNS(self.ns["table"], "number-columns-repeated")
          if not repeated_cells:
            repeated_cells = 1
          else:
            repeated_cells = int(repeated_cells)

          for j in range(repeated_cells):
            cell_text = None
            text_tags = cell.getElementsByTagName("text:p")

            for text in text_tags:
              for k in range(text.childNodes.length):
                child = text.childNodes[k]
                if child.nodeType == Node.TEXT_NODE:
                  if cell_text == None:
                    cell_text = ''
                  cell_text += child.nodeValue

            table_line.append(cell_text)
            col_number += 1

        table.append(table_line)
        line_number += 1

    # Reduce the table to the minimum
    text_min_bounds = self._getTableMinimalBounds(table)
    table = self._setTableBounds( table
                                , width  = text_min_bounds['width']
                                , height = text_min_bounds['height']
                                )
    if no_empty_lines:
      table = self._deleteTableEmptyLines(table)
271
    return {table_name: table}
Kevin Deldycke's avatar
Kevin Deldycke committed
272 273 274


  security.declarePrivate('_getTableMinimalBounds')
275
  def _getTableMinimalBounds(self, table):
Kevin Deldycke's avatar
Kevin Deldycke committed
276
    """
277
      Calcul the minimum size of a table
Kevin Deldycke's avatar
Kevin Deldycke committed
278 279 280 281 282
    """
    empty_lines = 0
    no_more_empty_lines = 0

    # Eliminate all empty cells at the ends of lines and columns
283
    for line in range(len(table)-1, -1, -1):
Kevin Deldycke's avatar
Kevin Deldycke committed
284
      empty_cells = 0
285
      line_content = table[line]
Kevin Deldycke's avatar
Kevin Deldycke committed
286
      for cell in range(len(line_content)-1, -1, -1):
287
        if line_content[cell] in ('', None):
Kevin Deldycke's avatar
Kevin Deldycke committed
288 289 290 291 292 293 294
          empty_cells += 1
        else:
          break
      if (not no_more_empty_lines) and (empty_cells == len(line_content)):
        empty_lines += 1
      else:
        line_size = len(line_content) - empty_cells
295
        table[line] = line_content[:line_size]
Kevin Deldycke's avatar
Kevin Deldycke committed
296 297
        no_more_empty_lines = 1

298 299
    texts_size = len(table) - empty_lines
    table = table[:texts_size]
Kevin Deldycke's avatar
Kevin Deldycke committed
300 301 302

    # Determine minimum bounds
    max_cols = 0
303 304 305 306
    for line in range(len(table)):
      line_content = table[line]
      if len(line_content) > max_cols:
        max_cols = len(line_content)
Kevin Deldycke's avatar
Kevin Deldycke committed
307

308 309 310
    return { 'width' : max_cols
           , 'height': len(table)
           }
Kevin Deldycke's avatar
Kevin Deldycke committed
311 312 313


  security.declarePrivate('_setTableBounds')
314
  def _setTableBounds(self, table, width=0, height=0):
Kevin Deldycke's avatar
Kevin Deldycke committed
315 316 317
    """
      Enlarge a text table to given bounds
    """
318 319
    while height > len(table):
      table.append([])
Kevin Deldycke's avatar
Kevin Deldycke committed
320
    for line in range(height):
321 322 323 324
      while width > len(table[line]):
        table[line].append(None)
    return table

Kevin Deldycke's avatar
Kevin Deldycke committed
325

326 327 328
  security.declarePrivate('_deleteTableEmptyLines')
  def _deleteTableEmptyLines(self, table):
    """
329
      Delete table empty lines.
330 331 332 333 334 335 336 337 338 339
    """
    new_table = []
    for line in table:
      empty_cell = 0
      for cell in line:
        if cell == None:
          empty_cell += 1
      if empty_cell != len(line):
        new_table.append(line)
    return new_table
Kevin Deldycke's avatar
Kevin Deldycke committed
340 341


342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
  security.declarePrivate('_getTableListUnion')
  def _getTableListUnion(self, list1, list2):
    """
      Coerce two dict containing tables structures.
      We need to use this method because a OpenOffice document can hold
        several embedded spreadsheets with the same id. This explain the
        use of random suffix in such extreme case.
    """
    for list2_key in list2.keys():
      # Generate a new table ID if needed
      new_key = list2_key
      while new_key in list1.keys():
        new_key = list2_key + '_' + str(random.randint(1000,9999))
      list1[new_key] = list2[list2_key]
    return list1


Kevin Deldycke's avatar
Kevin Deldycke committed
359 360
# Register OOoParser once the class is fully defined.
# NOTE(review): InitializeClass is presumably what applies the declarations
# collected in OOoParser.security -- confirm against the Zope documentation.
InitializeClass(OOoParser)
# NOTE(review): allow_class presumably makes OOoParser usable from restricted
# code such as Python Scripts -- confirm against Products.PythonScripts.
allow_class(OOoParser)