OOoDocument.py 19.1 KB
Newer Older
Bartek Górny's avatar
Bartek Górny committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

28
import xmlrpclib, base64, re, zipfile, cStringIO
29
from warnings import warn
30
from xmlrpclib import Fault
31 32
from xmlrpclib import Transport
from xmlrpclib import SafeTransport
Bartek Górny's avatar
Bartek Górny committed
33 34
from AccessControl import ClassSecurityInfo
from OFS.Image import Pdata
35
from Products.CMFCore.utils import getToolByName, _setCacheHeaders
Bartek Górny's avatar
Bartek Górny committed
36 37
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
38
from Products.ERP5.Document.File import File
39
from Products.ERP5.Document.Document import ConversionCacheMixin, ConversionError
40
from Products.ERP5.Document.File import _unpackData
41
from zLOG import LOG, ERROR
42

Bartek Górny's avatar
Bartek Górny committed
43 44 45
# Short aliases for the base64 helpers: `enc` is applied to the payloads
# sent to the conversion server, `dec` to the data found in its responses.
enc=base64.encodestring
dec=base64.decodestring

46
# Module-level marker object. It is not referenced anywhere in the visible
# part of this module.
_MARKER = []
47
# Image formats recognised by OOoDocument.convert(): a requested format from
# this tuple is matched against the server's target formats by suffix, and it
# is the only kind of format for which the `display` argument is honoured.
STANDARD_IMAGE_FORMAT_LIST = ('png', 'jpg', 'gif', )
48

49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71

class TimeoutTransport(SafeTransport):
  """An xmlrpc transport with a configurable socket timeout.

  timeout -- value passed to the socket's settimeout() right after the
             request headers have been sent; a false value (None, 0)
             leaves the socket untouched.
  scheme  -- 'http' builds the connection with Transport.make_connection,
             any other value uses SafeTransport.make_connection.
  """
  def __init__(self, timeout=None, scheme='http'):
    # Let the base transport initialise its own state (for instance the
    # _use_datetime flag on Python >= 2.5). Very old xmlrpclib versions
    # define no constructor at all, hence the guarded lookup.
    base_init = getattr(SafeTransport, '__init__', None)
    if base_init is not None:
      base_init(self)
    self._timeout = timeout
    self._scheme = scheme

  def send_content(self, connection, request_body):
    """Send the XML-RPC headers and body, applying the timeout in between."""
    connection.putheader("Content-Type", "text/xml")
    connection.putheader("Content-Length", str(len(request_body)))
    connection.endheaders()
    if self._timeout:
      # The timeout is set once the headers are out, i.e. when the
      # underlying socket exists.
      # NOTE(review): relies on the private `_conn` attribute of the
      # connection object handed in by xmlrpclib - confirm it exists on the
      # Python version in use.
      connection._conn.sock.settimeout(self._timeout)
    if request_body:
      connection.send(request_body)

  def make_connection(self, h):
    """Return a plain or a secure connection to host `h`, per self._scheme."""
    if self._scheme == 'http':
      return Transport.make_connection(self, h)
    return SafeTransport.make_connection(self, h)


72
class OOoDocument(File, ConversionCacheMixin):
  """
    A file document able to convert OOo compatible files to
    any OOo supported format, to capture metadata and to
    update metadata in OOo documents.

    This class can be used:

    - to create an OOo document database with powerful indexing (r/o)
      and metadata handling (r/w) features (ex. change title in ERP5 ->
      title is changed in OOo document)

    - to massively convert MS Office documents to OOo format

    - to easily keep snapshots (in PDF and/or OOo format) of OOo documents
      generated from OOo templates

    This class may be used in the future:

    - to create editable OOo templates (ex. by adding tags in WYSIWYG mode
      and using tags to make document dynamic - ask kevin for more info)

    - to automatically sign / encrypt OOo documents based on user

    - to automatically sign / encrypt PDF generated from OOo documents
      based on user

    This class should not be used:

    - to store files in formats not supported by OOo

    - to store pure images (use Image for that)

    - as a general file conversion system (use portal_transforms for that)

    TODO:
    - better permissions
  """
  # CMF Type Definition
  meta_type = 'ERP5 OOo Document'
  portal_type = 'OOo Document'
  isPortalContent = 1
  isRADContent = 1

  searchable_property_list = ('asText', 'title', 'description', 'id', 'reference',
                              'version', 'short_title',
                              'subject', 'source_reference', 'source_project_title',)

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.Reference
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Document
                    , PropertySheet.Snapshot
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
                    )

  # Regular expressions for stripping xml from ODF documents. Raw strings
  # keep the backslash escapes out of Python's own string-escape handling.
  # rx_strip matches one whole tag, rx_compr a run of whitespace.
  rx_strip = re.compile(r'<[^>]*?>', re.DOTALL|re.MULTILINE)
  rx_compr = re.compile(r'\s+')
140

141 142 143 144 145
  def _setFile(self, data, precondition=None):
    """
      Store the file through File._setFile, then remove the `base_data`
      attribute whenever hasBaseData() reports one.
      (Presumably so that the base format is regenerated from the new
      file - confirm against the callers of hasBaseData.)
    """
    File._setFile(self, data, precondition=precondition)
    if not self.hasBaseData():
      return
    # This is a hack - XXX - new accessor needed to delete properties
    del self.base_data
146

147
  security.declareProtected(Permissions.View, 'index_html')
  def index_html(self, REQUEST, RESPONSE, format=None, display=None, **kw):
    """
      Default renderer with conversion support.

      format  -- a string naming the wanted format; the available formats
                 can be obtained by calling getTargetFormatItemList. When
                 it is None the original file is served by
                 File.index_html.
      display -- passed through to convert(), like any extra keyword.

      Returns the converted data after setting the Content-Length,
      Content-Type and Accept-Ranges headers on RESPONSE.
    """
    # Accelerate rendering in Web mode
    _setCacheHeaders(self, {'format' : format})
    if format is not None:
      # Make sure file is converted to base format
      if not self.hasBaseData():
        self.convertToBaseFormat()
      # Convert the document and return it
      mime, result = self.convert(format=format, display=display, **kw)
      if not mime:
        # No mime type came back: guess it from the format used as extension
        registry = getToolByName(self, 'mimetypes_registry')
        mime = registry.lookupExtension('name.%s' % format)
      header_item_list = (('Content-Length', len(result)),
                          ('Content-Type', mime),
                          ('Accept-Ranges', 'bytes'))
      for header_name, header_value in header_item_list:
        RESPONSE.setHeader(header_name, header_value)
      return result
    # Return the original file by default
    return File.index_html(self, REQUEST, RESPONSE)

171
  # Format conversion implementation
172
  def _getServerCoordinate(self):
    """
      Return the (address, port) pair of the oood conversion server
      as defined in preferences.

      Raises ConversionError when either value is '' or None.
    """
    preference_tool = getToolByName(self, 'portal_preferences')
    coordinate = (preference_tool.getPreferredOoodocServerAddress(),
                  preference_tool.getPreferredOoodocServerPortNumber())
    for value in coordinate:
      if value in ('', None):
        raise ConversionError('[DMS] Can not proceed with conversion:'
              ' conversion server host and port is not defined in preferences')
    return coordinate
Bartek Górny's avatar
Bartek Górny committed
184 185 186

  def _mkProxy(self):
    """
      Build and return the XML-RPC proxy used to talk to the conversion
      server. The proxy targets the http URL made of the preferred
      address and port and uses a TimeoutTransport with a timeout of 360.
    """
    address, port = self._getServerCoordinate()
    server_url = 'http://%s:%d' % (address, port)
    transport = TimeoutTransport(timeout=360, scheme='http')
    return xmlrpclib.ServerProxy(server_url,
                                 allow_none=True,
                                 transport=transport)
Bartek Górny's avatar
Bartek Górny committed
194

195 196
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getTargetFormatItemList')
  def getTargetFormatItemList(self):
    """
      Returns a list of acceptable formats for conversion
      in the form of tuples (for listfield in ERP5Form)

      The document is converted to its base format first if needed, and
      the list is cached per base content type through CachingMethod.

      Raises ConversionError when the server answers with a response code
      other than 200.

      NOTE: it is the responsibility of the conversion server
      to provide an extensive list of conversion formats.
    """
    if not self.hasBaseData():
      self.convertToBaseFormat()

    def cached_getTargetFormatItemList(content_type):
      server_proxy = self._mkProxy()
      try:
        allowed_target_item_list = server_proxy.getAllowedTargetItemList(
                                                      content_type)
      except Fault:
        # The server does not know getAllowedTargetItemList: fall back to
        # the older getAllowedTargets call.
        allowed = server_proxy.getAllowedTargets(content_type)
        warn('Your oood version is too old, using old method '
             'getAllowedTargets instead of getAllowedTargetItemList',
             DeprecationWarning)
      else:
        try:
          response_code, response_dict, response_message = \
                                             allowed_target_item_list
        except ValueError:
          # Compatibility with older oood where getAllowedTargetItemList only
          # returned response_dict
          response_code, response_dict, response_message = \
                         200, dict(response_data=allowed_target_item_list), ''

        if response_code != 200:
          # This is very temporary code - XXX needs to be changed
          # so that the system can retry
          raise ConversionError("[DMS] Can not get list of allowed acceptable"
                                " formats for conversion: %s (%s)" % (
                                      response_code, response_message))
        allowed = response_dict['response_data']

      # tuple order is reversed to be compatible with ERP5 Form
      return [(y, x) for x, y in allowed]

    # Cache valid format list
    cached_getTargetFormatItemList = CachingMethod(
                                cached_getTargetFormatItemList,
                                id="OOoDocument_getTargetFormatItemList",
                                cache_factory='erp5_ui_medium')

    return cached_getTargetFormatItemList(self.getBaseContentType())

248 249
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getTargetFormatTitleList')
  def getTargetFormatTitleList(self):
    """
      Returns the first element of every tuple provided by
      getTargetFormatItemList (the titles, going by the method name).
    """
    return [item[0] for item in self.getTargetFormatItemList()]

256 257
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getTargetFormatList')
  def getTargetFormatList(self):
    """
      Returns the second element of every tuple provided by
      getTargetFormatItemList, i.e. the format identifiers which
      convert() and isTargetFormatAllowed() compare against.
    """
    return [item[1] for item in self.getTargetFormatItemList()]
Bartek Górny's avatar
Bartek Górny committed
263

264 265
  security.declareProtected(Permissions.ModifyPortalContent,
                            'isTargetFormatAllowed')
  def isTargetFormatAllowed(self, format):
    """
      Tells whether `format` is one of the entries returned by
      getTargetFormatList, i.e. whether the current document can be
      converted into that target format.
    """
    allowed_format_list = self.getTargetFormatList()
    return format in allowed_format_list

  security.declarePrivate('_convert')
  def _convert(self, format):
    """
      Communicates with server to convert a file.

      format -- target format passed to the server's run_generate, or the
                special value 'text-content', which is handled locally by
                stripping the markup of content.xml in the base (ODF) data.

      Returns a (mime type, data) pair; the data is a plain string for
      'text-content' and a Pdata object otherwise.

      Raises ConversionError when the server answers with a response code
      other than 200.
    """
    if not self.hasBaseData():
      self.convertToBaseFormat()
    if format == 'text-content':
      # Extract text from the ODF file
      cs = cStringIO.StringIO()
      try:
        cs.write(_unpackData(self.getBaseData()))
        z = zipfile.ZipFile(cs)
        try:
          s = z.read('content.xml')
        finally:
          z.close()
      finally:
        # Release the buffer even if the archive could not be read
        cs.close()
      s = self.rx_strip.sub(" ", s) # strip xml
      s = self.rx_compr.sub(" ", s) # compress multiple spaces
      return 'text/plain', s
    server_proxy = self._mkProxy()

    generate_result = server_proxy.run_generate(self.getId(),
                                       enc(_unpackData(self.getBaseData())),
                                       None,
                                       format)
    if isinstance(generate_result, dict):
      # Backward compatibility with older oood versions returning only
      # response_dict. Tested explicitly because a dict holding exactly
      # three keys would otherwise be unpacked into its keys without error.
      response_code, response_dict, response_message = 200, generate_result, ''
    else:
      try:
        response_code, response_dict, response_message = generate_result
      except ValueError:
        # Not a (code, dict, message) triple: treat it like the old reply
        response_code, response_dict, response_message = \
                                                      200, generate_result, ''

    if response_code != 200:
      # Report a server side failure explicitly instead of failing later
      # on a missing key of response_dict.
      raise ConversionError(
                "[DMS] Error converting document to format %s: %s (%s)"
                           % (format, response_code, response_message))
    return response_dict['mime'], Pdata(dec(response_dict['data']))
306

307
  # Conversion API
308
  security.declareProtected(Permissions.View, 'convert')
  def convert(self, format, display=None, **kw):
    """Convert the document to the given format.

    If a conversion is already stored for this format, it is returned
    directly, otherwise the conversion is stored for the next time.

    format  -- 'base-data', 'pdf', 'html', one of
               STANDARD_IMAGE_FORMAT_LIST, a text format ('txt', 'text',
               'text-content') or any format listed by getTargetFormatList.
    display -- only taken into account when `format` is one of
               STANDARD_IMAGE_FORMAT_LIST; the converted data is then passed
               through a temporary Image and stored per display.

    Returns what getConversion() returns for the stored conversion, except
    for the shortcuts: ('text/plain', '') for an empty document, the base
    content type and data for 'base-data', and the local text extraction
    when the server offers no text format.

    Raises ConversionError when the requested format is not supported.
    """
    #XXX if document is empty, stop to try to convert.
    #XXX but I don't know what is a appropriate mime-type.(Yusei)
    if self.get_size() == 0:
      return 'text/plain', ''

    if format == 'base-data':
      if not self.hasBaseData():
        self.convertToBaseFormat()
      return self.getBaseContentType(), self.getBaseData()

    original_format = format
    # The display is only meaningful for standard image formats
    use_display = (display is not None
                   and original_format in STANDARD_IMAGE_FORMAT_LIST)

    def getFirstFormat(candidate_list):
      # Fail with a conversion error rather than an IndexError when the
      # server offers no target format matching the request.
      if not candidate_list:
        raise ConversionError("[DMS] Target format %s is not supported"
                              % original_format)
      return candidate_list[0]

    # Make sure we can support html and pdf by default
    is_html = 0
    if format == 'pdf':
      format = getFirstFormat([x for x in self.getTargetFormatList()
                                          if x.endswith('pdf')])
    elif format in STANDARD_IMAGE_FORMAT_LIST:
      format = getFirstFormat([x for x in self.getTargetFormatList()
                                          if x.endswith(format)])
    elif format == 'html':
      format = getFirstFormat([x for x in self.getTargetFormatList()
                              if x.startswith('html') or x.endswith('html')])
      is_html = 1
    elif format in ('txt', 'text', 'text-content'):
      format_list = self.getTargetFormatList()
      # NOTE(review): 'txt' wins over 'text', whatever text format was
      # requested; this keeps the historical behaviour.
      if 'txt' in format_list:
        format = 'txt'
      elif 'text' in format_list:
        format = 'text'
      else:
        # No text format on the server side: extract the text locally.
        # _convert already returns the ('text/plain', text) pair.
        return self._convert(format='text-content')
    # Raise an error if the format is not supported
    if not self.isTargetFormatAllowed(format):
      raise ConversionError("[DMS] Target format %s is not supported" % format)
    # Check if we have already a base conversion
    if not self.hasBaseData():
      self.convertToBaseFormat()
    # Key under which the conversion is looked up, stored and returned
    if use_display:
      conversion_kw = dict(format=format, display=display)
    else:
      conversion_kw = dict(format=format)
    if not self.hasConversion(**conversion_kw):
      # Do real conversion
      mime, data = self._convert(format)
      if is_html:
        # Extra processing required since
        # we receive a zip file
        cs = cStringIO.StringIO()
        try:
          cs.write(_unpackData(data))
          z = zipfile.ZipFile(cs)
          try:
            for f in z.infolist():
              fn = f.filename
              if fn.endswith('html'):
                data = z.read(fn)
                break
            mime = 'text/html'
            self.populateContent(zip_file=z)
          finally:
            z.close()
        finally:
          cs.close()
      if use_display:
        # Resize through a temporary Image document
        temp_image = self.portal_contributions.newContent(
                                       portal_type='Image',
                                       temp_object=1)
        temp_image._setData(data)
        mime, data = temp_image.convert(format, display=display)
      self.setConversion(data, mime, **conversion_kw)
    # Return converted file
    return self.getConversion(**conversion_kw)

392 393 394 395 396 397 398
  security.declareProtected(Permissions.View, 'asTextContent')
  def asTextContent(self):
    """
      Extract plain text from ooo docs by stripping the XML file.
      This is the simplest way, the most universal and it is compatible
      with all formats.

      The value is whatever _convert returns for the 'text-content'
      format, i.e. a ('text/plain', text) pair.
    """
    text_content = self._convert(format='text-content')
    return text_content
400

401 402
  security.declareProtected(Permissions.ModifyPortalContent,
                            'populateContent')
  def populateContent(self, zip_file=None):
    """
    Extract content from the ODF zip file and populate the document.
    Optional parameter zip_file prevents from converting content twice.

    Every archive member whose name does not end with 'html' replaces the
    sub-object of the same id, if any, with a new one created through
    portal_contributions.

    When zip_file is not given, the document is converted to the first
    target format starting with 'html' and the resulting archive is opened
    and closed here; ConversionError is raised if no such format exists.
    A zip_file passed by the caller is left open.
    """
    must_close = zip_file is None
    archive_file = None
    try:
      if must_close:
        format_list = [x for x in self.getTargetFormatList()
                                                  if x.startswith('html')]
        if not format_list:
          # Explicit error instead of an IndexError on format_list[0]
          raise ConversionError("[DMS] Target format html is not supported")
        unused_mime, data = self._convert(format_list[0])
        archive_file = cStringIO.StringIO()
        archive_file.write(_unpackData(data))
        zip_file = zipfile.ZipFile(archive_file)
      for f in zip_file.infolist():
        file_name = f.filename
        if file_name.endswith('html'):
          continue
        document = self.get(file_name, None)
        if document is not None:
          self.manage_delObjects([file_name])
        self.portal_contributions.newContent(id=file_name, container=self,
                                             file_name=file_name,
                                             data=zip_file.read(file_name))
    finally:
      # Only release what was opened here, even when an error occurred
      if must_close:
        if zip_file is not None:
          zip_file.close()
        if archive_file is not None:
          archive_file.close()

  # Base format implementation
  security.declarePrivate('_convertToBaseFormat')
  def _convertToBaseFormat(self):
    """
      Converts the original document into ODF
      by invoking the conversion server. Store the result
      on the object. Update metadata information.

      On a response code of 200 the decoded data becomes the base data,
      the returned metadata is kept in _base_metadata and its 'MIMEType'
      entry, when present, becomes the base content type.
      Any other response code is logged and raised as ConversionError.
    """
    server_proxy = self._mkProxy()
    file_name = self.getSourceReference() or self.getId()
    encoded_data = enc(_unpackData(self.getData()))
    response_code, response_dict, response_message = \
                          server_proxy.run_convert(file_name, encoded_data)
    if response_code != 200:
      error_detail = (response_code, response_message)
      # log and raise errors with converting server.
      LOG('ERP5OOo', ERROR,
          '[DMS] Error converting document to base format %s:%s'
                    % error_detail)
      # Explicitly raise the exception!
      raise ConversionError(
                "[DMS] Error converting document to base format %s:%s:"
                                       % error_detail)
    # successfully converted document
    self._setBaseData(dec(response_dict['data']))
    metadata = response_dict['meta']
    self._base_metadata = metadata
    mime_type = metadata.get('MIMEType', None)
    if mime_type is not None:
      self._setBaseContentType(mime_type)
Bartek Górny's avatar
Bartek Górny committed
460

461 462
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
      Returns the metadata extracted by the conversion
      server, as stored in _base_metadata by _convertToBaseFormat.
    """
    base_metadata = self._base_metadata
    return base_metadata
Bartek Górny's avatar
Bartek Górny committed
469

470 471
  security.declareProtected(Permissions.ModifyPortalContent,
                            'updateBaseMetadata')
  def updateBaseMetadata(self, *arg, **kw):
    """
      Updates metadata information in the converted OOo document
      based on the values provided by the user. This is implemented
      through the invocation of the conversion server.

      The keyword arguments are sent as the metadata to set; positional
      arguments are accepted but not used. On a response code of 200 the
      returned data replaces the base data, any other response code is
      logged and raised as ConversionError.
    """
    server_proxy = self._mkProxy()
    document_id = self.getId()
    encoded_base_data = enc(_unpackData(self.getBaseData()))
    response_code, response_dict, response_message = \
          server_proxy.run_setmetadata(document_id, encoded_base_data, kw)
    if response_code != 200:
      error_detail = (response_code, response_message)
      # log and raise errors with converting server.
      LOG('ERP5OOo', ERROR, "[DMS] Error getting document's metadata %s:%s"
                        % error_detail)
      # Explicitly raise the exception!
      raise ConversionError("[DMS] Error getting document's metadata %s:%s"
                        % error_detail)
    # the server accepted the metadata: keep the updated document
    self._setBaseData(dec(response_dict['data']))