document.erp5.PDFDocument.py 14.3 KB
Newer Older
1
# -*- coding: utf-8 -*-
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

29
import tempfile, os, pickle
30

31
import zope.interface
32
from AccessControl import ClassSecurityInfo
33

34 35
from Products.ERP5Type import Permissions, PropertySheet
from erp5.component.interface.IWatermarkable import IWatermarkable
36
from erp5.component.document.Image import Image
37
from erp5.component.document.Document import ConversionError
38
from subprocess import Popen, PIPE
39
from zLOG import LOG, INFO, PROBLEM
Nicolas Dumazet's avatar
Nicolas Dumazet committed
40
import errno
41
from StringIO import StringIO
42

43
@zope.interface.implementer(IWatermarkable)
44
class PDFDocument(Image):
45
  """
46 47 48
  PDFDocument is a subclass of Image which is able to
  extract text content from a PDF file either as text
  or as HTML.
49 50
  """
  # CMF Type Definition
51
  meta_type = 'ERP5 PDF Document'
52 53 54 55 56 57 58 59
  portal_type = 'PDF'

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
60
                    , PropertySheet.XMLObject
61 62 63 64 65 66
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.Data
67 68 69
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
70 71
                    )

72 73 74 75 76 77 78 79 80 81 82 83
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getWatermarkedData')
  def getWatermarkedData(self, watermark_data, repeat_watermark=True,
                         watermark_start_page=0, **kw):
    """See interface

    Merge the pages of a watermark PDF onto the pages of this document and
    return the resulting PDF data as a string.

    * watermark_data is the PDF data (as a string) to use as a watermark.
    * If repeat_watermark is true, the watermark pages are cycled over all
      the pages starting at watermark_start_page, otherwise each watermark
      page is applied only once.
    * Watermark is applied on pages starting at watermark_start_page (this
      index is 0 based).

    Raises ValueError if watermark_data is empty or if this document has no
    data. Returns None when PyPDF2 cannot be imported.
    """
    try:
      from PyPDF2 import PdfFileWriter, PdfFileReader
    except ImportError:
      # PyPDF2 is optional: without it nothing can be watermarked.
      return None

    if not watermark_data:
      raise ValueError("watermark_data cannot be empty")
    if not self.hasData():
      raise ValueError("Cannot watermark an empty document")

    self_reader = PdfFileReader(StringIO(self.getData()))
    watermark_reader = PdfFileReader(StringIO(watermark_data))
    watermark_page_count = watermark_reader.getNumPages()

    output = PdfFileWriter()
    for page_number in range(self_reader.getNumPages()):
      self_page = self_reader.getPage(page_number)
      # Index of the watermark page matching this document page; negative
      # while we are still before watermark_start_page.
      watermark_index = page_number - watermark_start_page
      if watermark_index >= 0:
        # Only wrap around when there is something to cycle over, a
        # watermark without any page must not cause a division by zero.
        if repeat_watermark and watermark_page_count:
          watermark_index %= watermark_page_count
        if watermark_index < watermark_page_count:
          self_page.mergePage(watermark_reader.getPage(watermark_index))
      output.addPage(self_page)

    outputStream = StringIO()
    output.write(outputStream)
    return outputStream.getvalue()
115

116
  # Conversion API
117
  def _convert(self, format, **kw):  # pylint: disable=redefined-builtin
    """
    Convert this PDF to the requested format and return (mime type, data).

    * 'html', 'txt' / 'text' and 'djvu' / 'DJVU' are looked up in the
      conversion cache first; on a cache miss the conversion is computed
      and stored before being returned.
    * '', None and 'pdf' return the original content.
    * any other format is delegated to Image._convert, using the first
      page only unless a frame was given.
    """
    if format == 'html':
      cache_format, mime, converter_id = format, 'text/html', '_convertToHTML'
    elif format in ('txt', 'text'):
      cache_format, mime, converter_id = 'txt', 'text/plain', '_convertToText'
    elif format in ('djvu', 'DJVU'):
      cache_format, mime, converter_id = 'djvu', 'image/vnd.djvu', '_convertToDJVU'
    elif format in ('', None, 'pdf'):
      # return original content
      return self.getContentType(), self.getData()
    else:
      # when converting to image from PDF we care for first page only
      # this will make sure that only first page is used and not whole
      # content of PDF file read & converted which is a performance issue
      if kw.get('frame', None) is None:
        kw['frame'] = 0
      return Image._convert(self, format, **kw)

    # Shared cache-or-compute path of the text based conversions
    try:
      return self.getConversion(format=cache_format)
    except KeyError:
      data = getattr(self, converter_id)()
      self.setConversion(data, mime=mime, format=cache_format)
      return (mime, data)
155 156 157

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self):
    """
      Placeholder for converting each page to an Image and populating
      the PDF directory with the converted images (may be useful to
      provide an online PDF reader).

      Not implemented: always raises NotImplementedError.
    """
    raise NotImplementedError()
164 165

  security.declarePrivate('_convertToText')
166
  def _convertToText(self, format='txt'):  # pylint: disable=redefined-builtin
    """Convert the PDF to text

    If the PDF have text, return the text, otherwise try to do OCR: first
    with the ghostscript `ocr` device and, if the `gs` command is not
    installed, page by page through portal_transforms on PNG renderings.

    Returns an empty string when the document has no data and, when PyPDF2
    is available, when the PDF is encrypted or cannot be read by PyPDF2.
    Raises ConversionError if ghostscript exits with an error or if
    portal_transforms fails to convert a page.

    The format parameter is not used by this implementation.
    """
    if not self.hasData():
      return ''
    data = str(self.getData())
    try:
      from PyPDF2 import PdfFileReader
      from PyPDF2.utils import PdfReadError
    except ImportError:
      # PyPDF2 is optional, skip the encryption check without it
      pass
    else:
      try:
        if PdfFileReader(StringIO(data)).isEncrypted:
          return ''
      except PdfReadError:
        return ''

    mime_type = 'text/plain'
    portal_transforms = self.getPortalObject().portal_transforms
    result = portal_transforms.convertToData(mime_type, data,
                                             context=self,
                                             filename=self.getFilename(),
                                             mimetype=self.getContentType())
    if result:
      return result

    # Try to use OCR from ghostscript, but tolerate that the command might
    # not be available.
    command = [
        'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE',
        '-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-'
    ]
    try:
      process = Popen(
          command,
          stdin=PIPE,
          stdout=PIPE,
          stderr=PIPE,
          close_fds=True,
      )
    except OSError as e:
      # ENOENT means that gs is not installed: use the fallback below.
      if e.errno != errno.ENOENT:
        raise
    else:
      output, error = process.communicate(data)
      if process.returncode:
        raise ConversionError(
            "Error invoking ghostscript.\noutput:%s\nerror:%s" % (output, error))
      return output.strip()

    # We don't have ghostscript, fallback to the expensive pipeline using:
    #   pdf -- (Image._convert imagemagick) --> png
    #       -- (PortalTransforms.png_to_tiff imagemagick) --> tiff
    #       -- (PortalTransforms.tiff_to_text tesseract) --> text
    #
    # As high dpi images are required, it may take some times to convert the
    # pdf.
    # It may be required to use activities to fill the cache and at the end,
    # to calculate the final result
    text_list = []
    content_information = self.getContentInformation()
    page_count = int(content_information.get('Pages', 0))
    for page_number in range(page_count):
      src_mimetype, png_data = self._convert(
          'png', quality=100, resolution=300,
          frame=page_number, display='identical')
      if not src_mimetype.endswith('png'):
        # This page could not be rendered as PNG, nothing to OCR
        continue
      result = portal_transforms.convertToData(
          mime_type, str(png_data),
          context=self,
          filename=self.getStandardFilename(format='png'),
          mimetype=src_mimetype)
      if result is None:
        raise ConversionError('PDFDocument conversion error. '
                              'portal_transforms failed to convert to %s: %r' % (mime_type, self))
      text_list.append(result)
    return ''.join(text_list)

253
  security.declareProtected(Permissions.AccessContentsInformation, 'getSizeFromImageDisplay')
254 255 256 257 258 259 260 261 262 263 264 265
  def getSizeFromImageDisplay(self, image_display):
    """
    Return the size to use for the image display named `image_display`.

    The special display 'identical' returns the document's own
    (width, height); every other name is delegated to
    Image.getSizeFromImageDisplay.
    """
    if image_display != 'identical':
      return Image.getSizeFromImageDisplay(self, image_display)
    # 'identical' can be considered as a hack, in order not to resize the
    # image to prevent text distortion when using OCR.
    # A cleaner API is required.
    return (self.getWidth(), self.getHeight())
266 267 268

  security.declarePrivate('_convertToHTML')
  def _convertToHTML(self):
    """Convert the PDF text content to HTML with pdftohtml

    The document data is written to a temporary file which is passed to
    the `pdftohtml` command; its standard output is returned after removing
    the default background color and making links relative.

    Returns an empty string when the document has no data.
    Raises ConversionError if the pdftohtml command is not found.
    """
    if not self.hasData():
      return ''

    # The context manager removes the temporary file in every case
    with tempfile.NamedTemporaryFile() as tmp:
      tmp.write(self.getData())
      # pdftohtml opens the file by name: make sure the data is written out
      tmp.flush()
      tmp_basename = os.path.basename(tmp.name)
      command = ['pdftohtml', '-enc', 'UTF-8', '-stdout',
                 '-noframes', '-i', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError as e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdftohtml was not found')
        raise

    # Quick hack to remove bg color - XXX
    h = command_result.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
    # Make links relative
    h = h.replace('href="%s.html' % tmp_basename, 'href="asEntireHTML')
    return h

Jean-Paul Smets's avatar
Jean-Paul Smets committed
297 298
  security.declarePrivate('_convertToDJVU')
  def _convertToDJVU(self):
    """Convert the PDF text content to DJVU with pdf2djvu

    The document data is written to a temporary file which is passed to
    the `pdf2djvu` command; its standard output is returned.

    Returns an empty string when the document has no data.
    Raises ConversionError if the pdf2djvu command is not found.
    """
    if not self.hasData():
      return ''

    # The context manager removes the temporary file in every case
    with tempfile.NamedTemporaryFile() as tmp:
      tmp.write(self.getData())
      # pdf2djvu opens the file by name: make sure the data is written out
      tmp.flush()
      command = ['pdf2djvu', tmp.name]
      try:
        return Popen(command, stdout=PIPE).communicate()[0]
      except OSError as e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdf2djvu was not found')
        raise

321 322
  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """Returns the information about the PDF document with pdfinfo.

    The result is a dictionary built from the "key: value" lines printed by
    `pdfinfo -meta -box`, completed (without overriding existing keys) by
    the document info read with PyPDF2 when this library is available.

    Returns an empty dictionary when the document has no data.
    Raises ConversionError if the pdfinfo command is not found.

    The result is cached on the document as `_content_information`; callers
    always receive a copy.
    """
    if not self.hasData():
      return {}
    try:
      return self._content_information.copy() # pylint: disable=access-member-before-definition
    except AttributeError:
      pass

    result = {}
    # The context manager removes the temporary file in every case
    with tempfile.NamedTemporaryFile() as tmp:
      tmp.write(self.getData())
      # pdfinfo opens the file by name: make sure the data is written out
      tmp.flush()
      # PyPDF2 reads from this file object below
      tmp.seek(0)

      # First, we use pdfinfo to get standard metadata
      command = ['pdfinfo', '-meta', '-box', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError as e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdfinfo was not found')
        raise

      for line in command_result.splitlines():
        # Only the first colon separates the key from the value
        key, _, value = line.partition(':')
        result[key.strip()] = value.strip()

      # Then we use PyPDF2 to get extra metadata
      try:
        from PyPDF2 import PdfFileReader
        from PyPDF2.utils import PdfReadError
      except ImportError:
        # if PyPDF2 not found, pass
        pass
      else:
        try:
          pdf_file = PdfFileReader(tmp)
          for info_key, info_value in (pdf_file.getDocumentInfo() or {}).iteritems():
            info_key = info_key.lstrip("/")
            if isinstance(info_value, unicode):
              info_value = info_value.encode("utf-8")

            # Ignore values that cannot be pickled ( such as AAPL:Keywords )
            try:
              pickle.dumps(info_value)
            except pickle.PicklingError:
              LOG("PDFDocument.getContentInformation", INFO,
                "Ignoring non picklable document info on %s: %s (%r)" % (
                self.getRelativeUrl(), info_key, info_value))
            else:
              # pdfinfo values take precedence over PyPDF2 ones
              result.setdefault(info_key, info_value)
        except (PdfReadError, AssertionError):
          LOG("PDFDocument.getContentInformation", PROBLEM,
            "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \
            (self.getRelativeUrl(),))
        except Exception:
          # an exception of Exception class will be raised when the
          # document is encrypted.
          pass

    # Store cache as an instance of document. FIXME: we usually try to avoid this
    # pattern and cache the result of methods using content md5 as a cache key.
    self._content_information = result
    return result.copy()

393
  def _setFile(self, *args, **kw):
    """Set the file through Image._setFile, after dropping the content
    information cached on this document by getContentInformation.
    """
    try:
      delattr(self, '_content_information')
    except (AttributeError, KeyError):
      # Nothing was cached yet
      pass
    Image._setFile(self, *args, **kw)