
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.File import File, stripHtml
from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.CMFCore.utils import getToolByName
from zLOG import LOG

import tempfile, os, glob, zipfile, cStringIO, re


class PDFDocument(File, ConversionCacheMixin):
  """
  PdfDocument - same as file, but has its own getSearchableText method
  (converts via pdftotext)
  in effect it has two separate caches - from CachingMixin for txt and html
  and for image formats from Image
  """
  # CMF Type Definition
  meta_type = 'ERP5 PDF'
  portal_type = 'PDF'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
66
                    , PropertySheet.TextDocument
67 68 69
                    , PropertySheet.Data
                    )

Bartek Górny's avatar
Bartek Górny committed
70

71
  def index_html(self, REQUEST, RESPONSE, format=None, force=0):
72
    """
73 74 75
      Returns data in the appropriate format (graphical)
      it is always a zip because multi-page pdfs are converted into a zip
      file of many images
76
    """
77 78 79
    if format is None:
      RESPONSE.setHeader('Content-Type', 'application/pdf')
      return self._unpackData(self.data)
80 81 82 83 84 85 86
    if format == 'html':
      RESPONSE.setHeader('Content-Type', 'text/html;charset=UTF-8')
      return self.getHtmlRepresentation(force)
    if format == 'txt':
      RESPONSE.setHeader('Content-Type', 'text/plain;charset=UTF-8')
      self._convertToText(force)
      return self.getTextContent()
87 88
    mime = 'image/'+format.lower()
    if force or not self.hasConversion(format = format):
89
      self.setConversion(self._makeFile(format), 'application/zip', format=format)
90
    RESPONSE.setHeader('Content-Type', 'application/zip')
91 92 93
    return self.getConversion(format = format)

  def _makeFile(self,format):
Bartek Górny's avatar
Bartek Górny committed
94 95
    tempfile.tempdir = os.path.join(os.getenv('INSTANCE_HOME'), 'tmp')
    os.putenv('TMPDIR', '/tmp') # because if we run zope as root, we have /root/tmp here and convert goes crazy
96
    if not os.path.exists(tempfile.tempdir):
Bartek Górny's avatar
Bartek Górny committed
97 98 99 100
      os.mkdir(tempfile.tempdir, 0775)
    fr = tempfile.mktemp(suffix='.pdf')
    to = tempfile.mktemp(suffix = '.' + format)
    file_fr = open(fr, 'w')
101 102
    file_fr.write(self._unpackData(self.data))
    file_fr.close()
Bartek Górny's avatar
Bartek Górny committed
103
    cmd = 'convert %s %s' % (fr, to)
104 105
    os.system(cmd)
    # pack it
Bartek Górny's avatar
Bartek Górny committed
106 107 108 109 110
    f = cStringIO.StringIO()
    z = zipfile.ZipFile(f, 'a')
    for fname in glob.glob(to.replace('.', '*')):
      base = os.path.basename(fname)
      pg = re.match('.*(\d+)\.'+format, base).groups()
111
      if pg:
Bartek Górny's avatar
Bartek Górny committed
112 113
        pg = pg[0]
        arcname = '%s/page-%s.%s' % (format, pg, format)
114
      else:
Bartek Górny's avatar
Bartek Górny committed
115 116
        arcname = base
      z.write(fname, arcname)
117 118 119 120 121 122 123 124 125 126
    z.close()
    f.seek(0)
    return f.read()

  searchable_property_list = File.searchable_property_list + ('text_content',)

  ### Content indexing methods
  security.declareProtected(Permissions.View, 'getSearchableText')
  def getSearchableText(self, md=None, force=0):
    """
127 128
      Used by the catalog for basic full text indexing
      conditionally convert pdf to text
129
    """
130 131 132 133 134 135 136 137 138 139 140 141 142 143
    self._convertToText(force)
    return File.getSearchableText(self, md)

  security.declarePrivate('_convertToText')
  def _convertToText(self, force):
    """
      Private implementation method.
      If we don't have txt cache or we are forced to convert, we try to do it
      using system pdftotext utility. We set the result as text_content property.
      We mark it in cache as done, even if we fail, so we don't keep trying if it
      doesn't work.
    """
    portal_workflow = getToolByName(self, 'portal_workflow')
    if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt')):
144
      # XXX-JPS accessing attribute data is bad
Bartek Górny's avatar
Bartek Górny committed
145
      self.log('PdfDocument', 'regenerating txt')
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
      try:
        try:
          tmp = tempfile.NamedTemporaryFile()
          tmp.write(self._unpackData(self.data))
          tmp.seek(0)
          cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
          r = os.popen(cmd)
          self.setTextContent(r.read().replace('\n', ' '))
          tmp.close()
          r.close()
        except Exception, e:
          self.log(str(e))
          msg = 'Conversion to text failed: ' + str(e)
        else:
          msg = 'Converted to text'
      finally:
        portal_workflow.doActionFor(self, 'process', comment=msg)
        # we don't need to store it twice, just mark we have it (or rather we already tried)
        # we try only once
        self.setConversion('empty', format = 'txt') 
166 167 168 169 170 171 172

  SearchableText=getSearchableText

  security.declareProtected(Permissions.View, 'getHtmlRepresentation')
  def getHtmlRepresentation(self, force=0):
    '''
    get simplified html version to display
173 174
    If we fail to convert, we set workflow message and put error message
    as html preview so that the user knows what's going on
175
    '''
176
    portal_workflow = getToolByName(self, 'portal_workflow')
Bartek Górny's avatar
Bartek Górny committed
177
    if not hasattr(self, 'data'):
178 179
      return 'no data'
    if force==1 or not self.hasConversion(format = 'html'):
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
      try:
        self.log('PDF', 'regenerating html')
        tmp = tempfile.NamedTemporaryFile()
        tmp.write(self._unpackData(self.data))
        tmp.seek(0)
        cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
        r = os.popen(cmd)
        h = r.read()
        tmp.close()
        r.close()
        h = stripHtml(h)
      except Exception, e:
        msg = 'Could not convert to html: ' + str(e)
        h = msg
        portal_workflow.doActionFor(self, 'process', comment=msg)
195 196 197 198 199
      self.setConversion(h, format = 'html')
    return self.getConversion(format = 'html')[1]

# vim: syntax=python shiftwidth=2