Commit 52358c8a authored by Jérome Perrin's avatar Jérome Perrin

PDF: do not fail when getting content information for documents created using Apple tools

Documents created with some Apple tools have a custom info entry, 'AAPL:Keywords',
for which pypdf returns a non-picklable instance.
parent 78ef5ad9
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
# #
############################################################################## ##############################################################################
import tempfile, os import tempfile, os, pickle
import zope.interface import zope.interface
from AccessControl import ClassSecurityInfo from AccessControl import ClassSecurityInfo
...@@ -36,7 +36,7 @@ from Products.ERP5.Document.Image import Image ...@@ -36,7 +36,7 @@ from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionError,\ from Products.ERP5.Document.Document import ConversionError,\
VALID_TEXT_FORMAT_LIST VALID_TEXT_FORMAT_LIST
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from zLOG import LOG from zLOG import LOG, INFO, PROBLEM
import errno import errno
from StringIO import StringIO from StringIO import StringIO
...@@ -312,14 +312,25 @@ class PDFDocument(Image): ...@@ -312,14 +312,25 @@ class PDFDocument(Image):
info_key = info_key.lstrip("/") info_key = info_key.lstrip("/")
if isinstance(info_value, unicode): if isinstance(info_value, unicode):
info_value = info_value.encode("utf-8") info_value = info_value.encode("utf-8")
result.setdefault(info_key, info_value)
# Ignore values that cannot be pickled ( such as AAPL:Keywords )
try:
pickle.dumps(info_value)
except pickle.PicklingError, err:
LOG("PDFDocument.getContentInformation", INFO,
"Ignoring non picklable document info on %s: %s (%r)" % (
self.getRelativeUrl(), info_key, info_value))
else:
result.setdefault(info_key, info_value)
except PdfReadError: except PdfReadError:
LOG("PDFDocument.getContentInformation", 0, LOG("PDFDocument.getContentInformation", PROBLEM,
"pyPdf is Unable to read PDF, probably corrupted PDF here : %s" % \ "pyPdf is Unable to read PDF, probably corrupted PDF here : %s" % \
(self.getRelativeUrl(),)) (self.getRelativeUrl(),))
finally: finally:
tmp.close() tmp.close()
# Store cache as an instance of document. FIXME: we usually try to avoid this
# pattern and cache the result of methods using content md5 as a cache key.
self._content_information = result self._content_information = result
return result.copy() return result.copy()
......
...@@ -1329,6 +1329,21 @@ class TestDocument(TestDocumentMixin): ...@@ -1329,6 +1329,21 @@ class TestDocument(TestDocumentMixin):
# empty PDF have no content information # empty PDF have no content information
self.assertEquals(dict(), content_information) self.assertEquals(dict(), content_information)
def test_apple_PDF_metadata(self):
  """Content information of an Apple-produced PDF must be picklable.

  PDFs created with Apple software carry a special 'AAPL:Keywords' info
  tag; when pypdf extracts the document information, that tag comes back
  as an IndirectObject instance, which cannot be pickled.
  """
  from pickle import dumps
  apple_pdf = self.portal.document_module.newContent(
    portal_type='PDF',
    file=makeFileUpload('apple_metadata.pdf'))
  # The extracted information has to survive pickling ...
  dumps(apple_pdf.getContentInformation())
  # ... so that the document can be saved in ZODB.
  self.commit()
  self.tic()
def test_PDF_content_content_type(self): def test_PDF_content_content_type(self):
upload_file = makeFileUpload('REF-en-001.pdf') upload_file = makeFileUpload('REF-en-001.pdf')
document = self.portal.document_module.newContent(portal_type='PDF') document = self.portal.document_module.newContent(portal_type='PDF')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment