Revert "handler.pdf: use pyPdf in setMetada"

This reverts commit 0ff799eb, so that old setups without pyPdf can still get the latest changes by simply updating their local working copies.

Revert "handler.pdf: use pyPdf in setMetada"
This reverts commit 0ff799eb, so that old setups without pyPdf can still get the latest changes by simply updating their local working copies.
8306b381 · Julien Muchembled · a09d87af · 8306b381 · 8306b381
Commit 8306b381 authored Nov 09, 2022 by Julien Muchembled
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 31 deletions

cloudooo/handler/pdf/handler.py cloudooo/handler/pdf/handler.py +32 -30

setup.py setup.py +0 -1

No files found.
--- a/cloudooo/handler/pdf/handler.py
+++ b/cloudooo/handler/pdf/handler.py
@@ -27,7 +27,6 @@
 # See https://www.nexedi.com/licensing for rationale and options.
 #
 ##############################################################################
-import io
 from zope.interface import implements
 from cloudooo.interfaces.handler import IHandler
@@ -36,8 +35,6 @@ from cloudooo.util import logger, parseContentType
 from subprocess import Popen, PIPE
 from tempfile import mktemp
-from pyPdf import PdfFileWriter, PdfFileReader
-from pyPdf.generic import NameObject, createStringObject
 class Handler(object):
  """PDF Handler is used to handler inputed pdf document."""
@@ -52,7 +49,6 @@ class Handler(object):
  def convert(self, destination_format=None, **kw):
    """ Convert a pdf document """
-    # TODO: use pyPdf
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format, destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.document.directory_name)
@@ -72,7 +68,6 @@ class Handler(object):
    """Returns a dictionary with all metadata of document.
    along with the metadata.
    """
-    # TODO: use pyPdf and not use lower()
    command = ["pdfinfo", self.document.getUrl()]
    stdout, stderr = Popen(command,
                           stdout=PIPE,
@@ -82,10 +77,13 @@ class Handler(object):
    info_list = filter(None, stdout.split("\n"))
    metadata = {}
    for info in iter(info_list):
-      info = info.split(":")
+      if info.count(":") == 1:
-      info_name = info[0].lower()
+        info_name, info_value = info.split(":")
-      info_value = ":".join(info[1:]).strip()
+      else:
-      metadata[info_name] = info_value
+        info_name, info_value = info.split("  ")
+        info_name = info_name.replace(":", "")
+      info_value = info_value.strip()
+      metadata[info_name.lower()] = info_value
    self.document.trash()
    return metadata
@@ -94,27 +92,31 @@ class Handler(object):
    Keyword arguments:
    metadata -- expected an dictionary with metadata.
    """
-    # TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and CreationDate
+    text_template = "InfoKey: %s\nInfoValue: %s\n"
-    input_pdf = PdfFileReader(open(self.document.getUrl(), "rb"))
+    text_list = [text_template % (key.capitalize(), value) \
-    output_pdf = PdfFileWriter()
+                                 for key, value in metadata.iteritems()]
+    metadata_file = File(self.document.directory_name,
-    modification_date = metadata.pop("ModificationDate", None)
+                         "".join(text_list),
-    if modification_date:
+                         "txt")
-      metadata['ModDate'] = modification_date
+    output_url = mktemp(suffix=".pdf",
-    if type(metadata.get('Keywords', None)) is list:
+                        dir=self.document.directory_name)
-      metadata['Keywords'] = metadata['Keywords'].join(' ')
+    command = ["pdftk",
-    args = {}
+               self.document.getUrl(),
-    for key, value in list(metadata.items()):
+               "update_info",
-      args[NameObject('/' + key.capitalize())] = createStringObject(value)
+               metadata_file.getUrl(),
+               "output",
-    output_pdf._info.getObject().update(args)
+               output_url
+               ]
-    for page_num in range(input_pdf.getNumPages()):
+    stdout, stderr = Popen(command,
-      output_pdf.addPage(input_pdf.getPage(page_num))
+                           stdout=PIPE,
+                           stderr=PIPE,
-    output_stream = io.BytesIO()
+                           close_fds=True,
-    output_pdf.write(output_stream)
+                           env=self.environment).communicate()
-    return output_stream.getvalue()
+    self.document.reload(output_url)
+    try:
+      return self.document.getContent()
+    finally:
+      self.document.trash()
  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):

--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,6 @@ install_requires = [
          'zope.interface',
          'PasteDeploy',
          'PasteScript[WSGIUtils]',
-          'pyPdf',
          'psutil>=3.0.0',
          'lxml',
          'python-magic',