Commit 2e8fd048 authored by Boris Kocherov

x2t: add support for metadata and all other LibreOffice formats

parent 50329148
......@@ -27,9 +27,10 @@
##############################################################################
from xml.etree import ElementTree
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile, mktemp
import sys
import os
import json
import io
from mimetypes import guess_type
from zope.interface import implements
......@@ -37,6 +38,9 @@ from cloudooo.interfaces.handler import IHandler
from cloudooo.file import File
from cloudooo.util import logger, zipTree, unzip, parseContentType
from cloudooo.handler.ooo.handler import Handler as OOoHandler
from cloudooo.handler.ooo.handler import bootstrapHandler
from zipfile import ZipFile
AVS_OFFICESTUDIO_FILE_UNKNOWN = "0"
AVS_OFFICESTUDIO_FILE_DOCUMENT_DOCX = "65"
......@@ -68,13 +72,25 @@ yformat_map = {
'ppty': 'pptx',
}
yformat_service_map = {
'docy': 'com.sun.star.text.TextDocument',
'xlsy': 'com.sun.star.sheet.SpreadsheetDocument',
'ppty': 'com.sun.star.presentation.PresentationDocument',
yformat2opendocument_map = {
'docy': 'odt',
'xlsy': 'ods',
'ppty': 'odp',
}
# OnlyOffice "Y" formats: every extension is listed next to its MIME type so
# that a membership test accepts either spelling of the format.
yformat_tuple = (
    "docy", "application/x-asc-text",
    "xlsy", "application/x-asc-spreadsheet",
    "ppty", "application/x-asc-presentation",
)

# Office Open XML formats, laid out the same way (extension, MIME type).
openxml_tuple = (
    "docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
)

# Union of both families, Y formats first.
supported_formats = yformat_tuple + openxml_tuple
class Handler(object):
"""
......@@ -97,33 +113,57 @@ class Handler(object):
self._data = data
self._source_format = source_format
self._init_kw = kw
self.file = File(base_folder_url, data, source_format)
self.environment = kw.get("env", {})
def convert(self, destination_format=None, **kw):
    """Convert the input document and return the converted data.

    A source in one of the ``yformat_tuple`` formats is first converted by
    ``_convert`` to the format ``yformat_map`` pairs it with. When the
    destination is itself a Y format, the data is brought to the paired
    format through the ooo handler (only if it is not already in it) and
    then handed to ``_convert``. Any other destination that differs from
    the current format is delegated entirely to the ooo handler. If no
    step applies, the original data is returned unchanged.

    Keyword arguments:
    destination_format -- name of the format to produce.
    kw -- accepted for interface compatibility; not used by this method.
    """
    # The format given at construction time is the single source of truth;
    # no File object is needed just to learn the source format.
    source_format = self._source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))
    data = self._data

    if source_format in yformat_tuple:
        paired_format = yformat_map[source_format]
        data = self._convert(data, source_format, paired_format)
        # From here on the data is in the paired format.
        source_format = paired_format

    if destination_format in yformat_tuple:
        paired_format = yformat_map[destination_format]
        if paired_format != source_format:
            # Reach the paired format first, then let x2t do the last hop.
            ooo_handler = OOoHandler(
                self.base_folder_url, data, source_format, **self._init_kw)
            data = ooo_handler.convert(destination_format=paired_format)
        data = self._convert(data, paired_format, destination_format)
    elif destination_format != source_format:
        ooo_handler = OOoHandler(
            self.base_folder_url, data, source_format, **self._init_kw)
        data = ooo_handler.convert(destination_format=destination_format)

    return data
def _convert(self, data, source_format, destination_format):
""" Convert the inputed file to output as format that were informed """
self.file = File(self.base_folder_url, data, source_format)
logger.debug("x2t convert: %s > %s" % (source_format, destination_format))
# init vars and xml configuration file
in_format = format_code_map[source_format]
out_format = format_code_map[destination_format]
root_dir = self.file.directory_name
input_dir = os.path.join(root_dir, "input");
output_dir = os.path.join(root_dir, "output");
input_dir = os.path.join(root_dir, "input")
output_dir = os.path.join(root_dir, "output")
final_file_name = os.path.join(root_dir, "document.%s" % destination_format)
input_file_name = self.file.getUrl()
output_file_name = final_file_name
config_file_name = os.path.join(root_dir, "config.xml")
metadata = None
output_data = None
if source_format in yformat_tuple:
if self._data.startswith("PK\x03\x04"):
if data.startswith("PK\x03\x04"):
os.mkdir(input_dir)
unzip(self.file.getUrl(), input_dir)
for _, _, files in os.walk(input_dir):
input_file_name, = files
break
input_file_name = os.path.join(input_dir, input_file_name)
input_file_name = os.path.join(input_dir, "body.txt")
metadata_file_name = os.path.join(input_dir, "metadata.json")
if os.path.isfile(metadata_file_name):
with open(metadata_file_name) as metadata_file:
metadata = json.loads(metadata_file.read())
if destination_format in yformat_tuple:
os.mkdir(output_dir)
output_file_name = os.path.join(output_dir, "body.txt")
......@@ -160,54 +200,74 @@ class Handler(object):
if p.returncode != 0:
raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (p.returncode, " ".join(["x2t", config_file.name]), stdout, stderr, " " + open(config_file.name).read().replace("\n", "\n ")))
if destination_format in yformat_tuple:
zipTree(
final_file_name,
(output_file_name, ""),
(os.path.join(os.path.dirname(output_file_name), "media"), ""),
)
self.file.reload(final_file_name)
try:
return self.file.getContent()
if source_format in yformat_tuple:
if (metadata):
output_data = OOoHandler(self.base_folder_url, self.file.getContent(), source_format, **self._init_kw)\
.setMetadata(metadata)
else:
output_data = self.file.getContent()
elif destination_format in yformat_tuple:
dir_name = os.path.dirname(output_file_name)
metadata_file_name = os.path.join(dir_name, "metadata.json")
with open(metadata_file_name, 'w') as metadata_file:
metadata = OOoHandler(self.base_folder_url, data, source_format, **self._init_kw).getMetadata()
metadata.pop('MIMEType', None)
metadata.pop('Generator', None)
metadata.pop('AppVersion', None)
metadata.pop('ImplementationName', None)
metadata_file.write(json.dumps(metadata))
zipTree(
final_file_name,
(output_file_name, ""),
(metadata_file_name, ""),
(os.path.join(dir_name, "media"), ""),
)
output_data = self.file.getContent()
finally:
self.file.trash()
return output_data
def _getContentType(self):
mimetype_type = None
if "/" not in self._source_format:
mimetype_type = guess_type('a.' + self._source_format)[0]
if mimetype_type is None:
mimetype_type = self._source_format
return mimetype_type
def getMetadata(self, base_document=False):
    """Return a dictionary with the metadata of the document.

    For a zipped Y-format document (data starting with the zip signature)
    the metadata is read from the ``metadata.json`` archive member; a
    missing member yields an empty metadata set. ``MIMEType`` is always
    filled in. Every other document is delegated to the ooo handler.

    Keyword arguments:
    base_document -- when true, the Y-format document is also converted to
    the format ``yformat2opendocument_map`` pairs it with; the converted
    data is returned under ``Data`` and ``MIMEType`` describes that format.
    """
    is_zipped_yformat = (self._source_format in yformat_tuple
                         and self._data.startswith("PK\x03\x04"))
    if not is_zipped_yformat:
        # Formats x2t does not store metadata for are handled by the ooo
        # handler, whatever the source format is.
        ooo_handler = OOoHandler(
            self.base_folder_url, self._data, self._source_format,
            **self._init_kw)
        return ooo_handler.getMetadata(base_document)

    with io.BytesIO(self._data) as memfile, ZipFile(memfile) as archive:
        try:
            raw_metadata = archive.read("metadata.json")
        except KeyError:
            # ZipFile.read raises KeyError when the member does not exist.
            raw_metadata = '{}'

    metadata = json.loads(raw_metadata)
    metadata['MIMEType'] = self._getContentType()
    if base_document:
        opendocument_format = yformat2opendocument_map[self._source_format]
        metadata['MIMEType'] = guess_type('a.' + opendocument_format)[0]
        metadata['Data'] = self.convert(opendocument_format)
    return metadata
def setMetadata(self, metadata={}):
    """Return the document with new metadata.

    For a zipped Y-format document (data starting with the zip signature)
    the archive is rebuilt with every existing member except
    ``metadata.json``, which is replaced by the JSON serialization of
    ``metadata``. Every other document is delegated to the ooo handler.

    Keyword arguments:
    metadata -- expected a dictionary with metadata. The default is never
    modified by this method.
    """
    is_zipped_yformat = (self._source_format in yformat_tuple
                         and self._data.startswith("PK\x03\x04"))
    if not is_zipped_yformat:
        ooo_handler = OOoHandler(
            self.base_folder_url, self._data, self._source_format,
            **self._init_kw)
        return ooo_handler.setMetadata(metadata)

    output = io.BytesIO()
    with ZipFile(io.BytesIO(self._data)) as source_archive:
        with ZipFile(output, "w") as target_archive:
            for member in source_archive.infolist():
                # Drop the previous metadata so that the archive never
                # holds two members with the same name.
                if member.filename == "metadata.json":
                    continue
                target_archive.writestr(
                    member, source_archive.read(member.filename))
            # writestr stores in-memory data; ZipFile.write would expect a
            # path on disk as its first argument.
            target_archive.writestr("metadata.json", json.dumps(metadata))
    # The central directory is only written when the archive is closed,
    # so the buffer is read after leaving the with blocks.
    return output.getvalue()
@staticmethod
def getAllowedConversionFormatList(source_mimetype):
......@@ -218,17 +278,26 @@ class Handler(object):
...
]
"""
getFormatList = OOoHandler.getAllowedConversionFormatList
source_mimetype = parseContentType(source_mimetype).gettype()
if source_mimetype in ("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
return [("application/x-asc-text", "OnlyOffice Text Document")]
if source_mimetype in ("docy", "application/x-asc-text"):
return [("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document")]
if source_mimetype in ("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
return [("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet")]
return getFormatList("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
return [("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet")]
if source_mimetype in ("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"):
return [("application/x-asc-presentation", "OnlyOffice Presentation")]
return getFormatList("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
if source_mimetype in ("ppty", "application/x-asc-presentation"):
return [("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation")]
return []
return getFormatList("application/vnd.openxmlformats-officedocument.presentationml.presentation")
format_list = getFormatList(source_mimetype)
format_list_append = format_list.append
for type, _ in format_list:
if type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
format_list_append(("application/x-asc-text", "OnlyOffice Text Document"))
break
if type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
format_list_append(("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet"))
break
if type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
format_list_append(("application/x-asc-presentation", "OnlyOffice Presentation"))
break
return format_list
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment