Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
cloudooo
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
7
Merge Requests
7
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Jobs
Commits
Open sidebar
nexedi
cloudooo
Commits
2e8fd048
Commit
2e8fd048
authored
Feb 14, 2017
by
Boris Kocherov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x2t: add support metadata and all another libreoffice formats
parent
50329148
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
127 additions
and
58 deletions
+127
-58
cloudooo/handler/x2t/handler.py
cloudooo/handler/x2t/handler.py
+127
-58
No files found.
cloudooo/handler/x2t/handler.py
View file @
2e8fd048
...
...
@@ -27,9 +27,10 @@
##############################################################################
from
xml.etree
import
ElementTree
from
subprocess
import
Popen
,
PIPE
from
tempfile
import
NamedTemporaryFile
,
mktemp
import
sys
import
os
import
json
import
io
from
mimetypes
import
guess_type
from
zope.interface
import
implements
...
...
@@ -37,6 +38,9 @@ from cloudooo.interfaces.handler import IHandler
from
cloudooo.file
import
File
from
cloudooo.util
import
logger
,
zipTree
,
unzip
,
parseContentType
from
cloudooo.handler.ooo.handler
import
Handler
as
OOoHandler
from
cloudooo.handler.ooo.handler
import
bootstrapHandler
from
zipfile
import
ZipFile
AVS_OFFICESTUDIO_FILE_UNKNOWN
=
"0"
AVS_OFFICESTUDIO_FILE_DOCUMENT_DOCX
=
"65"
...
...
@@ -68,13 +72,25 @@ yformat_map = {
'ppty'
:
'pptx'
,
}
yformat
_service
_map
=
{
'docy'
:
'
com.sun.star.text.TextDocumen
t'
,
'xlsy'
:
'
com.sun.star.sheet.SpreadsheetDocument
'
,
'ppty'
:
'
com.sun.star.presentation.PresentationDocument
'
,
yformat
2opendocument
_map
=
{
'docy'
:
'
od
t'
,
'xlsy'
:
'
ods
'
,
'ppty'
:
'
odp
'
,
}
yformat_tuple
=
(
"docy"
,
"xlsy"
,
"ppty"
)
yformat_tuple
=
(
"docy"
,
"application/x-asc-text"
,
"xlsy"
,
"application/x-asc-spreadsheet"
,
"ppty"
,
"application/x-asc-presentation"
,
)
openxml_tuple
=
(
"docx"
,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
,
"xlsx"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
"pptx"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
)
supported_formats
=
yformat_tuple
+
openxml_tuple
class
Handler
(
object
):
"""
...
...
@@ -97,33 +113,57 @@ class Handler(object):
self
.
_data
=
data
self
.
_source_format
=
source_format
self
.
_init_kw
=
kw
self
.
file
=
File
(
base_folder_url
,
data
,
source_format
)
self
.
environment
=
kw
.
get
(
"env"
,
{})
def
convert
(
self
,
destination_format
=
None
,
**
kw
):
""" Convert the inputed file to output as format that were informed """
source_format
=
self
.
file
.
source_format
source_format
=
self
.
_source_format
logger
.
debug
(
"x2t convert: %s > %s"
%
(
source_format
,
destination_format
))
data
=
self
.
_data
if
source_format
in
yformat_tuple
:
supported_format
=
yformat_map
[
source_format
]
data
=
self
.
_convert
(
data
,
source_format
,
supported_format
)
source_format
=
supported_format
if
destination_format
in
yformat_tuple
:
supported_format
=
yformat_map
[
destination_format
]
if
supported_format
!=
source_format
:
data
=
OOoHandler
(
self
.
base_folder_url
,
data
,
source_format
,
**
self
.
_init_kw
)
\
.
convert
(
destination_format
=
supported_format
)
data
=
self
.
_convert
(
data
,
supported_format
,
destination_format
)
elif
destination_format
!=
source_format
:
data
=
OOoHandler
(
self
.
base_folder_url
,
data
,
source_format
,
**
self
.
_init_kw
)
\
.
convert
(
destination_format
=
destination_format
)
return
data
def
_convert
(
self
,
data
,
source_format
,
destination_format
):
""" Convert the inputed file to output as format that were informed """
self
.
file
=
File
(
self
.
base_folder_url
,
data
,
source_format
)
logger
.
debug
(
"x2t convert: %s > %s"
%
(
source_format
,
destination_format
))
# init vars and xml configuration file
in_format
=
format_code_map
[
source_format
]
out_format
=
format_code_map
[
destination_format
]
root_dir
=
self
.
file
.
directory_name
input_dir
=
os
.
path
.
join
(
root_dir
,
"input"
)
;
output_dir
=
os
.
path
.
join
(
root_dir
,
"output"
)
;
input_dir
=
os
.
path
.
join
(
root_dir
,
"input"
)
output_dir
=
os
.
path
.
join
(
root_dir
,
"output"
)
final_file_name
=
os
.
path
.
join
(
root_dir
,
"document.%s"
%
destination_format
)
input_file_name
=
self
.
file
.
getUrl
()
output_file_name
=
final_file_name
config_file_name
=
os
.
path
.
join
(
root_dir
,
"config.xml"
)
metadata
=
None
output_data
=
None
if
source_format
in
yformat_tuple
:
if
self
.
_
data
.
startswith
(
"PK
\
x03
\
x04
"
):
if
data
.
startswith
(
"PK
\
x03
\
x04
"
):
os
.
mkdir
(
input_dir
)
unzip
(
self
.
file
.
getUrl
(),
input_dir
)
for
_
,
_
,
files
in
os
.
walk
(
input_dir
):
input_file_name
,
=
files
break
input_file_name
=
os
.
path
.
join
(
input_dir
,
input_file_name
)
input_file_name
=
os
.
path
.
join
(
input_dir
,
"body.txt"
)
metadata_file_name
=
os
.
path
.
join
(
input_dir
,
"metadata.json"
)
if
os
.
path
.
isfile
(
metadata_file_name
):
with
open
(
metadata_file_name
)
as
metadata_file
:
metadata
=
json
.
loads
(
metadata_file
.
read
())
if
destination_format
in
yformat_tuple
:
os
.
mkdir
(
output_dir
)
output_file_name
=
os
.
path
.
join
(
output_dir
,
"body.txt"
)
...
...
@@ -160,54 +200,74 @@ class Handler(object):
if
p
.
returncode
!=
0
:
raise
RuntimeError
(
"x2t: exit code %d != 0
\
n
+ %s
\
n
> stdout: %s
\
n
> stderr: %s@ x2t xml:
\
n
%s"
%
(
p
.
returncode
,
" "
.
join
([
"x2t"
,
config_file
.
name
]),
stdout
,
stderr
,
" "
+
open
(
config_file
.
name
).
read
().
replace
(
"
\
n
"
,
"
\
n
"
)))
if
destination_format
in
yformat_tuple
:
zipTree
(
final_file_name
,
(
output_file_name
,
""
),
(
os
.
path
.
join
(
os
.
path
.
dirname
(
output_file_name
),
"media"
),
""
),
)
self
.
file
.
reload
(
final_file_name
)
try
:
return
self
.
file
.
getContent
()
if
source_format
in
yformat_tuple
:
if
(
metadata
):
output_data
=
OOoHandler
(
self
.
base_folder_url
,
self
.
file
.
getContent
(),
source_format
,
**
self
.
_init_kw
)
\
.
setMetadata
(
metadata
)
else
:
output_data
=
self
.
file
.
getContent
()
elif
destination_format
in
yformat_tuple
:
dir_name
=
os
.
path
.
dirname
(
output_file_name
)
metadata_file_name
=
os
.
path
.
join
(
dir_name
,
"metadata.json"
)
with
open
(
metadata_file_name
,
'w'
)
as
metadata_file
:
metadata
=
OOoHandler
(
self
.
base_folder_url
,
data
,
source_format
,
**
self
.
_init_kw
).
getMetadata
()
metadata
.
pop
(
'MIMEType'
,
None
)
metadata
.
pop
(
'Generator'
,
None
)
metadata
.
pop
(
'AppVersion'
,
None
)
metadata
.
pop
(
'ImplementationName'
,
None
)
metadata_file
.
write
(
json
.
dumps
(
metadata
))
zipTree
(
final_file_name
,
(
output_file_name
,
""
),
(
metadata_file_name
,
""
),
(
os
.
path
.
join
(
dir_name
,
"media"
),
""
),
)
output_data
=
self
.
file
.
getContent
()
finally
:
self
.
file
.
trash
()
return
output_data
def
_getContentType
(
self
):
mimetype_type
=
None
if
"/"
not
in
self
.
_source_format
:
mimetype_type
=
guess_type
(
'a.'
+
self
.
_source_format
)[
0
]
if
mimetype_type
is
None
:
mimetype_type
=
self
.
_source_format
return
mimetype_type
def
getMetadata
(
self
,
base_document
=
False
):
r"""Returns a dictionary with all metadata of document.
/!\
No
t Implemented: no format are handled correctly.
"""
# XXX Cloudooo takes the first handler that can "handle" source_mimetype.
# However, docx documents metadata can only be "handled" by the ooo handler.
# Handlers should provide a way to tell if such capability is available for the required source mimetype.
# We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
# And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
if
self
.
_source_format
in
(
"docx"
,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
,
"xlsx"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
"pptx"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
):
if
self
.
_source_format
in
yformat_tuple
and
self
.
_data
.
startswith
(
"PK
\
x03
\
x04
"
):
with
io
.
BytesIO
(
self
.
_data
)
as
memfile
,
ZipFile
(
memfile
)
as
zipfile
:
try
:
metadata
=
zipfile
.
read
(
"metadata.json"
)
except
KeyError
:
metadata
=
'{}'
metadata
=
json
.
loads
(
metadata
)
metadata
[
'MIMEType'
]
=
self
.
_getContentType
()
if
base_document
:
opendocument_format
=
yformat2opendocument_map
[
self
.
_source_format
]
metadata
[
'MIMEType'
]
=
guess_type
(
'a.'
+
opendocument_format
)[
0
]
metadata
[
'Data'
]
=
self
.
convert
(
opendocument_format
)
return
metadata
else
:
return
OOoHandler
(
self
.
base_folder_url
,
self
.
_data
,
self
.
_source_format
,
**
self
.
_init_kw
).
getMetadata
(
base_document
)
return
{}
def
setMetadata
(
self
,
metadata
=
{}):
r"""Returns document with new metadata.
/!\
No
t Implemented: no format are handled correctly.
Keyword arguments:
metadata -- expected an dictionary with metadata.
"""
# XXX Cloudooo takes the first handler that can "handle" source_mimetype.
# However, docx documents metadata can only be "handled" by the ooo handler.
# Handlers should provide a way to tell if such capability is available for the required source mimetype.
# We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
# And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
if
self
.
_source_format
in
(
"docx"
,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
,
"xlsx"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
"pptx"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
):
if
self
.
_source_format
in
yformat_tuple
and
self
.
_data
.
startswith
(
"PK
\
x03
\
x04
"
):
with
io
.
BytesIO
(
self
.
_data
)
as
memfile
,
ZipFile
(
memfile
)
as
zipfile
:
zipfile
.
write
(
"metadata.json"
,
json
.
dumps
(
metadata
))
return
memfile
.
getvalue
()
else
:
return
OOoHandler
(
self
.
base_folder_url
,
self
.
_data
,
self
.
_source_format
,
**
self
.
_init_kw
).
setMetadata
(
metadata
)
return
self
.
file
.
getContent
()
@
staticmethod
def
getAllowedConversionFormatList
(
source_mimetype
):
...
...
@@ -218,17 +278,26 @@ class Handler(object):
...
]
"""
getFormatList
=
OOoHandler
.
getAllowedConversionFormatList
source_mimetype
=
parseContentType
(
source_mimetype
).
gettype
()
if
source_mimetype
in
(
"docx"
,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
return
[(
"application/x-asc-text"
,
"OnlyOffice Text Document"
)]
if
source_mimetype
in
(
"docy"
,
"application/x-asc-text"
):
return
[(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
,
"Word 2007 Document"
)]
if
source_mimetype
in
(
"xlsx"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
return
[(
"application/x-asc-spreadsheet"
,
"OnlyOffice Spreadsheet"
)]
return
getFormatList
(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
if
source_mimetype
in
(
"xlsy"
,
"application/x-asc-spreadsheet"
):
return
[(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
"Excel 2007 Spreadsheet"
)]
if
source_mimetype
in
(
"pptx"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
return
[(
"application/x-asc-presentation"
,
"OnlyOffice Presentation"
)]
return
getFormatList
(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
if
source_mimetype
in
(
"ppty"
,
"application/x-asc-presentation"
):
return
[(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
"PowerPoint 2007 Presentation"
)]
return
[]
return
getFormatList
(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
format_list
=
getFormatList
(
source_mimetype
)
format_list_append
=
format_list
.
append
for
type
,
_
in
format_list
:
if
type
==
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
:
format_list_append
((
"application/x-asc-text"
,
"OnlyOffice Text Document"
))
break
if
type
==
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
:
format_list_append
((
"application/x-asc-spreadsheet"
,
"OnlyOffice Spreadsheet"
))
break
if
type
==
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
:
format_list_append
((
"application/x-asc-presentation"
,
"OnlyOffice Presentation"
))
break
return
format_list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment