Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
erp5
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
140
Merge Requests
140
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Jobs
Commits
Open sidebar
nexedi
erp5
Commits
b905e3b1
Commit
b905e3b1
authored
Feb 07, 2024
by
Jérome Perrin
Committed by
Arnaud Fontaine
Jul 04, 2024
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
TextContent base_data bytes
parent
7bf409b9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
52 deletions
+37
-52
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
...tTemplateItem/portal_components/document.erp5.Document.py
+1
-1
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
...plateItem/portal_components/document.erp5.TextDocument.py
+36
-51
No files found.
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
View file @
b905e3b1
...
@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
...
@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
body_parser
=
re
.
compile
(
r'<body[^>]*>(.*?)</body>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
body_parser
=
re
.
compile
(
r'<body[^>]*>(.*?)</body>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
title_parser
=
re
.
compile
(
r'<title[^>]*>(.*?)</title>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
title_parser
=
re
.
compile
(
r'<title[^>]*>(.*?)</title>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
base_parser
=
re
.
compile
(
r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
base_parser
=
re
.
compile
(
r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
charset_parser
=
re
.
compile
(
r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\
-]+)
', re.IGNORECASE)
charset_parser
=
re
.
compile
(
b
r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\
-]+)
', re.IGNORECASE)
# Declarative security
# Declarative security
security = ClassSecurityInfo()
security = ClassSecurityInfo()
...
...
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
View file @
b905e3b1
...
@@ -42,7 +42,7 @@ from string import Template
...
@@ -42,7 +42,7 @@ from string import Template
from
erp5.component.mixin.CachedConvertableMixin
import
CachedConvertableMixin
from
erp5.component.mixin.CachedConvertableMixin
import
CachedConvertableMixin
from
erp5.component.mixin.BaseConvertableFileMixin
import
BaseConvertableFileMixin
from
erp5.component.mixin.BaseConvertableFileMixin
import
BaseConvertableFileMixin
from
Products.ERP5Type.mixin.text_content_history
import
TextContentHistoryMixin
from
Products.ERP5Type.mixin.text_content_history
import
TextContentHistoryMixin
from
Products.ERP5Type.Utils
import
guessEncodingFromText
from
Products.ERP5Type.Utils
import
guessEncodingFromText
,
bytes2str
from
lxml
import
html
as
etree_html
from
lxml
import
html
as
etree_html
from
lxml
import
etree
from
lxml
import
etree
...
@@ -163,10 +163,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -163,10 +163,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
if
mime_type
==
'text/html'
:
if
mime_type
==
'text/html'
:
mime_type
=
'text/x-html-safe'
mime_type
=
'text/x-html-safe'
if
src_mimetype
!=
"image/svg+xml"
:
if
src_mimetype
!=
"image/svg+xml"
:
if
six
.
PY2
:
data
=
text_content
data
=
text_content
if
not
isinstance
(
data
,
bytes
):
else
:
data
=
data
.
encode
(
'utf-8'
)
data
=
text_content
.
encode
()
result
=
portal_transforms
.
convertToData
(
mime_type
,
data
,
result
=
portal_transforms
.
convertToData
(
mime_type
,
data
,
object
=
self
,
context
=
self
,
object
=
self
,
context
=
self
,
filename
=
filename
,
filename
=
filename
,
...
@@ -186,6 +185,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -186,6 +185,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
file
=
BytesIO
(),
file
=
BytesIO
(),
filename
=
self
.
getId
(),
filename
=
self
.
getId
(),
temp_object
=
1
)
temp_object
=
1
)
if
not
isinstance
(
result
,
bytes
):
result
=
result
.
encode
(
'utf-8'
)
temp_image
.
_setData
(
result
)
temp_image
.
_setData
(
result
)
_
,
result
=
temp_image
.
convert
(
**
kw
)
_
,
result
=
temp_image
.
convert
(
**
kw
)
...
@@ -227,7 +228,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -227,7 +228,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def
setBaseData
(
self
,
value
):
def
setBaseData
(
self
,
value
):
"""Store base_data into text_content
"""Store base_data into text_content
"""
"""
self
.
_setTextContent
(
value
)
self
.
_setTextContent
(
bytes2str
(
value
)
)
security
.
declareProtected
(
Permissions
.
ModifyPortalContent
,
'_setBaseData'
)
security
.
declareProtected
(
Permissions
.
ModifyPortalContent
,
'_setBaseData'
)
_setBaseData
=
setBaseData
_setBaseData
=
setBaseData
...
@@ -253,9 +254,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -253,9 +254,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
"""
"""
self
.
_checkConversionFormatPermission
(
None
)
self
.
_checkConversionFormatPermission
(
None
)
if
default
is
_MARKER
:
if
default
is
_MARKER
:
return
self
.
getTextContent
()
text_content
=
self
.
getTextContent
()
else
:
else
:
return
self
.
getTextContent
(
default
=
default
)
text_content
=
self
.
getTextContent
(
default
=
default
)
if
six
.
PY3
and
text_content
and
text_content
is
not
default
:
text_content
=
text_content
.
encode
(
'utf-8'
)
return
text_content
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'hasBaseData'
)
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'hasBaseData'
)
def
hasBaseData
(
self
):
def
hasBaseData
(
self
):
...
@@ -290,9 +294,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -290,9 +294,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def
_convertToBaseFormat
(
self
):
def
_convertToBaseFormat
(
self
):
"""Conversion to base format for TextDocument consist
"""Conversion to base format for TextDocument consist
to convert file content into utf-8
to convert file content into utf-8.
If the data embeds charset information, this information is updated
to the new (utf-8) charset. This supports XML and HTML.
"""
"""
def
guessCharsetAndConvert
(
document
,
text_content
,
content_type
):
def
guessCharsetAndConvert
(
document
,
text_content
,
content_type
):
# type: (TextDocument, bytes, str) -> Tuple[bytes, str]
"""
"""
return encoded content_type and message if encoding
return encoded content_type and message if encoding
is not utf-8
is not utf-8
...
@@ -322,36 +329,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -322,36 +329,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return
text_content
,
message
return
text_content
,
message
content_type
=
self
.
getContentType
()
or
DEFAULT_CONTENT_TYPE
content_type
=
self
.
getContentType
()
or
DEFAULT_CONTENT_TYPE
text_content
=
self
.
getData
()
# TODO: don't we need to convert to bytes here ? what if it is PData ?
data
=
bytes
(
self
.
getData
())
if
content_type
.
endswith
(
'xml'
):
if
content_type
.
endswith
(
'xml'
):
try
:
try
:
tree
=
etree
.
fromstring
(
text_content
)
tree
=
etree
.
fromstring
(
data
)
text_content
=
etree
.
tostring
(
tree
,
encoding
=
'utf-8'
,
xml_declaration
=
True
)
base_data
=
etree
.
tostring
(
tree
,
encoding
=
'utf-8'
,
xml_declaration
=
True
)
message
=
'Conversion to base format succeeds'
message
=
'Conversion to base format succeeds'
except
etree
.
XMLSyntaxError
:
# pylint: disable=catching-non-exception
except
etree
.
XMLSyntaxError
:
# pylint: disable=catching-non-exception
message
=
'Conversion to base format without codec fails'
message
=
'Conversion to base format without codec fails'
elif
content_type
==
'text/html'
:
elif
content_type
==
'text/html'
:
re_match
=
self
.
charset_parser
.
search
(
re_match
=
self
.
charset_parser
.
search
(
data
)
# we don't really care about decoding errors for searching this
# regexp
text_content
.
decode
(
'ascii'
,
'replace'
)
if
six
.
PY3
else
text_content
)
message
=
'Conversion to base format succeeds'
message
=
'Conversion to base format succeeds'
if
re_match
is
not
None
:
if
re_match
is
not
None
:
charset
=
re_match
.
group
(
'charset'
)
base_data
=
data
charset
=
re_match
.
group
(
'charset'
).
decode
(
'ascii'
)
try
:
try
:
# Use encoding in html document
# Use encoding in html document
text_content
=
text_content
.
decode
(
charset
)
data
=
data
.
decode
(
charset
).
encode
(
'utf-8'
)
if
six
.
PY2
:
text_content
=
text_content
.
encode
(
'utf-8'
)
except
(
UnicodeDecodeError
,
LookupError
):
except
(
UnicodeDecodeError
,
LookupError
):
# Encoding read from document is wrong
# Encoding read from document is wrong
text_content
,
message
=
guessCharsetAndConvert
(
self
,
base_data
,
message
=
guessCharsetAndConvert
(
self
,
text_content
,
content_type
)
data
,
content_type
)
else
:
else
:
message
=
'Conversion to base format with charset %r succeeds'
\
message
=
'Conversion to base format with charset %r succeeds'
\
%
charset
%
charset
if
charset
.
lower
()
!=
'utf-8'
:
if
charset
.
lower
()
!=
'utf-8'
:
charset
=
'utf-8'
# Override charset if conversion succeeds
charset
=
'utf-8'
# Override charset if conversion succeeds
# change charset value in html_document as well
# change charset value in html_document as well
def
subCharset
(
matchobj
):
def
subCharset
(
matchobj
):
keyword
=
matchobj
.
group
(
'keyword'
)
keyword
=
matchobj
.
group
(
'keyword'
)
...
@@ -361,51 +364,33 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
...
@@ -361,51 +364,33 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return
matchobj
.
group
(
0
)
return
matchobj
.
group
(
0
)
elif
keyword
:
elif
keyword
:
# if keyword is present, replace charset just after
# if keyword is present, replace charset just after
return
keyword
+
'utf-8'
return
keyword
+
b
'utf-8'
text_content
=
self
.
charset_parser
.
sub
(
subCharset
,
text_content
)
base_data
=
self
.
charset_parser
.
sub
(
subCharset
,
data
)
else
:
else
:
text_content
,
message
=
guessCharsetAndConvert
(
self
,
base_data
,
message
=
guessCharsetAndConvert
(
self
,
data
,
content_type
)
text_content
,
content_type
)
else
:
else
:
# generally text/plain
# generally text/plain
try
:
try
:
# if this succeeds, there is no need to change the encoding
# if this succeeds, there is no need to change the encoding
# it's already utf-8
# it's already utf-8
text_content
.
decode
(
'utf-8'
)
data
.
decode
(
'utf-8'
)
except
(
UnicodeDecodeError
,
LookupError
):
except
(
UnicodeDecodeError
,
LookupError
):
text_content
,
message
=
guessCharsetAndConvert
(
self
,
base_data
,
message
=
guessCharsetAndConvert
(
self
,
data
,
content_type
)
text_content
,
content_type
)
else
:
else
:
message
=
'Conversion to base format succeeds'
message
=
'Conversion to base format succeeds'
# TODO(zope4py3): rethink this, shouldn't we store bytes in base data ?
self
.
_setBaseData
(
base_data
)
self
.
_setBaseData
(
text_content
)
self
.
_setBaseContentType
(
content_type
)
self
.
_setBaseContentType
(
content_type
)
return
message
return
message
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextContent'
)
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextContent'
)
def
getTextContent
(
self
,
default
=
_MARKER
,
encoding
=
None
):
def
getTextContent
(
self
,
default
=
_MARKER
):
"""Overriden method to check
"""Overridden method to check permission to access content in raw format
permission to access content in raw format and manage encoding.
"""
"""
self
.
_checkConversionFormatPermission
(
None
)
self
.
_checkConversionFormatPermission
(
None
)
if
default
is
_MARKER
:
if
default
is
_MARKER
:
text_content
=
self
.
_baseGetTextContent
()
return
self
.
_baseGetTextContent
()
else
:
else
:
text_content
=
self
.
_baseGetTextContent
(
default
)
return
self
.
_baseGetTextContent
(
default
)
if
isinstance
(
text_content
,
bytes
):
# TODO(Zope4py3): should this return str ??
# We probably have "legacy" documents where `text_content` is a python2
# str encoded as something else than utf-8.
# Maybe we should introduce a new text_content_encoding property and
# expose API to getRawTextContent (as bytes) and getTextContent would return
# the decoded string.
# XXX what about _convertToBaseFormat/guessCharsetAndConvert ???
LOG
(
'TextDocument'
,
WARNING
,
"getTextContent with bytes %s"
%
text_content
)
try
:
text_content
=
text_content
.
decode
(
'utf-8'
)
except
UnicodeDecodeError
:
text_content
=
text_content
.
decode
(
'latin1'
)
return
text_content
# Backward compatibility for replacement of text_format by content_type
# Backward compatibility for replacement of text_format by content_type
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextFormat'
)
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextFormat'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment