TextContent base_data bytes

0b935752 · Jérome Perrin · Arnaud Fontaine · bab56ebd · 0b935752 · 0b935752
Commit 0b935752 authored Feb 07, 2024 by Jérome Perrin Committed by Arnaud Fontaine Jul 05, 2024
2 changed files
--- a/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
+++ b/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
  body_parser = re.compile(r'<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
  title_parser = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
  base_parser = re.compile(r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
-  charset_parser = re.compile(r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
+  charset_parser = re.compile(br'(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
  # Declarative security
  security = ClassSecurityInfo()

--- a/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
+++ b/product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
@@ -42,7 +42,7 @@ from string import Template
 from erp5.component.mixin.CachedConvertableMixin import CachedConvertableMixin
 from erp5.component.mixin.BaseConvertableFileMixin import BaseConvertableFileMixin
 from Products.ERP5Type.mixin.text_content_history import TextContentHistoryMixin
-from Products.ERP5Type.Utils import guessEncodingFromText
+from Products.ERP5Type.Utils import guessEncodingFromText, bytes2str
 from lxml import html as etree_html
 from lxml import etree
@@ -163,10 +163,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
        if mime_type == 'text/html':
          mime_type = 'text/x-html-safe'
        if src_mimetype != "image/svg+xml":
-          if six.PY2:
          data = text_content
-          else:
+          if not isinstance(data, bytes):
-            data = text_content.encode()
+            data = data.encode('utf-8')
          result = portal_transforms.convertToData(mime_type, data,
                                                   object=self, context=self,
                                                   filename=filename,
@@ -186,6 +185,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
                                       file=BytesIO(),
                                       filename=self.getId(),
                                       temp_object=1)
+          if not isinstance(result, bytes):
+            result = result.encode('utf-8')
          temp_image._setData(result)
          _, result = temp_image.convert(**kw)
@@ -227,7 +228,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
  def setBaseData(self, value):
    """Store base_data into text_content
    """
-    self._setTextContent(value)
+    self._setTextContent(bytes2str(value))
  security.declareProtected(Permissions.ModifyPortalContent, '_setBaseData')
  _setBaseData = setBaseData
@@ -253,9 +254,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
    """
    self._checkConversionFormatPermission(None)
    if default is _MARKER:
-      return self.getTextContent()
+      text_content = self.getTextContent()
    else:
-      return self.getTextContent(default=default)
+      text_content = self.getTextContent(default=default)
+    if six.PY3 and text_content and text_content is not default:
+      text_content = text_content.encode('utf-8')
+    return text_content
  security.declareProtected(Permissions.AccessContentsInformation, 'hasBaseData')
  def hasBaseData(self):
@@ -290,9 +294,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
  def _convertToBaseFormat(self):
    """Conversion to base format for TextDocument consist
-    to convert file content into utf-8
+    to convert file content into utf-8.
+    If the data embeds charset information, this information is updated
+    to the new (utf-8) charset. This supports XML and HTML.
    """
    def guessCharsetAndConvert(document, text_content, content_type):
+      # type: (TextDocument, bytes, str) -> Tuple[bytes, str]
      """
      return encoded content_type and message if encoding
      is not utf-8
@@ -322,31 +329,27 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
      return text_content, message
    content_type = self.getContentType() or DEFAULT_CONTENT_TYPE
-    text_content = self.getData() # TODO: don't we need to convert to bytes here ? what if it is PData ?
+    data = bytes(self.getData())
    if content_type.endswith('xml'):
      try:
-        tree = etree.fromstring(text_content)
+        tree = etree.fromstring(data)
-        text_content = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
+        base_data = etree.tostring(tree, encoding='utf-8', xml_declaration=True)
        message = 'Conversion to base format succeeds'
      except etree.XMLSyntaxError: # pylint: disable=catching-non-exception
        message = 'Conversion to base format without codec fails'
    elif content_type == 'text/html':
-      re_match = self.charset_parser.search(
+      re_match = self.charset_parser.search(data)
-        # we don't really care about decoding errors for searching this
-        # regexp
-        text_content.decode('ascii', 'replace') if six.PY3 else text_content)
      message = 'Conversion to base format succeeds'
      if re_match is not None:
-        charset = re_match.group('charset')
+        base_data = data
+        charset = re_match.group('charset').decode('ascii')
        try:
          # Use encoding in html document
-          text_content = text_content.decode(charset)
+          data = data.decode(charset).encode('utf-8')
-          if six.PY2:
-            text_content = text_content.encode('utf-8')
        except (UnicodeDecodeError, LookupError):
          # Encoding read from document is wrong
-          text_content, message = guessCharsetAndConvert(self,
+          base_data, message = guessCharsetAndConvert(self,
-                                                text_content, content_type)
+                                                data, content_type)
        else:
          message = 'Conversion to base format with charset %r succeeds'\
                                                                  % charset
@@ -361,51 +364,33 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
                return matchobj.group(0)
              elif keyword:
                # if keyword is present, replace charset just after
-                return keyword + 'utf-8'
+                return keyword + b'utf-8'
-            text_content = self.charset_parser.sub(subCharset, text_content)
+            base_data = self.charset_parser.sub(subCharset, data)
      else:
-        text_content, message = guessCharsetAndConvert(self,
+        base_data, message = guessCharsetAndConvert(self, data, content_type)
-                                                  text_content, content_type)
    else:
      # generally text/plain
      try:
        # if decoding succeeds, there is no need to change the encoding:
        # it's already utf-8
-        text_content.decode('utf-8')
+        data.decode('utf-8')
      except (UnicodeDecodeError, LookupError):
-        text_content, message = guessCharsetAndConvert(self,
+        base_data, message = guessCharsetAndConvert(self, data, content_type)
-                                                  text_content, content_type)
      else:
        message = 'Conversion to base format succeeds'
-    # TODO(zope4py3): rethink this, shouldn't we store bytes in base data ?
+    self._setBaseData(base_data)
-    self._setBaseData(text_content)
    self._setBaseContentType(content_type)
    return message
  security.declareProtected(Permissions.AccessContentsInformation, 'getTextContent')
-  def getTextContent(self, default=_MARKER, encoding=None):
+  def getTextContent(self, default=_MARKER):
-    """Overriden method to check
+    """Overridden method to check permission to access content in raw format
-    permission to access content in raw format and manage encoding.
    """
    self._checkConversionFormatPermission(None)
    if default is _MARKER:
-      text_content = self._baseGetTextContent()
+      return self._baseGetTextContent()
    else:
-      text_content = self._baseGetTextContent(default)
+      return self._baseGetTextContent(default)
-    if isinstance(text_content, bytes):
-      # TODO(Zope4py3): should this return str ??
-      # We probably have "legacy" documents where `text_content` is a python2
-      # str encoded as something else than utf-8.
-      # Maybe we should introduce a new text_content_encoding property and
-      # expose API to getRawTextContent (as bytes) and getTextContent would return
-      # the decoded string.
-      # XXX what about _convertToBaseFormat/guessCharsetAndConvert ???
-      LOG('TextDocument', WARNING, "getTextContent with bytes %s" % text_content)
-      try:
-        text_content = text_content.decode('utf-8')
-      except UnicodeDecodeError:
-        text_content = text_content.decode('latin1')
-    return text_content
  # Backward compatibility for replacement of text_format by content_type
  security.declareProtected(Permissions.AccessContentsInformation, 'getTextFormat')