Extend the _guessEncoding method for when chardet does not detect

an acceptable encoding (chardet is reliable for HTML content only): fall back to the file command (only available on the linux2 platform) to detect the encoding used for text/plain. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35217 20353a03-c40f-0410-a6d1-a30d3c3de9de

Extend the _guessEncoding method for when chardet does not detect
an acceptable encoding (chardet is reliable for HTML content only): fall back to the file command (only available on the linux2 platform) to detect the encoding used for text/plain. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35217 20353a03-c40f-0410-a6d1-a30d3c3de9de
6257ef4b · Nicolas Delaby · dda491be · 6257ef4b
Commit 6257ef4b authored May 12, 2010 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 3 deletions

product/ERP5/Document/Document.py product/ERP5/Document/Document.py +20 -3

No files found.
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -1172,7 +1172,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
    """
    return self._stripHTML(self._asHTML(**kw))

-  def _guessEncoding(self, string):
+  security.declarePrivate('_guessEncoding')
+  def _guessEncoding(self, string, mime='text/html'):
    """
      Try to guess the encoding for this string.
      Returns None if no encoding can be guessed.
@@ -1180,8 +1181,24 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
    try:
      import chardet
    except ImportError:
-      return None
-    return chardet.detect(string).get('encoding', None)
+      chardet = None
+    if chardet is not None and (mime == 'text/html'\
+                                               or os.sys.platform != 'linux2'):
+      # chardet works fine on html documents and it is platform independent
+      return chardet.detect(string).get('encoding', None)
+    else:
+      # the file command provides better results
+      # for text/plain documents
+      # store the content into tempfile
+      file_descriptor, path = tempfile.mkstemp()
+      file_object = os.fdopen(file_descriptor, 'w')
+      file_object.write(string)
+      file_object.close()
+      # run the file command against the tempfile and read the detected encoding
+      command_result = Popen(['file', '-b', '--mime-encoding', path],
+                                                  stdout=PIPE).communicate()[0]
+      # return detected encoding
+      return command_result.strip()

  def _stripHTML(self, html, charset=None):
    """