From 6257ef4b4beb2375bd601a1dae5422c469b972c5 Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Wed, 12 May 2010 13:31:15 +0000
Subject: [PATCH] Extend the guessEncoding method for when chardet does not detect
 an acceptable encoding (it is reliable for HTML content only): fall back to
 the file command (only available on the linux2 platform) to detect the
 encoding used for text/plain.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35217 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/Document.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index 4b9300f27b..012365fc82 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -1172,7 +1172,8 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
     """
     return self._stripHTML(self._asHTML(**kw))
 
-  def _guessEncoding(self, string):
+  security.declarePrivate('_guessEncoding')
+  def _guessEncoding(self, string, mime='text/html'):
     """
       Try to guess the encoding for this string.
       Returns None if no encoding can be guessed.
@@ -1180,8 +1181,24 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
     try:
       import chardet
     except ImportError:
-      return None
-    return chardet.detect(string).get('encoding', None)
+      chardet = None
+    if chardet is not None and (mime == 'text/html'\
+                                               or os.sys.platform != 'linux2'):
+      # chardet works fine on HTML documents and is platform independent
+      return chardet.detect(string).get('encoding', None)
+    else:
+      # the file command provides better results
+      # for text/plain documents
+      # store the content into tempfile
+      file_descriptor, path = tempfile.mkstemp()
+      file_object = os.fdopen(file_descriptor, 'w')
+      file_object.write(string)
+      file_object.close()
+      # run the file command against the tempfile and read the detected encoding
+      command_result = Popen(['file', '-b', '--mime-encoding', path],
+                                                  stdout=PIPE).communicate()[0]
+      # return detected encoding
+      return command_result.strip()
 
   def _stripHTML(self, html, charset=None):
     """
-- 
2.30.9