From de2545fc06c034da5e4bd1d53aa812aa5ba83a03 Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Wed, 1 Dec 2010 14:53:24 +0000
Subject: [PATCH] Refactoring of DMS. - file_name becomes filename - filename
 values are not stored in source_reference. Contribution Tool will not honour
 id arguments. Contribution Tool can create any kind of document. Portal
 Contribution Registry can read extension, content_type and read content_type
 from data to guess what will be the best Portal Type to use.

All discoverable methods (IDiscoverable) can change the portal_type of a document
  (migratePortalType).
Users can change the portal_type of a document through the UI with a simple Action.
Crawling will not hardcode ids of documents depending on their URLs, thanks to
Portal Url Registry.





git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40971 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/Document.py             | 240 +-------
 product/ERP5/Document/PDFDocument.py          |  31 +-
 product/ERP5/Document/TextDocument.py         |  19 +-
 product/ERP5/Tool/ContributionRegistryTool.py |  18 +-
 product/ERP5/Tool/ContributionTool.py         | 542 ++++++++----------
 product/ERP5/__init__.py                      |   3 +-
 .../drawing_extension.xml                     |   8 +-
 .../image_extension.xml                       |   8 +-
 .../pdf_extension.xml                         |   8 +-
 .../pdf_mimetype.xml                          |   8 +-
 .../presentation_extension.xml                |   8 +-
 .../spreadsheet_by_content.xml                |  71 +++
 .../spreadsheet_extension.xml                 |   8 +-
 .../text_by_conent_type.xml                   |  67 +++
 .../text_by_content.xml                       |  67 +++
 .../text_extension.xml                        |   6 +-
 .../web_page_by_content.xml                   |  67 +++
 .../webpage_extension.xml                     |   8 +-
 .../webpage_mimetype.xml                      |  14 +-
 .../default_site_preference.xml               |  32 ++
 .../portal_skins/erp5_core/Base_download.xml  |   6 +-
 .../my_criterion_property_list.xml            |  12 +-
 .../listbox.xml                               |  38 ++
 product/ERP5/bootstrap/erp5_core/bt/revision  |   2 +-
 .../bootstrap/erp5_core/bt/template_path_list |   4 +
 product/ERP5/interfaces/discoverable.py       | 105 ++++
 product/ERP5/interfaces/document.py           |   2 +-
 product/ERP5/interfaces/downloadable.py       |   4 +-
 product/ERP5/interfaces/url.py                |  78 +++
 product/ERP5/interfaces/url_registry_tool.py  |  95 +++
 product/ERP5/mixin/cached_convertable.py      |  13 +-
 product/ERP5/mixin/crawlable.py               |  87 ++-
 product/ERP5/mixin/discoverable.py            | 254 ++++++++
 product/ERP5/mixin/downloadable.py            |  23 +-
 product/ERP5/tests/testBase.py                |  38 ++
 product/ERP5/tests/testCRM.py                 |   8 +-
 .../tests/testContributionRegistryTool.py     |  30 +-
 product/ERP5/tests/testERP5WebWithDms.py      |  36 +-
 product/ERP5/tests/testWebCrawler.py          | 298 ++++++++++
 39 files changed, 1744 insertions(+), 622 deletions(-)
 create mode 100644 product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_by_content.xml
 create mode 100644 product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_conent_type.xml
 create mode 100644 product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_content.xml
 create mode 100644 product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/web_page_by_content.xml
 create mode 100644 product/ERP5/interfaces/discoverable.py
 create mode 100644 product/ERP5/interfaces/url.py
 create mode 100644 product/ERP5/interfaces/url_registry_tool.py
 create mode 100644 product/ERP5/mixin/discoverable.py
 create mode 100644 product/ERP5/tests/testWebCrawler.py

diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index a6729d5336..5ca8cbd389 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -40,18 +40,15 @@ from Products.ERP5Type import Permissions, PropertySheet, interfaces
 from Products.ERP5Type.XMLObject import XMLObject
 from Products.ERP5Type.DateUtils import convertDateToHour,\
                                 number_of_hours_in_day, number_of_hours_in_year
-from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request
+from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request,\
+                                    deprecated
 from Products.ERP5Type.TransactionalVariable import getTransactionalVariable
 from Products.ERP5Type.Cache import getReadOnlyTransactionCache
-from Products.ERP5.Document.Url import UrlMixIn
 from Products.ERP5.Tool.ContributionTool import MAX_REPEAT
-from Products.ERP5Type.UnrestrictedMethod import unrestricted_apply
 from Products.ZSQLCatalog.SQLCatalog import SQLQuery
 from AccessControl import Unauthorized
 import zope.interface
 from Products.PythonScripts.Utility import allow_class
-import tempfile
-from subprocess import Popen, PIPE
 
 # Mixin Import
 from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
@@ -60,9 +57,10 @@ from Products.ERP5.mixin.downloadable import DownloadableMixin
 from Products.ERP5.mixin.document import DocumentMixin
 from Products.ERP5.mixin.extensible_traversable import DocumentExtensibleTraversableMixin
 from Products.ERP5.mixin.crawlable import CrawlableMixin
+from Products.ERP5.mixin.discoverable import DiscoverableMixin
+from Products.ERP5.mixin.url import UrlMixin
 
 _MARKER = []
-VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
 
 # these property ids are unchangable
 FIXED_PROPERTY_IDS = ('id', 'uid', 'rid', 'sid')
@@ -88,8 +86,9 @@ class DocumentProxyError(Exception):pass
 class NotConvertedError(Exception):pass
 allow_class(NotConvertedError)
 
-class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedConvertableMixin,
-               CrawlableMixin, TextConvertableMixin, DownloadableMixin, DocumentMixin):
+class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
+               CachedConvertableMixin, CrawlableMixin, TextConvertableMixin,
+               DownloadableMixin, DocumentMixin, DiscoverableMixin):
   """Document is an abstract class with all methods related to document
   management in ERP5. This includes searchable text, explicit relations,
   implicit relations, metadata, versions, languages, etc.
@@ -144,7 +143,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
 
   input      -   data supplied with http request or set on the object during (2) (e.g.
                  discovered from email text)
-  file_name  -   data which might be encoded in file name
+  filename   -   data which might be encoded in filename
   user_login -   information about user who is contributing the file
   content    -   data which might be derived from document content
 
@@ -156,7 +155,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
   Methods for discovering metadata are:
 
     getPropertyDictFromInput
-    getPropertyDictFromFileName
+    getPropertyDictFromFilename
     getPropertyDictFromUserLogin
     getPropertyDictFromContent
 
@@ -266,10 +265,15 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
                             interfaces.IVersionable,
                             interfaces.IDownloadable,
                             interfaces.ICrawlable,
-                            interfaces.IDocument
+                            interfaces.IDocument,
+                            interfaces.IDiscoverable,
+                            interfaces.IUrl,
                            )
 
   # Regular expressions
+  # XXX these regexes are weak: fast but not reliable.
+  # For example, this is a valid URL that the regexes are not able to parse:
+  # http://www.example.com//I don't care i put what/ i want/
   href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
   body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
   title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
@@ -639,141 +643,14 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
     if not reference:
       return
     catalog = self.getPortalObject().portal_catalog
-    res = catalog(reference=self.getReference(), sort_on=(('creation_date','ascending'),))
+    result_list = catalog.unrestrictedSearchResults(
+                                      reference=self.getReference(),
+                                      sort_on=(('creation_date', 
+                                                'ascending'),))
     # XXX this should be security-unaware - delegate to script with proxy roles
-    return res[0].getLanguage() # XXX what happens if it is empty?
-
-  ### Property getters
-  # Property Getters are document dependent so that we can
-  # handle the weird cases in which needed properties change with the type of document
-  # and the usual cases in which accessing content changes with the meta type
-  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromUserLogin')
-  def getPropertyDictFromUserLogin(self, user_login=None):
-    """
-      Based on the user_login, find out as many properties as needed.
-      returns properties which should be set on the document
-    """
-    if user_login is None:
-      user_login = str(getSecurityManager().getUser())
-    method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
-        fallback_script_id='Document_getPropertyDictFromUserLogin')
-    return method(user_login)
-
-  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromContent')
-  def getPropertyDictFromContent(self):
-    """
-      Based on the document content, find out as many properties as needed.
-      returns properties which should be set on the document
-    """
-    # accesss data through convert
-    mime, content = self.convert(None)
-    if not content:
-       # if document is empty, we will not find anything in its content
-      return {}
-    method = self._getTypeBasedMethod('getPropertyDictFromContent',
-        fallback_script_id='Document_getPropertyDictFromContent')
-    return method()
-
-  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
-  def getPropertyDictFromFileName(self, file_name):
-    """
-      Based on the file name, find out as many properties as needed.
-      returns properties which should be set on the document
-    """
-    return self.portal_contributions.getPropertyDictFromFileName(file_name)
-
-  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromInput')
-  def getPropertyDictFromInput(self):
-    """
-      Get properties which were supplied explicitly to the ingestion method
-      (discovered or supplied before the document was created).
-
-      The implementation consists in saving document properties
-      into _backup_input by supposing that original input parameters were
-      set on the document by ContributionTool.newContent as soon
-      as the document was created.
-    """
-    kw = getattr(self, '_backup_input', {})
-    if kw:
-      return kw
-    for id in self.propertyIds():
-      # We should not consider file data
-      if id not in ('data', 'categories_list', 'uid', 'id',
-                    'text_content', 'base_data',) \
-            and self.hasProperty(id):
-        kw[id] = self.getProperty(id)
-    self._backup_input = kw # We could use volatile and pass kw in activate
-                            # if we are garanteed that _backup_input does not
-                            # disappear within a given transaction
-    return kw
-
-  ### Metadata disovery and ingestion methods
-  security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
-  def discoverMetadata(self, file_name=None, user_login=None):
-    """
-      This is the main metadata discovery function - controls the process
-      of discovering data from various sources. The discovery itself is
-      delegated to scripts or uses preference-configurable regexps. The
-      method returns either self or the document which has been
-      merged in the discovery process.
-
-      file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
-
-      user_login - this is a login string of a person; can be None if the user is
-                   currently logged in, then we'll get him from session
-    """
-    # Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
-    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
-        fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
-    order_list = list(method())
-    order_list.reverse()
-    # build a dictionary according to the order
-    kw = {}
-    for order_id in order_list:
-      result = None
-      if order_id not in VALID_ORDER_KEY_LIST:
-        # Prevent security attack or bad preferences
-        raise AttributeError, "%s is not in valid order key list" % order_id
-      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
-      method = getattr(self, method_id)
-      if order_id == 'file_name':
-        if file_name is not None:
-          result = method(file_name)
-      elif order_id == 'user_login':
-        if user_login is not None:
-          result = method(user_login)
-      else:
-        result = method()
-      if result is not None:
-        for key, value in result.iteritems():
-          if value not in (None, ''):
-            kw[key]=value
-
-    if file_name is not None:
-      # filename is often undefined....
-      kw['source_reference'] = file_name
-    # Prepare the content edit parameters - portal_type should not be changed
-    kw.pop('portal_type', None)
-    # Try not to invoke an automatic transition here
-    self._edit(**kw)
-    # Finish ingestion by calling method
-    self.finishIngestion() # XXX - is this really the right place ?
-    self.reindexObject() # XXX - is this really the right place ?
-    # Revision merge is tightly coupled
-    # to metadata discovery - refer to the documentation of mergeRevision method
-    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
-    merged_doc.reindexObject() # XXX - is this really the right place ?
-    return merged_doc # XXX - is this really the right place ?
-
-  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
-  def finishIngestion(self):
-    """
-      Finish the ingestion process by calling the appropriate script. This
-      script can for example allocate a reference number automatically if
-      no reference was defined.
-    """
-    method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
-    return method()
+    if result_list:
+      return result_list[0].getLanguage()
+    return
 
   security.declareProtected(Permissions.View, 'asSubjectText')
   def asSubjectText(self, **kw):
@@ -827,32 +704,13 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
     return self._stripHTML(self._asHTML(**kw))
 
   security.declarePrivate('_guessEncoding')
+  @deprecated
   def _guessEncoding(self, string, mime='text/html'):
     """
-      Try to guess the encoding for this string.
-      Returns None if no encoding can be guessed.
+      Deprecated method
     """
-    try:
-      import chardet
-    except ImportError:
-      chardet = None
-    if chardet is not None and (mime == 'text/html'\
-                                               or os.sys.platform != 'linux2'):
-      # chardet works fine on html document and its platform independent
-      return chardet.detect(string).get('encoding', None)
-    else:
-      # file command provide better result
-      # for text/plain documents
-      # store the content into tempfile
-      file_descriptor, path = tempfile.mkstemp()
-      file_object = os.fdopen(file_descriptor, 'w')
-      file_object.write(string)
-      file_object.close()
-      # run file command against tempfile to and read encoded
-      command_result = Popen(['file', '-b', '--mime-encoding', path],
-                                                  stdout=PIPE).communicate()[0]
-      # return detected encoding
-      return command_result.strip()
+    contribution_tool = self.getPortalObject().portal_contributions
+    return contribution_tool.guessEncodingFromText(string, content_type=mime)
 
   def _stripHTML(self, html, charset=None):
     """
@@ -866,22 +724,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
       stripped_html = html
     return stripped_html
 
-  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
-  def getContentInformation(self):
-    """
-    Returns the content information from the HTML conversion.
-    The default implementation tries to build a dictionnary
-    from the HTML conversion of the document and extract
-    the document title.
-    """
-    result = {}
-    html = self.asEntireHTML()
-    if not html: return result
-    title_list = re.findall(self.title_parser, str(html))
-    if title_list:
-      result['title'] = title_list[0]
-    return result
-
   security.declareProtected(Permissions.AccessContentsInformation,
                             'getMetadataMappingDict')
   def getMetadataMappingDict(self):
@@ -918,21 +760,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
       method = None
     if method is not None: method()
 
-  # Crawling API
-  security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
-  def getContentURLList(self):
-    """
-      Returns a list of URLs referenced by the content of this document.
-      Default implementation consists in analysing the document
-      converted to HTML. Subclasses may overload this method
-      if necessary. However, it is better to extend the conversion
-      methods in order to produce valid HTML, which is useful to
-      many people, rather than overload this method which is only
-      useful for crawling.
-    """
-    html_content = self.asStrippedHTML()
-    return re.findall(self.href_parser, str(html_content))
-
   security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
   def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0):
     """
@@ -963,18 +790,3 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
     if hasattr(aq_base(container), 'isIndexContent'):
       return container.isIndexContent(self)
     return False
-
-  security.declareProtected(Permissions.AccessContentsInformation, 'getContentBaseURL')
-  def getContentBaseURL(self):
-    """
-      Returns the content base URL based on the actual content or
-      on its URL.
-    """
-    base_url = self.asURL()
-    base_url_list = base_url.split('/')
-    if len(base_url_list):
-      if base_url_list[-1] and base_url_list[-1].find('.') > 0:
-        # Cut the trailing part in http://www.some.site/at/trailing.html
-        # but not in http://www.some.site/at
-        base_url = '/'.join(base_url_list[:-1])
-    return base_url
diff --git a/product/ERP5/Document/PDFDocument.py b/product/ERP5/Document/PDFDocument.py
index 8095d0a115..3a3c0d11dc 100644
--- a/product/ERP5/Document/PDFDocument.py
+++ b/product/ERP5/Document/PDFDocument.py
@@ -114,22 +114,14 @@ class PDFDocument(Image):
     """
     if not self.hasData():
       return ''
-    tmp = tempfile.NamedTemporaryFile()
-    tmp.write(self.getData())
-    tmp.seek(0)
-    try:
-      command = ['pdftotext', '-layout', '-enc', 'UTF-8',
-                 '-nopgbrk', tmp.name, '-']
-      try:
-        command_result = Popen(command, stdout=PIPE).communicate()[0]
-      except OSError, e:
-        if e.errno == errno.ENOENT:
-          raise ConversionError('pdftotext was not found')
-        raise
-    finally:
-      tmp.close()
-    if command_result:
-      return command_result
+    mime_type = 'text/plain'
+    portal_transforms = self.getPortalObject().portal_transforms
+    filename = self.getStandardFilename(format='txt')
+    result = portal_transforms.convertToData(mime_type, str(self.getData()),
+                                             context=self, filename=filename,
+                                             mimetype=self.getContentType())
+    if result:
+      return result
     else:
       # Try to use OCR
       # As high dpi images are required, it may take some times to convert the
@@ -145,13 +137,12 @@ class PDFDocument(Image):
             frame=page_number, display='identical')
         if not src_mimetype.endswith('png'):
           continue
-        content = '%s' % png_data
-        mime_type = 'text/plain'
+        content = str(png_data)
         if content is not None:
-          portal_transforms = getToolByName(self, 'portal_transforms')
+          filename = self.getStandardFilename(format='png')
           result = portal_transforms.convertToData(mime_type, content,
                                                    context=self,
-                                                   filename=self.getTitleOrId(),
+                                                   filename=filename,
                                                    mimetype=src_mimetype)
           if result is None:
             raise ConversionError('PDFDocument conversion error. '
diff --git a/product/ERP5/Document/TextDocument.py b/product/ERP5/Document/TextDocument.py
index beaad2f04f..2877c0c5ef 100644
--- a/product/ERP5/Document/TextDocument.py
+++ b/product/ERP5/Document/TextDocument.py
@@ -45,6 +45,9 @@ try:
   from string import Template
 except ImportError:
   from Products.ERP5Type.patches.string import Template
+from Products.ERP5Type.Utils import guessEncodingFromText
+
+from lxml import html as etree_html
 
 class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
                                                             TextContent, File):
@@ -147,7 +150,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
         kw['format'] = format
         if not self.hasConversion(**kw):
           portal_transforms = getToolByName(portal, 'portal_transforms')
-          filename = self.getSourceReference(self.getTitleOrId())
+          filename = self.getStandardFilename(format=format)
           if mime_type == 'text/html':
             mime_type = 'text/x-html-safe'
           result = portal_transforms.convertToData(mime_type, text_content,
@@ -183,9 +186,13 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
       """
       if self.hasTextContent():
         html = self._asHTML()
-        base_list = re.findall(self.base_parser, str(html))
-        if base_list:
-          return base_list[0]
+        # a document can be entirely stripped by safe_html
+        # so its html conversion can be empty
+        if html.strip():
+          html_tree = etree_html.fromstring(html)
+          base_list = [href for href in html_tree.xpath('//base/@href') if href]
+          if base_list:
+            return str(base_list[0])
       return Document.getContentBaseURL(self)
 
     security.declareProtected(Permissions.ModifyPortalContent, 'setBaseData')
@@ -270,14 +277,14 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
         return encoded content_type and message if encoding
         is not utf-8
         """
-        codec = document._guessEncoding(text_content, content_type)
+        codec = guessEncodingFromText(text_content, content_type)
         if codec is not None:
           try:
             text_content = text_content.decode(codec).encode('utf-8')
           except (UnicodeDecodeError, LookupError):
             message = 'Conversion to base format with codec %r fails' % codec
             # try again with another guesser based on file command
-            codec = document._guessEncoding(text_content, 'text/plain')
+            codec = guessEncodingFromText(text_content, 'text/plain')
             if codec is not None:
               try:
                 text_content = text_content.decode(codec).encode('utf-8')
diff --git a/product/ERP5/Tool/ContributionRegistryTool.py b/product/ERP5/Tool/ContributionRegistryTool.py
index 6fa611a3c6..b65ba4909c 100644
--- a/product/ERP5/Tool/ContributionRegistryTool.py
+++ b/product/ERP5/Tool/ContributionRegistryTool.py
@@ -29,7 +29,7 @@
 from AccessControl import ClassSecurityInfo
 from Products.ERP5Type.Globals import InitializeClass
 from Products.ERP5Type.Tool.BaseTool import BaseTool
-
+from Products.ERP5Type import Permissions
 
 class ContributionRegistryTool(BaseTool):
 
@@ -41,14 +41,18 @@ class ContributionRegistryTool(BaseTool):
 
   security = ClassSecurityInfo()
 
-  security.declarePrivate('findPortalTypeName')
-  def findPortalTypeName(self, file_name='', mime_type=None, data=None):
-    from Products.ERP5Type.Document import newTempIngestionFile
-    ingestion_file = newTempIngestionFile(self, 'id')
-    ingestion_file._edit(file_name=file_name, mime_type=mime_type, data=data)
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'findPortalTypeName')
+  def findPortalTypeName(self, context=None, **kw):
+    # if a context is passed, ignore other arguments
+    if context is None:
+      # Build a temp object edited with provided parameters
+      from Products.ERP5Type.Document import newTempFile
+      context = newTempFile(self, 'id')
+      context.edit(**kw)
 
     for predicate in self.objectValues(sort_on='int_index'):
-      result = predicate.test(ingestion_file)
+      result = predicate.test(context)
       if result:
         return result
 
diff --git a/product/ERP5/Tool/ContributionTool.py b/product/ERP5/Tool/ContributionTool.py
index 62876fdbc9..c179d00344 100644
--- a/product/ERP5/Tool/ContributionTool.py
+++ b/product/ERP5/Tool/ContributionTool.py
@@ -29,12 +29,7 @@
 
 import cStringIO
 import re
-import string
 import socket
-try:
-  from hashlib import md5 as md5_new
-except ImportError:
-  from md5 import new as md5_new
 import urllib2, urllib
 import urlparse
 from cgi import parse_header
@@ -46,13 +41,11 @@ from Products.CMFCore.utils import getToolByName, _checkPermission
 from Products.ERP5Type.Tool.BaseTool import BaseTool
 from Products.ERP5Type import Permissions
 from Products.ERP5 import _dtmldir
-from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list
+from Products.ERP5.Document.Url import no_crawl_protocol_list
 from AccessControl import Unauthorized
 
-from zLOG import LOG
 from DateTime import DateTime
-from Acquisition import aq_base
-from zExceptions import BadRequest
+import warnings
 
 # Install openers
 import ContributionOpener
@@ -83,7 +76,7 @@ class ContributionTool(BaseTool):
 
     Configuration Scripts:
 
-      - ContributionTool_getPropertyDictFromFileName: receives file name and a 
+      - ContributionTool_getPropertyDictFromFilename: receives file name and a 
         dict derived from filename by regular expression, and does any necesary
         operations (e.g. mapping document type id onto a real portal_type).
 
@@ -98,8 +91,7 @@ class ContributionTool(BaseTool):
   meta_type = 'ERP5 Contribution Tool'
   portal_type = 'Contribution Tool'
 
-  # Regular expressions
-  simple_normaliser = re.compile('#.*')
+  
 
   # Declarative Security
   security = ClassSecurityInfo()
@@ -108,153 +100,141 @@ class ContributionTool(BaseTool):
   manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )
 
   security.declareProtected(Permissions.AddPortalContent, 'newContent')
-  def newContent(self, id=None, portal_type=None, url=None, container=None,
-                       container_path=None,
-                       discover_metadata=1, temp_object=0,
-                       user_login=None, data=None, file_name=None, **kw):
+  def newContent(self, **kw):
     """
       The newContent method is overriden to implement smart content
       creation by detecting the portal type based on whatever information
       was provided and finding out the most appropriate module to store
       the content.
 
-      user_login is the name under which the content will be created
-      XXX - this is a security hole which needs to be fixed by
-      making sure only Manager can use this parameter
-
-      container -- if specified, it is possible to define
-      where to contribute the content. Else, ContributionTool
-      tries to guess.
-
-      container_path -- if specified, defines the container path
-      and has precedence over container
-
-      url -- if specified, content is download from the URL.
-
-      NOTE:
-        We always generate ID. So, we must prevent using the one
-        which we were provided.
+      explicit named parameters were:
+        id - ignored argument
+        portal_type - explicit portal_type parameter, must be honoured
+        url - Identifier of external resource. Content will be downloaded
+              from it
+        container - if specified, it is possible to define
+                    where to contribute the content. Else, ContributionTool
+                    tries to guess.
+        container_path - if specified, defines the container path
+                         and has precedence over container
+        discover_metadata - Enable metadata extraction and discovery
+                            (default True)
+        temp_object - build tempObject or not (default False)
+        user_login - is the name under which the content will be created
+                     XXX - this is a security hole which needs to be fixed by
+                     making sure only Manager can use this parameter
+        data - Binary representation of content
+        filename - explicit filename of content
     """
-    if file_name is not None:
-      kw['file_name'] = file_name
-    if data is not None:
-      # This is only used to make sure
-      # we can pass file as parameter to ZPublisher
-      # whenever we ingest email
-      kw['data'] = data
+    kw.pop('id', None) # Never use hardcoded ids any longer
+
+    # Useful for metadata discovery, keep it as it has been provided
+    input_parameter_dict = kw.copy()
+    # But file and data are exceptions.
+    # They are potentially too big to be kept in memory.
+    # We want to keep only one reference to those values,
+    # on the future created document only!
+    if 'file' in input_parameter_dict:
+      del input_parameter_dict['file']
+    if 'data' in input_parameter_dict:
+      del input_parameter_dict['data']
+    # pop: remove keys which are not document properties
+    url = kw.pop('url', None)
+    container = kw.pop('container', None)
+    container_path = kw.pop('container_path', None)
+    discover_metadata = kw.pop('discover_metadata', True)
+    user_login = kw.pop('user_login', None)
+    # check file_name argument for backward compatibility.
+    if 'file_name' in kw:
+      if 'filename' not in kw:
+        kw['filename'] = kw['file_name']
+      del(kw['file_name'])
+    filename = kw.get('filename', None)
+    portal_type = kw.get('portal_type')
+    temp_object = kw.get('temp_object', False)
 
     document = None
-
-    # Try to find the file_name
+    portal = self.getPortalObject()
+    # Try to find the filename
     content_type = None
     if not url:
       # check if file was provided
-      file = kw.get('file', None)
-      if file is not None and file_name is None:
-        file_name = file.filename
+      file_object = kw.get('file')
+      if file_object is not None:
+        if not filename:
+          filename = file_object.filename
       else:
         # some channels supply data and file-name separately
         # this is the case for example for email ingestion
         # in this case, we build a file wrapper for it
-        data = kw.get('data', None)
-        if data is not None:
-          file_name = kw.get('file_name', None)
-          if file_name is not None:
-            file = cStringIO.StringIO()
-            file.write(data)
-            file.seek(0)
-            kw['file'] = file
-            del kw['data']
-            del kw['file_name']
+        data = kw.get('data')
+        if data is not None and filename:
+          file_object = cStringIO.StringIO()
+          file_object.write(data)
+          file_object.seek(0)
+          kw['file'] = file_object
+          del kw['data']
+        else:
+          raise TypeError, 'data and filename must be provided'
     else:
-      # build a new file from the url
-      url_file = urllib2.urlopen(url)
-      data = url_file.read() # time out must be set or ... too long XXX
-      file = cStringIO.StringIO()
-      file.write(data)
-      file.seek(0)
-      # if a content-disposition header is present,
-      # try first to read the suggested filename from it.
-      header_info = url_file.info()
-      content_disposition = header_info.getheader('content-disposition', '')
-      file_name = parse_header(content_disposition)[1].get('filename')
-      if not file_name:
-        # Now read the filename from url.
-        # In case of http redirection, the real url must be read
-        # from file object returned by urllib2.urlopen.
-        # It can happens when the header 'Location' is present in request.
-        # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
-        url = url_file.geturl()
-        # Create a file name based on the URL and quote it
-        file_name = urlparse.urlsplit(url)[-3]
-        file_name = os.path.basename(file_name)
-        file_name = urllib.quote(file_name, safe='')
-        file_name = file_name.replace('%', '')
-      # For URLs, we want an id by default equal to the encoded URL
-      if id is None:
-        id = self.encodeURL(url)
-      content_type = header_info.gettype()
+      file_object, filename, content_type = self._openURL(url)
       if content_type:
         kw['content_type'] = content_type
-      kw['file'] = file
+      kw['file'] = file_object
 
     # If the portal_type was provided, we can go faster
     if portal_type and container is None:
       # We know the portal_type, let us find the default module
       # and use it as container
       try:
-        container = self.getDefaultModule(portal_type)
+        container = portal.getDefaultModule(portal_type)
       except ValueError:
         container = None
 
-    if portal_type and container is not None:
-      # We could simplify things here and return a document immediately
-      # NOTE: we use the module ID generator rather than the provided ID
-      #document = module.newContent(portal_type=portal_type, **kw)
-      #if discover_metadata:
-      #  document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
-      #return document
-      pass # XXX - This needs to be implemented once the rest is stable
-
     # From here, there is no hope unless a file was provided
-    if file is None:
-      raise ValueError, "could not determine portal type"
+    if file_object is None:
+      raise ValueError, "No data provided"
 
+
+    if portal_type is None:
+      # Guess it with help of portal_contribution_registry
+      registry = getToolByName(portal, 'portal_contribution_registry')
+      portal_type = registry.findPortalTypeName(filename=filename,
+                                                content_type=content_type)
     #
     # Check if same file is already exists. if it exists, then update it.
     #
-    if portal_type is None:
-      portal_type = self._guessPortalType(file_name, content_type, data)
-      property_dict = self.getMatchedFileNamePatternDict(file_name)
-      reference = property_dict.get('reference', None)
-      version  = property_dict.get('version', None)
-      language  = property_dict.get('language', None)
-      if portal_type and reference and version and language:
-        portal_catalog = getToolByName(self, 'portal_catalog')
-        document = portal_catalog.getResultValue(portal_type=portal_type,
-                                                  reference=reference,
-                                                  version=version,
-                                                  language=language)
-        if document is not None:
-          # document is already uploaded. So overrides file.
-          if not _checkPermission(Permissions.ModifyPortalContent, document):
-            raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
-          document.edit(file=kw['file'])
-          return document
+    property_dict = self.getMatchedFilenamePatternDict(filename)
+    reference = property_dict.get('reference', None)
+    version  = property_dict.get('version', None)
+    language  = property_dict.get('language', None)
+    if portal_type and reference and version and language:
+      portal_catalog = getToolByName(portal, 'portal_catalog')
+      document = portal_catalog.getResultValue(portal_type=portal_type,
+                                                reference=reference,
+                                                version=version,
+                                                language=language)
 
+      if document is not None:
+        # document is already uploaded. So overrides file.
+        if not _checkPermission(Permissions.ModifyPortalContent, document):
+          raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
+        document.edit(file=kw['file'])
+        return document
     # Temp objects use the standard newContent from Folder
     if temp_object:
       # For temp_object creation, use the standard method
-      return BaseTool.newContent(self, id=id, portal_type=portal_type,
-                                 temp_object=temp_object, **kw)
+      kw['portal_type'] = portal_type
+      return BaseTool.newContent(self, **kw)
 
     # Then put the file inside ourselves for a short while
     if container_path is not None:
       container = self.getPortalObject().restrictedTraverse(container_path)
-    document = self._setObject(file_name, None, portal_type=portal_type,
-                               user_login=user_login, id=id,
-                               container=container,
+    document = self._setObject(filename, None, portal_type=portal_type,
+                               user_login=user_login, container=container,
                                discover_metadata=discover_metadata,
+                               filename=filename,
+                               input_parameter_dict=input_parameter_dict
                                )
     object_id = document.getId()
     document = self._getOb(object_id) # Call _getOb to purge cache
@@ -264,18 +244,12 @@ class ContributionTool(BaseTool):
       if modified_kw is not None:
         kw.update(modified_kw)
 
+    kw['filename'] = filename # Override filename property
     # Then edit the document contents (so that upload can happen)
     document._edit(**kw)
-    # if no content_type has been set, guess it
-    if 'content_type' not in kw and getattr(document, 'guessMimeType', None) is not None:
-      # For File force to setup the mime_type
-      document.guessMimeType(fname=file_name)
     if url:
       document.fromURL(url)
 
-    # Notify workflows
-    #document.notifyWorkflowCreated()
-
     # Allow reindexing, reindex it and return the document
     try:
       delattr(document, 'isIndexable')
@@ -293,17 +267,19 @@ class ContributionTool(BaseTool):
     """
     pass
 
-  security.declareProtected(Permissions.ModifyPortalContent,'getMatchedFileNamePatternDict')
-  def getMatchedFileNamePatternDict(self, file_name):
+  security.declareProtected(Permissions.ModifyPortalContent,
+                            'getMatchedFilenamePatternDict')
+  def getMatchedFilenamePatternDict(self, filename):
     """
       Get matched group dict of file name parsing regular expression.
     """
     property_dict = {}
 
-    if file_name is None:
+    if filename is None:
       return property_dict
 
-    regex_text = self.portal_preferences.getPreferredDocumentFileNameRegularExpression()
+    regex_text = self.portal_preferences.\
+                                getPreferredDocumentFilenameRegularExpression()
     if regex_text in ('', None):
       return property_dict
 
@@ -311,42 +287,55 @@ class ContributionTool(BaseTool):
       pattern = re.compile(regex_text)
       if pattern is not None:
         try:
-          property_dict = pattern.match(file_name).groupdict()
+          property_dict = pattern.match(filename).groupdict()
         except AttributeError: # no match
           pass
     return property_dict
 
-  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
-  def getPropertyDictFromFileName(self, file_name):
+  # backward compatibility
+  security.declareProtected(Permissions.ModifyPortalContent,
+                            'getMatchedFileNamePatternDict')
+  def getMatchedFileNamePatternDict(self, filename):
+    """
+    (deprecated) use getMatchedFilenamePatternDict() instead.
+    """
+    warnings.warn('getMatchedFileNamePatternDict() is deprecated. '
+                  'use getMatchedFilenamePatternDict() instead.')
+    return self.getMatchedFilenamePatternDict(filename)
+
+  security.declareProtected(Permissions.ModifyPortalContent,
+                            'getPropertyDictFromFilename')
+  def getPropertyDictFromFilename(self, filename):
     """
       Gets properties from filename. File name is parsed with a regular expression
       set in preferences. The regexp should contain named groups.
     """
-    if file_name is None:
+    if filename is None:
       return {}
-    property_dict = self.getMatchedFileNamePatternDict(file_name)
-    method = self._getTypeBasedMethod('getPropertyDictFromFileName',
-        fallback_script_id = 'ContributionTool_getPropertyDictFromFileName')
-    property_dict = method(file_name, property_dict)
-    if property_dict.get('portal_type', None) is not None:
-      # we have to return portal_type as a tuple
-      # because we should allow for having multiple candidate types
-      property_dict['portal_type'] = (property_dict['portal_type'],)
-    else:
-      # we have to find candidates by file extenstion
-      basename, extension = os.path.splitext(file_name)
-      if extension:
-        extension = extension.lstrip('.') # remove first dot
-        property_dict['portal_type'] =\
-               self.ContributionTool_getCandidateTypeListByExtension(extension)
+    property_dict = self.getMatchedFilenamePatternDict(filename)
+    method = self._getTypeBasedMethod('getPropertyDictFromFilename',
+             fallback_script_id='ContributionTool_getPropertyDictFromFilename')
+    property_dict = method(filename, property_dict)
     return property_dict
 
+  # backward compatibility
+  security.declareProtected(Permissions.ModifyPortalContent,
+                            'getPropertyDictFromFileName')
+  def getPropertyDictFromFileName(self, filename):
+    """
+    (deprecated) use getPropertyDictFromFilename() instead.
+    """
+    warnings.warn('getPropertyDictFromFileName() is deprecated. '
+                  'use getPropertyDictFromFilename() instead.')
+    return self.getPropertyDictFromFilename(filename)
+
   # WebDAV virtual folder support
-  def _setObject(self, name, ob, portal_type=None, user_login=None,
-                 container=None, id=None, discover_metadata=1):
+  def _setObject(self, id, ob, portal_type=None, user_login=None,
+                 container=None, discover_metadata=True, filename=None,
+                 input_parameter_dict=None):
     """
       portal_contribution_registry will find appropriate portal type
-      name by file_name and content itself.
+      name by filename and content itself.
 
       The ContributionTool instance must be configured in such
       way that _verifyObjectPaste will return TRUE.
@@ -362,9 +351,8 @@ class ContributionTool(BaseTool):
       # redefine parameters
       portal_type = ob.getPortalType()
       container = ob.getParentValue()
-      id = ob.getId()
     if not portal_type:
-      document = BaseTool.newContent(self, id=name,
+      document = BaseTool.newContent(self, id=id,
                                      portal_type=portal_type,
                                      is_indexable=0)
     else:
@@ -379,33 +367,27 @@ class ContributionTool(BaseTool):
         module = self.getDefaultModule(portal_type)
       else:
         module = container
-      if id is None:
-        new_id = module.generateNewId()
-      else:
-        new_id = id
-      existing_document = module._getOb(new_id, None)
-      if existing_document is None:
-        # There is no preexisting document - we can therefore
-        # set the new object
-        document = module.newContent(id=new_id,
-                                     portal_type=portal_type,
-                                     is_indexable=0)
-        # We can now discover metadata
-        if discover_metadata:
-          # Metadata disovery is done as an activity by default
-          # If we need to discoverMetadata synchronously, it must
-          # be for user interface and should thus be handled by
-          # ZODB scripts
-          document.activate(after_path_and_method_id=(document.getPath(),
-            ('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
-          .discoverMetadata(file_name=name, user_login=user_login)
-      else:
-        document = existing_document
+      # There is no preexisting document - we can therefore
+      # set the new object
+      document = module.newContent(portal_type=portal_type, is_indexable=0)
+      # We can now discover metadata
+      if discover_metadata:
+        # Metadata discovery is done as an activity by default
+        # If we need to discoverMetadata synchronously, it must
+        # be for user interface and should thus be handled by
+        # ZODB scripts
+        document.activate(after_path_and_method_id=(document.getPath(),
+          ('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
+        .discoverMetadata(filename=filename,
+                          user_login=user_login,
+                          input_parameter_dict=input_parameter_dict)
       # Keep the document close to us - this is only useful for
       # file upload from webdav
-      if not hasattr(self, '_v_document_cache'):
+      volatile_cache = getattr(self, '_v_document_cache', None)
+      if volatile_cache is None:
         self._v_document_cache = {}
-      self._v_document_cache[document.getId()] = document.getRelativeUrl()
+        volatile_cache = self._v_document_cache
+      volatile_cache[document.getId()] = document.getRelativeUrl()
 
     # Return document to newContent method
     return document
@@ -417,10 +399,11 @@ class ContributionTool(BaseTool):
     """
     # Use the document cache if possible and return result immediately
     # this is only useful for webdav
-    if hasattr(self, '_v_document_cache'):
-      document_url = self._v_document_cache.get(id, None)
+    volatile_cache = getattr(self, '_v_document_cache', None)
+    if volatile_cache is not None:
+      document_url = volatile_cache.get(id)
       if document_url is not None:
-        del self._v_document_cache[id]
+        del volatile_cache[id]
         return self.getPortalObject().unrestrictedTraverse(document_url)
 
     # Try first to return the real object inside
@@ -475,66 +458,11 @@ class ContributionTool(BaseTool):
     def wrapper(o_list):
       for o in o_list:
         o = o.getObject()
-        id = '%s-%s' % (o.getUid(), o.getStandardFileName(),)
+        id = '%s-%s' % (o.getUid(), o.getStandardFilename(),)
         yield o.asContext(id=id)
 
     return wrapper(object_list)
 
-  # Crawling methods
-  security.declareProtected(Permissions.View, 'normaliseURL')
-  def normaliseURL(self, url, base_url=None):
-    """
-      Returns a normalised version of the url so
-      that we do not download twice the same content.
-      URL normalisation is an important part in crawlers.
-      The current implementation is obviously simplistic.
-      Refer to http://en.wikipedia.org/wiki/Web_crawler
-      and study Harvestman for more ideas.
-    """
-    url = self.simple_normaliser.sub('', url)
-    url_split = url.split(':')
-    url_protocol = url_split[0]
-    if url_protocol in no_host_protocol_list:
-      return url
-    if base_url and len(url_split) == 1:
-      # Make relative URL absolute
-      url = '%s/%s' % (base_url, url)
-    return url
-
-  security.declareProtected(Permissions.View, 'encodeURL')
-  def encodeURL(self, url):
-    """
-    Returns the URL as an ID. ID should be chosen in such
-    way that it is optimal with HBTreeFolder (ie. so that
-    distribution of access time on a cluster is possible)
-
-    NOTE: alternate approach is based on a url table
-    and catalog lookup. It is faster ? Not sure. Since
-    we must anyway insert objects in btrees and this
-    is simimar in cost to accessing them.
-    """
-    # Produce an MD5 from the URL
-    hex_md5 = md5_new(url).hexdigest()
-    # Take the first part in the URL which is not empty
-    # LOG("encodeURL", 0, url)
-    url_segment = url.split(':')[1]
-    url_segment_list = url_segment.split('/')
-    url_domain = None
-    for url_part in url_segment_list:
-      if url_part:
-        url_domain = url_part
-        break
-    # Return encoded url
-    if url_domain:
-      url_domain = urllib.quote(url_domain, safe='')
-      url_domain = url_domain.replace('%', '')
-      return "%s-%s" % (url_domain, hex_md5)
-    return hex_md5
-    url = urllib.quote(url, safe='')
-    url = url.replace('_', '__')
-    url = url.replace('%', '_')
-    return url
-
   security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
   def crawlContent(self, content, container=None):
     """
@@ -543,6 +471,8 @@ class ContributionTool(BaseTool):
       XXX: missing is the conversion of content local href to something
       valid.
     """
+    portal = self.getPortalObject()
+    url_registry_tool = portal.portal_url_registry
     depth = content.getCrawlingDepth()
     if depth < 0:
       # Do nothing if crawling depth is reached
@@ -554,32 +484,34 @@ class ContributionTool(BaseTool):
     if depth < 0:
       # Do nothing if crawling depth is reached
       return
-    base_url = content.getContentBaseURL()
-    url_list = map(lambda url: self.normaliseURL(url, base_url), set(content.getContentURLList()))
+    url_list = content.getContentNormalisedURLList()
     for url in set(url_list):
       # LOG('trying to crawl', 0, url)
       # Some url protocols should not be crawled
-      if url.split(':')[0] in no_crawl_protocol_list:
+      if urlparse.urlsplit(url)[0] in no_crawl_protocol_list:
         continue
       if container is None:
         #if content.getParentValue()
         # in place of not ?
         container = content.getParentValue()
-      # Calculate the id under which content will be stored
-      id = self.encodeURL(url)
-      # Try to access the document if it already exists
-      document = container.get(id, None)
-      if document is None:
-        # XXX - This call is not working due to missing group_method_id
-        # therefore, multiple call happen in parallel and eventually fail
-        # (the same URL is created multiple times)
-        # LOG('activate newContentFromURL', 0, url)
-        self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
-                                                      id=id, url=url, crawling_depth=depth)
-      elif depth and document.getCrawlingDepth() < depth:
-        # Update the crawling depth if necessary
-        document._setCrawlingDepth(depth)
-        document.activate().crawlContent()
+      try:
+        url_registry_tool.getReferenceFromURL(url, context=container)
+      except KeyError:
+        pass
+      else:
+        # url already crawled
+        continue
+      # XXX - This call is not working due to missing group_method_id
+      # therefore, multiple call happen in parallel and eventually fail
+      # (the same URL is created multiple times)
+      # LOG('activate newContentFromURL', 0, url)
+      self.activate(activity="SQLQueue").newContentFromURL(
+                                  container_path=container.getRelativeUrl(),
+                                  url=url, crawling_depth=depth)
+      # Url is not known yet but register it right now to avoid
+      # creation of duplicated crawled content.
+      # An activity will later set up the correct reference for it.
+      url_registry_tool.registerURL(url, None, context=container)
 
   security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
   def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
@@ -595,10 +527,7 @@ class ContributionTool(BaseTool):
       # Step 1: download new content
       try:
         url = content.asURL()
-        data = urllib2.urlopen(url).read()
-        file = cStringIO.StringIO()
-        file.write(data)
-        file.seek(0)
+        file_object, filename, content_type = self._openURL(url)
       except urllib2.HTTPError, error:
         if repeat == 0:
           # XXX - Call the extendBadURLList method,--NOT Implemented--
@@ -615,28 +544,28 @@ class ContributionTool(BaseTool):
         content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
         return
 
-      # Step 2: compare and update if necessary (md5)
-      # md5 stuff to compare contents
-      new_content_md5 = md5_new(data).hexdigest()
-      content_md5 = content.getContentMd5()
-      if content_md5 == new_content_md5:
-        return
-      content._edit(file=file)# Please make sure that if content is the same
+      content._edit(file=file_object, content_type=content_type)
+                              # Please make sure that if content is the same
                               # we do not update it
                               # This feature must be implemented by Base or File
                               # not here (look at _edit in Base)
-      # Step 3: convert to base format
-      content.convertToBaseFormat()
+      # Step 2: convert to base format
+      if content.isSupportBaseDataConversion():
+        content.activate().Document_tryToConvertToBaseFormat()
+      # Step 3: run discoverMetadata
+      content.activate(after_path_and_method_id=(content.getPath(),
+            ('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
+          .discoverMetadata(filename=filename)
       # Step 4: activate populate (unless interaction workflow does it)
       content.activate().populateContent()
       # Step 5: activate crawlContent
       depth = content.getCrawlingDepth()
       if depth > 0:
         content.activate().crawlContent()
-      content.setContentMd5(new_content_md5)
 
   security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
-  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, repeat_interval=1, batch_mode=True, **kw):
+  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT,
+                        repeat_interval=1, batch_mode=True, url=None, **kw):
     """
       A wrapper method for newContent which provides extra safety
       in case or errors (ie. download, access, conflict, etc.).
@@ -646,17 +575,13 @@ class ContributionTool(BaseTool):
       the at_date parameter and some standard values.
 
       NOTE: implementation needs to be done.
+      id parameter is ignored
     """
     document = None
-    # First of all, make sure do not try to create an existing document
-    if container_path is not None and id is not None:
-      container = self.restrictedTraverse(container_path)
-      document = container.get(id, None)
-      if document is not None:
-        # Document aleardy exists: no need to keep on crawling
-        return document
+    if not url:
+      raise TypeError, 'url parameter is mandatory'
     try:
-      document = self.newContent(container_path=container_path, id=id, **kw)
+      document = self.newContent(container_path=container_path, url=url, **kw)
       if document.isIndexContent() and document.getCrawlingDepth() >= 0:
         # If this is an index document, keep on crawling even if crawling_depth is 0
         document.activate().crawlContent()
@@ -672,7 +597,7 @@ class ContributionTool(BaseTool):
       if repeat > 0:
         # Catch any HTTP error
         self.activate(at_date=DateTime() + repeat_interval).newContentFromURL(
-                          container_path=container_path, id=id,
+                          container_path=container_path, url=url,
                           repeat=repeat - 1,
                           repeat_interval=repeat_interval, **kw)
     except urllib2.URLError, error:
@@ -685,28 +610,57 @@ class ContributionTool(BaseTool):
       if repeat > 0:
         self.activate(at_date=DateTime() + repeat_interval,
                       activity="SQLQueue").newContentFromURL(
-                        container_path=container_path, id=id,
+                        container_path=container_path, url=url,
                         repeat=repeat - 1,
                         repeat_interval=repeat_interval, **kw)
     return document
 
-  def _guessPortalType(self, name, typ, body):
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'guessMimeTypeFromFilename')
+  def guessMimeTypeFromFilename(self, filename):
     """
-       Call Portal Contribution Registry
-       to know which portal_type should be used
+      get mime type from file name
     """
-    findPortalTypeName = None
-    registry = getToolByName(self, 'portal_contribution_registry', None)
-    if registry is not None:
-      findPortalTypeName = registry.findPortalTypeName
-    else:
-      # Keep backward compatibility
-      registry = getToolByName(self, 'content_type_registry', None)
-      if registry is None:
-        return None
-      findPortalTypeName = registry.findTypeName
-
-    portal_type = findPortalTypeName(name, typ, body)
-    return portal_type
+    if not filename:
+      return
+    portal = self.getPortalObject()
+    content_type = portal.mimetypes_registry.lookupExtension(filename)
+    return content_type
+
+  def _openURL(self, url):
+    """Download content from url,
+    read filename and content_type
+    return file_object, filename, content_type tuple
+    """
+    # Quote path part of url
+    url_tuple = urlparse.urlsplit(url)
+    quoted_path = urllib.quote(url_tuple[2])
+    url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
+                               url_tuple[3], url_tuple[4]))
+    # build a new file from the url
+    url_file = urllib2.urlopen(url)
+    data = url_file.read() # time out must be set or ... too long XXX
+    file_object = cStringIO.StringIO()
+    file_object.write(data)
+    file_object.seek(0)
+    # if a content-disposition header is present,
+    # try first to read the suggested filename from it.
+    header_info = url_file.info()
+    content_disposition = header_info.getheader('content-disposition', '')
+    filename = parse_header(content_disposition)[1].get('filename')
+    if not filename:
+      # Now read the filename from url.
+      # In case of http redirection, the real url must be read
+      # from file object returned by urllib2.urlopen.
+      # It can happen when the header 'Location' is present in request.
+      # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
+      url = url_file.geturl()
+      # Create a file name based on the URL and quote it
+      filename = urlparse.urlsplit(url)[-3]
+      filename = os.path.basename(filename)
+      filename = urllib.quote(filename, safe='')
+      filename = filename.replace('%', '')
+    content_type = header_info.gettype()
+    return file_object, filename, content_type
 
 InitializeClass(ContributionTool)
diff --git a/product/ERP5/__init__.py b/product/ERP5/__init__.py
index 5a4509b979..60c5efd87a 100644
--- a/product/ERP5/__init__.py
+++ b/product/ERP5/__init__.py
@@ -50,7 +50,7 @@ from Tool import CategoryTool, SimulationTool, RuleTool, IdTool, TemplateTool,\
                  TrashTool, ContributionTool, NotificationTool, PasswordTool,\
                  GadgetTool, ContributionRegistryTool, IntrospectionTool,\
                  AcknowledgementTool, SolverTool, SolverProcessTool,\
-                 ConversionTool, RoundingTool
+                 ConversionTool, RoundingTool, UrlRegistryTool
 import ERP5Site
 from Document import PythonScript
 object_classes = ( ERP5Site.ERP5Site,
@@ -78,6 +78,7 @@ portal_tools = ( CategoryTool.CategoryTool,
                  SolverProcessTool.SolverProcessTool,
                  ConversionTool.ConversionTool,
                  RoundingTool.RoundingTool,
+                 UrlRegistryTool.UrlRegistryTool,
                 )
 content_classes = ()
 content_constructors = ()
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/drawing_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/drawing_extension.xml
index 4df62bfbc1..dd1bd6379f 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/drawing_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/drawing_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>sxd</string>
@@ -32,7 +32,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -46,7 +46,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>60</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -60,7 +60,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Drawing</string> </value>
+            <value> <string>Drawing by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/image_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/image_extension.xml
index 4ae9e66cd8..35a1032c51 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/image_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/image_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>gif</string>
@@ -35,7 +35,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -49,7 +49,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>20</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -63,7 +63,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Image</string> </value>
+            <value> <string>Image by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_extension.xml
index 6df2da40fa..236f4b14ee 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>pdf</string>
@@ -31,7 +31,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -45,7 +45,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>30</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -59,7 +59,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>PDF</string> </value>
+            <value> <string>PDF by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_mimetype.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_mimetype.xml
index 843c9a13ec..e20829dd33 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_mimetype.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/pdf_mimetype.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>mime_type</string> </key>
+                    <key> <string>content_type</string> </key>
                     <value>
                       <list>
                         <string>application/pdf</string>
@@ -31,7 +31,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>mime_type</string>
+                <string>content_type</string>
               </tuple>
             </value>
         </item>
@@ -45,7 +45,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>30</int> </value>
+            <value> <int>20</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -59,7 +59,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>PDF</string> </value>
+            <value> <string>PDF by mimetype</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/presentation_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/presentation_extension.xml
index 5113c25adb..61301201ec 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/presentation_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/presentation_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>ppt</string>
@@ -34,7 +34,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -48,7 +48,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>50</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -62,7 +62,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Presentation</string> </value>
+            <value> <string>Presentation by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_by_content.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_by_content.xml
new file mode 100644
index 0000000000..f29b786817
--- /dev/null
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_by_content.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>_identity_criterion</string> </key>
+            <value>
+              <dictionary>
+                <item>
+                    <key> <string>content_type_from_content</string> </key>
+                    <value>
+                      <list>
+                        <string>application/vnd.ms-excel</string>
+                        <string>application/vnd.ms-office</string>
+                        <string>application/msexcel</string>
+                        <string>application/vnd.oasis.opendocument.spreadsheet</string>
+                        <string>application/vnd.oasis.opendocument.spreadsheet-template</string>
+                      </list>
+                    </value>
+                </item>
+              </dictionary>
+            </value>
+        </item>
+        <item>
+            <key> <string>_range_criterion</string> </key>
+            <value>
+              <dictionary/>
+            </value>
+        </item>
+        <item>
+            <key> <string>criterion_property</string> </key>
+            <value>
+              <tuple>
+                <string>content_type_from_content</string>
+              </tuple>
+            </value>
+        </item>
+        <item>
+            <key> <string>destination_portal_type</string> </key>
+            <value> <string>Spreadsheet</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>spreadsheet_by_content</string> </value>
+        </item>
+        <item>
+            <key> <string>int_index</string> </key>
+            <value> <int>70</int> </value>
+        </item>
+        <item>
+            <key> <string>portal_type</string> </key>
+            <value> <string>Contribution Predicate</string> </value>
+        </item>
+        <item>
+            <key> <string>test_method_id</string> </key>
+            <value>
+              <tuple/>
+            </value>
+        </item>
+        <item>
+            <key> <string>title</string> </key>
+            <value> <string>Spreadsheet by content</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_extension.xml
index 0fc157565f..781064639c 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/spreadsheet_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>xls</string>
@@ -35,7 +35,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -49,7 +49,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>40</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -63,7 +63,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Spreadsheet</string> </value>
+            <value> <string>Spreadsheet by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_conent_type.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_conent_type.xml
new file mode 100644
index 0000000000..4db5b6382d
--- /dev/null
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_conent_type.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>_identity_criterion</string> </key>
+            <value>
+              <dictionary>
+                <item>
+                    <key> <string>content_type</string> </key>
+                    <value>
+                      <list>
+                        <string>text/plain</string>
+                      </list>
+                    </value>
+                </item>
+              </dictionary>
+            </value>
+        </item>
+        <item>
+            <key> <string>_range_criterion</string> </key>
+            <value>
+              <dictionary/>
+            </value>
+        </item>
+        <item>
+            <key> <string>criterion_property</string> </key>
+            <value>
+              <tuple>
+                <string>content_type</string>
+              </tuple>
+            </value>
+        </item>
+        <item>
+            <key> <string>destination_portal_type</string> </key>
+            <value> <string>Text</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>text_by_conent_type</string> </value>
+        </item>
+        <item>
+            <key> <string>int_index</string> </key>
+            <value> <int>20</int> </value>
+        </item>
+        <item>
+            <key> <string>portal_type</string> </key>
+            <value> <string>Contribution Predicate</string> </value>
+        </item>
+        <item>
+            <key> <string>test_method_id</string> </key>
+            <value>
+              <tuple/>
+            </value>
+        </item>
+        <item>
+            <key> <string>title</string> </key>
+            <value> <string>Text by content type</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_content.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_content.xml
new file mode 100644
index 0000000000..3266b985ad
--- /dev/null
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_by_content.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>_identity_criterion</string> </key>
+            <value>
+              <dictionary>
+                <item>
+                    <key> <string>content_type_from_content</string> </key>
+                    <value>
+                      <list>
+                        <string>text/plain</string>
+                      </list>
+                    </value>
+                </item>
+              </dictionary>
+            </value>
+        </item>
+        <item>
+            <key> <string>_range_criterion</string> </key>
+            <value>
+              <dictionary/>
+            </value>
+        </item>
+        <item>
+            <key> <string>criterion_property</string> </key>
+            <value>
+              <tuple>
+                <string>content_type_from_content</string>
+              </tuple>
+            </value>
+        </item>
+        <item>
+            <key> <string>destination_portal_type</string> </key>
+            <value> <string>Text</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>text_by_content</string> </value>
+        </item>
+        <item>
+            <key> <string>int_index</string> </key>
+            <value> <int>70</int> </value>
+        </item>
+        <item>
+            <key> <string>portal_type</string> </key>
+            <value> <string>Contribution Predicate</string> </value>
+        </item>
+        <item>
+            <key> <string>test_method_id</string> </key>
+            <value>
+              <tuple/>
+            </value>
+        </item>
+        <item>
+            <key> <string>title</string> </key>
+            <value> <string>Text by mimetype from data</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_extension.xml
index 6f8a24d792..36ced43d8b 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/text_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>txt</string>
@@ -36,7 +36,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -64,7 +64,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Text</string> </value>
+            <value> <string>Text by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/web_page_by_content.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/web_page_by_content.xml
new file mode 100644
index 0000000000..45064c34c5
--- /dev/null
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/web_page_by_content.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>_identity_criterion</string> </key>
+            <value>
+              <dictionary>
+                <item>
+                    <key> <string>content_type_from_content</string> </key>
+                    <value>
+                      <list>
+                        <string>text/html</string>
+                      </list>
+                    </value>
+                </item>
+              </dictionary>
+            </value>
+        </item>
+        <item>
+            <key> <string>_range_criterion</string> </key>
+            <value>
+              <dictionary/>
+            </value>
+        </item>
+        <item>
+            <key> <string>criterion_property</string> </key>
+            <value>
+              <tuple>
+                <string>content_type_from_content</string>
+              </tuple>
+            </value>
+        </item>
+        <item>
+            <key> <string>destination_portal_type</string> </key>
+            <value> <string>Web Page</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>web_page_by_content</string> </value>
+        </item>
+        <item>
+            <key> <string>int_index</string> </key>
+            <value> <int>70</int> </value>
+        </item>
+        <item>
+            <key> <string>portal_type</string> </key>
+            <value> <string>Contribution Predicate</string> </value>
+        </item>
+        <item>
+            <key> <string>test_method_id</string> </key>
+            <value>
+              <tuple/>
+            </value>
+        </item>
+        <item>
+            <key> <string>title</string> </key>
+            <value> <string>Web Page by mimetype from data</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_extension.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_extension.xml
index b5ad3d5eb9..a3bdbf23da 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_extension.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_extension.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>file_extension</string> </key>
+                    <key> <string>extension_from_filename</string> </key>
                     <value>
                       <list>
                         <string>html</string>
@@ -33,7 +33,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>file_extension</string>
+                <string>extension_from_filename</string>
               </tuple>
             </value>
         </item>
@@ -47,7 +47,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>90</int> </value>
+            <value> <int>10</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -61,7 +61,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Web Page</string> </value>
+            <value> <string>Web Page by extension</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_mimetype.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_mimetype.xml
index 28f51d22e5..0ee1b5b3b5 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_mimetype.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_contribution_registry/webpage_mimetype.xml
@@ -11,7 +11,7 @@
             <value>
               <dictionary>
                 <item>
-                    <key> <string>mime_type</string> </key>
+                    <key> <string>content_type</string> </key>
                     <value>
                       <list>
                         <string>text/html</string>
@@ -31,7 +31,7 @@
             <key> <string>criterion_property</string> </key>
             <value>
               <tuple>
-                <string>mime_type</string>
+                <string>content_type</string>
               </tuple>
             </value>
         </item>
@@ -45,13 +45,7 @@
         </item>
         <item>
             <key> <string>int_index</string> </key>
-            <value> <int>90</int> </value>
-        </item>
-        <item>
-            <key> <string>membership_criterion_base_category</string> </key>
-            <value>
-              <tuple/>
-            </value>
+            <value> <int>20</int> </value>
         </item>
         <item>
             <key> <string>portal_type</string> </key>
@@ -65,7 +59,7 @@
         </item>
         <item>
             <key> <string>title</string> </key>
-            <value> <string>Web Page</string> </value>
+            <value> <string>Web Page by mimetype</string> </value>
         </item>
       </dictionary>
     </pickle>
diff --git a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_preferences/default_site_preference.xml b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_preferences/default_site_preference.xml
index 20ce1763e8..1fbb5cbaf5 100644
--- a/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_preferences/default_site_preference.xml
+++ b/product/ERP5/bootstrap/erp5_core/PathTemplateItem/portal_preferences/default_site_preference.xml
@@ -24,6 +24,22 @@
               </tuple>
             </value>
         </item>
+        <item>
+            <key> <string>_Add_portal_content_Permission</string> </key>
+            <value>
+              <tuple>
+                <string>Manager</string>
+              </tuple>
+            </value>
+        </item>
+        <item>
+            <key> <string>_Delete_objects_Permission</string> </key>
+            <value>
+              <tuple>
+                <string>Manager</string>
+              </tuple>
+            </value>
+        </item>
         <item>
             <key> <string>_Modify_portal_content_Permission</string> </key>
             <value>
@@ -252,6 +268,22 @@ It\'s the lowest priority one; ie. managers can create higher priority preferenc
             <key> <string>preferred_date_order</string> </key>
             <value> <string>ymd</string> </value>
         </item>
+        <item>
+            <key> <string>preferred_document_file_name_regular_expression</string> </key>
+            <value> <string encoding="cdata"><![CDATA[
+
+(?P<reference>[A-Z&Ã©@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})
+
+]]></string> </value>
+        </item>
+        <item>
+            <key> <string>preferred_document_reference_regular_expression</string> </key>
+            <value> <string encoding="cdata"><![CDATA[
+
+(?P<reference>[A-Z&Ã©@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?
+
+]]></string> </value>
+        </item>
         <item>
             <key> <string>preferred_event_assessment_form_id</string> </key>
             <value>
diff --git a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_download.xml b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_download.xml
index c91dba6b56..57a6e1c18e 100644
--- a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_download.xml
+++ b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_download.xml
@@ -58,8 +58,8 @@ from zExceptions import Unauthorized\n
 format = None\n
 # Always force download of document even if format is supported\n
 # by browser\n
-file_name = context.getStandardFileName(format)\n
-response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % file_name)\n
+filename = context.getStandardFilename(format)\n
+response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % filename)\n
 \n
 try:\n
   return context.index_html(request, response, format)\n
@@ -111,7 +111,7 @@ except Unauthorized:\n
                             <string>None</string>
                             <string>format</string>
                             <string>context</string>
-                            <string>file_name</string>
+                            <string>filename</string>
                             <string>msg</string>
                             <string>dict</string>
                           </tuple>
diff --git a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionPredicate_view/my_criterion_property_list.xml b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionPredicate_view/my_criterion_property_list.xml
index a864ab9e89..9d50cd518e 100644
--- a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionPredicate_view/my_criterion_property_list.xml
+++ b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionPredicate_view/my_criterion_property_list.xml
@@ -222,12 +222,16 @@
                     <value>
                       <list>
                         <tuple>
-                          <string>file_extension</string>
-                          <string>file_extension</string>
+                          <string>extension_from_filename</string>
+                          <string>extension_from_filename</string>
                         </tuple>
                         <tuple>
-                          <string>mime_type</string>
-                          <string>mime_type</string>
+                          <string>content_type</string>
+                          <string>content_type</string>
+                        </tuple>
+                        <tuple>
+                          <string>content_type_from_content</string>
+                          <string>content_type_from_content</string>
                         </tuple>
                       </list>
                     </value>
diff --git a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionRegistryTool_viewContributionPredicateList/listbox.xml b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionRegistryTool_viewContributionPredicateList/listbox.xml
index 6cd98d6343..602fccab46 100644
--- a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionRegistryTool_viewContributionPredicateList/listbox.xml
+++ b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/ContributionRegistryTool_viewContributionPredicateList/listbox.xml
@@ -352,6 +352,10 @@
                     <key> <string>css_class</string> </key>
                     <value> <string></string> </value>
                 </item>
+                <item>
+                    <key> <string>default_display_style</string> </key>
+                    <value> <string>table</string> </value>
+                </item>
                 <item>
                     <key> <string>default_params</string> </key>
                     <value>
@@ -362,6 +366,12 @@
                     <key> <string>description</string> </key>
                     <value> <string></string> </value>
                 </item>
+                <item>
+                    <key> <string>display_style_list</string> </key>
+                    <value>
+                      <list/>
+                    </value>
+                </item>
                 <item>
                     <key> <string>domain_root_list</string> </key>
                     <value>
@@ -396,10 +406,18 @@
                       <list/>
                     </value>
                 </item>
+                <item>
+                    <key> <string>global_search_column</string> </key>
+                    <value> <string></string> </value>
+                </item>
                 <item>
                     <key> <string>hidden</string> </key>
                     <value> <int>0</int> </value>
                 </item>
+                <item>
+                    <key> <string>hide_rows_on_no_search_criterion</string> </key>
+                    <value> <int>0</int> </value>
+                </item>
                 <item>
                     <key> <string>lines</string> </key>
                     <value> <int>20</int> </value>
@@ -425,6 +443,10 @@
                       </list>
                     </value>
                 </item>
+                <item>
+                    <key> <string>page_navigation_mode</string> </key>
+                    <value> <string>slider</string> </value>
+                </item>
                 <item>
                     <key> <string>page_template</string> </key>
                     <value> <string></string> </value>
@@ -445,6 +467,10 @@
                     <key> <string>report_tree</string> </key>
                     <value> <int>0</int> </value>
                 </item>
+                <item>
+                    <key> <string>row_css_method</string> </key>
+                    <value> <string></string> </value>
+                </item>
                 <item>
                     <key> <string>search</string> </key>
                     <value> <int>0</int> </value>
@@ -490,10 +516,22 @@
                     <key> <string>stat_method</string> </key>
                     <value> <string></string> </value>
                 </item>
+                <item>
+                    <key> <string>style_columns</string> </key>
+                    <value>
+                      <list/>
+                    </value>
+                </item>
                 <item>
                     <key> <string>title</string> </key>
                     <value> <string>Contribution Predicates</string> </value>
                 </item>
+                <item>
+                    <key> <string>untranslatable_columns</string> </key>
+                    <value>
+                      <list/>
+                    </value>
+                </item>
                 <item>
                     <key> <string>url_columns</string> </key>
                     <value>
diff --git a/product/ERP5/bootstrap/erp5_core/bt/revision b/product/ERP5/bootstrap/erp5_core/bt/revision
index 40ef691a23..b99cc44f71 100644
--- a/product/ERP5/bootstrap/erp5_core/bt/revision
+++ b/product/ERP5/bootstrap/erp5_core/bt/revision
@@ -1 +1 @@
-40819
\ No newline at end of file
+40820
\ No newline at end of file
diff --git a/product/ERP5/bootstrap/erp5_core/bt/template_path_list b/product/ERP5/bootstrap/erp5_core/bt/template_path_list
index e441af2db9..472021120f 100644
--- a/product/ERP5/bootstrap/erp5_core/bt/template_path_list
+++ b/product/ERP5/bootstrap/erp5_core/bt/template_path_list
@@ -22,8 +22,12 @@ portal_contribution_registry/image_extension
 portal_contribution_registry/pdf_extension
 portal_contribution_registry/pdf_mimetype
 portal_contribution_registry/presentation_extension
+portal_contribution_registry/spreadsheet_by_content
 portal_contribution_registry/spreadsheet_extension
+portal_contribution_registry/text_by_conent_type
+portal_contribution_registry/text_by_content
 portal_contribution_registry/text_extension
+portal_contribution_registry/web_page_by_content
 portal_contribution_registry/webpage_extension
 portal_contribution_registry/webpage_mimetype
 portal_domains/base_day_domain
diff --git a/product/ERP5/interfaces/discoverable.py b/product/ERP5/interfaces/discoverable.py
new file mode 100644
index 0000000000..6a0a293f41
--- /dev/null
+++ b/product/ERP5/interfaces/discoverable.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Jean-Paul Smets-Solanes <jp@nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class IDiscoverable(Interface):
+  """
+  Discoverable interface specification
+
+  Documents which implement IDiscoverable provide
+  methods to discover and update metadata properties
+  from content, user input, file name, etc.
+  """
+
+  def getContentInformation():
+    """
+    Returns a dictionary of possible metadata which can be extracted from the
+    document content (ex. title from an HTML file, creation date from a PDF
+    document, etc.)
+    """
+
+  def getPropertyDictFromUserLogin(user_login=None):
+    """
+    Based on the user_login, find out all properties which
+    can be discovered to later update document metadata.
+
+    user_login -- optional user login ID
+    """
+
+  def getPropertyDictFromContent():
+    """
+    Based on the result of getContentInformation, find out all
+    properties which can be discovered to later update document metadata.
+    """
+
+  def getPropertyDictFromFilename(filename):
+    """
+    Based on the file name, find out all properties which
+    can be discovered to later update document metadata.
+
+    filename -- file name to use in discovery process
+    """
+
+  def getPropertyDictFromInput(input_parameter_dict):
+    """
+    Based on the user input (input_parameter_dict), find out all
+    properties which can be discovered to later update document metadata.
+    """
+
+  def discoverMetadata(filename=None, user_login=None,
+                       input_parameter_dict=None):
+    """
+    Updates the document metadata by discovering metadata from the user
+    login, the document content, the file name and the user input.
+    The order of discovery should be set in system preferences.
+
+    filename -- optional file name (ex. AA-BBB-CCC-223-en.doc)
+    user_login -- optional user login ID
+    input_parameter_dict -- optional arguments provided by the user
+
+    XXX - it is unclear if this method should also trigger finishIngestion
+          and whether this should be documented here or not
+    """
+
+  def finishIngestion():
+    """
+    Finish the ingestion process (ex. allocate a reference number automatically if
+    no reference was defined.)
+
+    XXX - it is unclear if this method should be part of the interface
+    """
+
+  def getExtensionFromFilename(filename=None):
+    """Return calculated value of extension read from the optional filename
+    """
+
+  def getContentTypeFromContent():
+    """Return calculated value of content type read from content
+    """
diff --git a/product/ERP5/interfaces/document.py b/product/ERP5/interfaces/document.py
index 6ab86d28a8..11ef826c79 100644
--- a/product/ERP5/interfaces/document.py
+++ b/product/ERP5/interfaces/document.py
@@ -87,7 +87,7 @@ class IDocument(Interface):
 
   input      -   data supplied with http request or set on the object during (2) (e.g.
                  discovered from email text)
-  file_name  -   data which might be encoded in file name
+  filename  -   data which might be encoded in filename
   user_login -   information about user who is contributing the file
   content    -   data which might be derived from document content
 
diff --git a/product/ERP5/interfaces/downloadable.py b/product/ERP5/interfaces/downloadable.py
index 803db3365b..c1e828a1df 100644
--- a/product/ERP5/interfaces/downloadable.py
+++ b/product/ERP5/interfaces/downloadable.py
@@ -52,11 +52,11 @@ class IDownloadable(Interface):
     kw -- optional conversion parameters
     """
 
-  def getStandardFileName(format=None):
+  def getStandardFilename(format=None):
     """
     Returns a standard file name for the document to download.
     This method is the reverse of
-    IMetadataDiscoverable.getPropertyDictFromFileName.
+    IDiscoverable.getPropertyDictFromFilename.
 
     format -- extension of returned file name
     """
diff --git a/product/ERP5/interfaces/url.py b/product/ERP5/interfaces/url.py
new file mode 100644
index 0000000000..99f324d298
--- /dev/null
+++ b/product/ERP5/interfaces/url.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Nicolas Delaby <nicolas@nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class IUrl(Interface):
+  """
+  """
+
+  def asURL():
+    """
+    Returns a text representation of the Url if defined
+    or None otherwise.
+    """
+
+
+  def fromURL(url):
+    """
+    Analyses a URL and splits it into two parts. URLs
+    normally follow RFC 1738. However, we accept URLs
+    without the protocol a.k.a. scheme part (http, mailto, etc.). In this
+    case only the url_string a.k.a. scheme-specific-part is taken
+    into account. asURL will then generate the full URL.
+    """
+
+  def getURLServer():
+    """
+    Returns the server part of a URL
+    """
+
+  def getURLPort():
+    """
+    Returns the port part of a URL
+    """
+
+  def getURLPath():
+    """
+    Returns the path part of a URL
+    """
+
+  def asNormalisedURL(base_url=None):
+    """
+    Returns a normalised version of the url so
+    that we do not download twice the same content.
+    This normalisation must refer to the same resource !
+    Refer to http://en.wikipedia.org/wiki/URL_normalization .
+
+    base_url - Specify a default URL and a default target
+               for all links on a page.
+               if url is a relative link, we try to compute an absolute url
+               with help of base_url
+    """
diff --git a/product/ERP5/interfaces/url_registry_tool.py b/product/ERP5/interfaces/url_registry_tool.py
new file mode 100644
index 0000000000..73d8d731e1
--- /dev/null
+++ b/product/ERP5/interfaces/url_registry_tool.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Nicolas Delaby <nicolas@nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from zope.interface import Interface
+
+class IUrlRegistryTool(Interface):
+  """Tool to register URLs
+  This tool aims to maintain consistency in URL management
+  of crawlable sources in order to maintain consistency
+  between an external resource identifier and generated
+  document inside ERP5.
+
+  Multiple URLs can be associated with the same reference
+
+  A System Preference can be used to configure the global namespace.
+  This enables isolation of url mappings for different Groups.
+
+  This is a configurable tool to support different scope for mappings.
+  So it is possible to restrict the crawling of an URL
+  only once in the context of portal;
+  Or restrict the crawling of an url for the scope of an external_source
+  or a module only (Crawling multiple times the same URL for a portal)
+  """
+
+  def clearUrlRegistryTool(context=None):
+    """Unregister all urls in all namespaces.
+    Only available for Manager
+
+    context - a context to access container of mappings.
+    """
+
+  def registerURL(url, reference, context=None):
+    """Register the mapping url:reference.
+    This method is meant to be called from an interaction_workflow
+    which triggers on _setReference in order to keep the association
+    between url:reference up to date.
+
+    url - external resource identifier
+    reference - reference of downloaded resource (ERP5 Object instance)
+    context - a context to access container of mappings.
+              If not passed, mappings are stored on tool itself
+    """
+
+  def getReferenceList(context=None):
+    """Return all references registered by portal_url_registry
+    according to the given context
+
+    context - a context to access container of mappings.
+    """
+
+  def getReferenceFromURL(url, context=None):
+    """Return the reference of the document matching the provided url
+
+    url - external resource identifier
+    context - a context to access container of mappings.
+              If not passed, mapping are stored on tool itself
+    """
+
+  def getURLListFromReference(reference, context=None):
+    """return list of urls associated to given reference
+    and context.
+
+    reference - reference of downloaded resource (ERP5 Object instance)
+    context - a context to access container of mappings.
+    """
+
+  def updateUrlRegistryTool():
+    """Rebuild all url mappings for active preference
+    """
diff --git a/product/ERP5/mixin/cached_convertable.py b/product/ERP5/mixin/cached_convertable.py
index 12dd12e09b..8285871af1 100644
--- a/product/ERP5/mixin/cached_convertable.py
+++ b/product/ERP5/mixin/cached_convertable.py
@@ -139,10 +139,21 @@ class CachedConvertableMixin:
       cached_value = data
       conversion_md5 = md5_new(str(data.data)).hexdigest()
       size = len(data.data)
-    else:
+    elif isinstance(data, (str, unicode,)):
       cached_value = data
       conversion_md5 = md5_new(cached_value).hexdigest()
       size = len(cached_value)
+    elif isinstance(data, dict):
+      # Dict instances are used to store metadata computed
+      # from the actual content.
+      # This value is intimately related to the conversion cache,
+      # as it should be cleared each time the document is edited.
+      # A proper dedicated API should maybe be used instead.
+      cached_value = data
+      conversion_md5 = None
+      size = len(cached_value)
+    else:
+      raise NotImplementedError, 'Not able to store type:%r' % type(data)
     if date is None:
       date = DateTime()
     stored_data_dict = {'content_md5': self.getContentMd5(),
diff --git a/product/ERP5/mixin/crawlable.py b/product/ERP5/mixin/crawlable.py
index f7d42a6df2..99a95bff1d 100644
--- a/product/ERP5/mixin/crawlable.py
+++ b/product/ERP5/mixin/crawlable.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 ##############################################################################
 #
-# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
 #                    Ivan Tyagov <ivan@nexedi.com>
 #
 # WARNING: This program as such is intended to be used by professional
@@ -27,8 +27,13 @@
 #
 ##############################################################################
 
-from AccessControl import ClassSecurityInfo, getSecurityManager
+from AccessControl import ClassSecurityInfo
 from Products.ERP5Type import Permissions
+from Products.ERP5Type.Utils import normaliseUrl
+from Products.ERP5Type.DateUtils import convertDateToHour,\
+     number_of_hours_in_day, number_of_hours_in_year
+from urlparse import urlsplit, urlunsplit
+from lxml import html as etree_html
 
 class CrawlableMixin:
   """
@@ -80,3 +85,81 @@ class CrawlableMixin:
     method = self._getTypeBasedMethod('isUpdatable',
         fallback_script_id = 'Document_isUpdatable')
     return method()
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getContentURLList')
+  def getContentURLList(self):
+    """
+    Returns a list of URLs referenced by the content of this document.
+    Default implementation consists in analysing the document
+    converted to HTML. Subclasses may overload this method
+    if necessary. However, it is better to extend the conversion
+    methods in order to produce valid HTML, which is useful to
+    many people, rather than overload this method which is only
+    useful for crawling.
+    """
+    html_content = self.asEntireHTML()
+    html_tree = etree_html.fromstring(html_content)
+    base_href = self.getContentBaseURL()
+    if base_href:
+      html_tree.make_links_absolute(base_href)
+    href_list = []
+    for elemnt, attribute_name, link, position in html_tree.iterlinks():
+      # For now, only take 'href' attributes into account
+      if attribute_name not in ('href',):
+        continue
+      if isinstance(link, unicode):
+        link = link.encode('utf-8')
+      href_list.append(link)
+    return href_list
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getContentBaseURL')
+  def getContentBaseURL(self):
+    """
+    Returns the content base URL based on the actual content or
+    on its URL.
+    """
+    raw_url = self.asURL() or ''
+    splitted_url = urlsplit(raw_url)
+    path_part = splitted_url[2]
+    path_part = '/'.join(path_part.split('/')[:-1])
+    base_url = urlunsplit((splitted_url[0], splitted_url[1], path_part, None,
+                           None))
+    if isinstance(base_url, unicode):
+      base_url = base_url.encode('utf-8')
+    return base_url
+
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getContentNormalisedURLList')
+  def getContentNormalisedURLList(self):
+    """
+    Normalise each url returned by getContentURLList and
+    return only those belonging to the same domain as this document.
+    """
+    reference_domain = urlsplit(normaliseUrl(self.asURL() or ''))[1]
+    # in www.example.com or www.3.example.com
+    # keep only the example.com part (joined with '.', so a.bc != ab.c)
+    reference_domain = '.'.join(reference_domain.split('.')[-2:])
+    if isinstance(reference_domain, unicode):
+      reference_domain = reference_domain.encode('utf-8')
+    url_list = []
+    base_url = self.getContentBaseURL()
+    for url in self.getContentURLList():
+      try:
+        url = normaliseUrl(url, base_url=base_url)
+      except UnicodeDecodeError:
+        # Ignore wrong encoding errors
+        # Web is not a kind world
+        continue
+      if not url:
+        continue
+      url_domain = urlsplit(url)[1]
+      if isinstance(url_domain, unicode):
+        url_domain = url_domain.encode('utf-8')
+      if url_domain and '.'.join(url_domain.split('.')[-2:]) != reference_domain:
+        continue
+      # if domain is empty (relative link) or domain is same, then OK
+      url_list.append(url)
+    return url_list
diff --git a/product/ERP5/mixin/discoverable.py b/product/ERP5/mixin/discoverable.py
new file mode 100644
index 0000000000..7ad46fdd2c
--- /dev/null
+++ b/product/ERP5/mixin/discoverable.py
@@ -0,0 +1,254 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Ivan Tyagov <ivan@nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo, getSecurityManager
+from Products.ERP5Type import Permissions
+from Products.ERP5Type.Utils import convertToUpperCase
+from Products.CMFCore.utils import getToolByName
+from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
+import os
+import re
+
+try:
+  import magic
+except ImportError:
+  magic = None
+
+VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'input')
+
+CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'
+
+class DiscoverableMixin(CachedConvertableMixin):
+  """
+  Implements IDiscoverable
+  This class provide methods useful for Metadata extraction.
+  It inherit from CachedConvertableMixin to access
+  Cache storage API.
+  As computed data needs to be stored in same backend.
+  """
+  security = ClassSecurityInfo()
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getPropertyDictFromUserLogin')
+  def getPropertyDictFromUserLogin(self, user_login=None):
+    """
+    Based on the user_login, find out as many properties as needed.
+    returns properties which should be set on the document
+    """
+    if user_login is None:
+      user_login = str(getSecurityManager().getUser())
+    method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
+        fallback_script_id='Document_getPropertyDictFromUserLogin')
+    return method(user_login)
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getPropertyDictFromContent')
+  def getPropertyDictFromContent(self):
+    """
+    Based on the document content, find out as many properties as needed.
+    returns properties which should be set on the document
+    """
+    # access data through convert
+    mime, content = self.convert(None)
+    if not content:
+       # if document is empty, we will not find anything in its content
+      return {}
+    method = self._getTypeBasedMethod('getPropertyDictFromContent',
+        fallback_script_id='Document_getPropertyDictFromContent')
+    return method()
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getPropertyDictFromFilename')
+  def getPropertyDictFromFilename(self, filename):
+    """
+    Based on the file name, find out as many properties as needed.
+    returns properties which should be set on the document
+    """
+    return self.portal_contributions.getPropertyDictFromFilename(filename)
+
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getPropertyDictFromFileName')
+  getPropertyDictFromFileName = getPropertyDictFromFilename
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getPropertyDictFromInput')
+  def getPropertyDictFromInput(self, input_parameter_dict):
+    """
+    Pass input_parameter_dict to the type-based method
+    getPropertyDictFromInput and return its result.
+    """
+    method = self._getTypeBasedMethod('getPropertyDictFromInput')
+    return method(input_parameter_dict)
+
+  ### Metadata discovery and ingestion methods
+  security.declareProtected(Permissions.ModifyPortalContent,
+                            'discoverMetadata')
+  def discoverMetadata(self, filename=None, user_login=None,
+                       input_parameter_dict=None):
+    """
+    This is the main metadata discovery function - controls the process
+    of discovering data from various sources. The discovery itself is
+    delegated to scripts or uses preference-configurable regexps. The
+    method returns either self or the document which has been
+    merged in the discovery process.
+
+    filename - this parameter is a file name of the form "AA-BBB-CCC-223-en"
+
+    user_login - this is a login string of a person; can be None if the user is
+                 currently logged in, then we'll get him from session
+    input_parameter_dict - arguments provided to Create this content by user.
+    """
+    # Preference is made of a sequence of 'user_login', 'content', 'filename', 'input'
+    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList')
+    order_list = list(method())
+    order_list.reverse()
+    # build a dictionary according to the order
+    kw = {}
+    for order_id in order_list:
+      result = None
+      if order_id not in VALID_ORDER_KEY_LIST:
+        # Prevent security attack or bad preferences
+        raise AttributeError, "%s is not in valid order key list" % order_id
+      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
+      method = getattr(self, method_id)
+      if order_id == 'filename':
+        if filename is not None:
+          result = method(filename)
+      elif order_id == 'user_login':
+        if user_login is not None:
+          result = method(user_login)
+      elif order_id == 'input':
+        if input_parameter_dict is not None:
+          result = method(input_parameter_dict)
+      else:
+        result = method()
+      if result is not None:
+        for key, value in result.iteritems():
+          if value not in (None, ''):
+            kw[key]=value
+    # Prepare the content edit parameters
+    portal_type = kw.pop('portal_type', None)
+    if portal_type and portal_type != self.getPortalType():
+      # Reingestion is required to update portal_type
+      return self.migratePortalType(portal_type)
+    # Try not to invoke an automatic transition here
+    self._edit(**kw)
+    if not portal_type:
+      # If no portal_type was discovered, pass self
+      # through to portal_contribution_registry
+      # to guess destination portal_type against all properties.
+      # If returned portal_type is different, then reingest.
+      registry = getToolByName(self.getPortalObject(),
+                              'portal_contribution_registry')
+      portal_type = registry.findPortalTypeName(context=self)
+      if portal_type != self.getPortalType():
+        return self.migratePortalType(portal_type)
+    # Finish ingestion by calling method
+    self.finishIngestion() # XXX - is this really the right place ?
+    self.reindexObject() # XXX - is this really the right place ?
+    # Revision merge is tightly coupled
+    # to metadata discovery - refer to the documentation of mergeRevision method
+    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
+    merged_doc.reindexObject() # XXX - is this really the right place ?
+    return merged_doc # XXX - is this really the right place ?
+
+  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
+  def finishIngestion(self):
+    """
+    Finish the ingestion process by calling the appropriate script. This
+    script can for example allocate a reference number automatically if
+    no reference was defined.
+    """
+    method = self._getTypeBasedMethod('finishIngestion',
+                                 fallback_script_id='Document_finishIngestion')
+    return method()
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getContentTypeFromContent')
+  def getContentTypeFromContent(self):
+    """
+    Return content_type read from metadata extraction of content.
+    This method is called by portal_contribution_registry
+    """
+    mime, content = self.convert(None)
+    if not content:
+      return
+    if magic is not None:
+      # This will be delegated soon to external web service
+      # like cloudooo
+      # ERP5 will no longer handle data itself.
+      mimedetector = magic.Magic(mime=True)
+      return mimedetector.from_buffer(content)
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getExtensionFromFilename')
+  def getExtensionFromFilename(self, filename=None):
+    """
+    Return extension read from filename in lower case.
+    """
+    if not filename:
+      filename = self.getStandardFilename()
+    basename, extension = os.path.splitext(filename)
+    if extension:
+      extension = extension[1:].lower() # remove first dot
+    return extension
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getContentInformation')
+  def getContentInformation(self):
+    """
+    Call private implementation, then store the result in conversion
+    cache storage.
+    """
+    format = CONTENT_INFORMATION_FORMAT
+    # How to knows if a instance implement an interface
+    try:
+      mime, cached_value = self.getConversion(format=format)
+      return cached_value
+    except KeyError:
+      value = self._getContentInformation()
+      self.setConversion(value, format=format)
+      return value
+
+  def _getContentInformation(self):
+    """
+    Returns the content information from the HTML conversion.
+    The default implementation tries to build a dictionary
+    from the HTML conversion of the document and extract
+    the document title.
+    """
+    result = {}
+    html = self.asEntireHTML()
+    if not html:
+      return result
+    title_list = re.findall(self.title_parser, str(html))
+    if title_list:
+      result['title'] = title_list[0]
+    return result
diff --git a/product/ERP5/mixin/downloadable.py b/product/ERP5/mixin/downloadable.py
index 0f2277de86..91b01c8023 100644
--- a/product/ERP5/mixin/downloadable.py
+++ b/product/ERP5/mixin/downloadable.py
@@ -31,6 +31,7 @@ from Products.ERP5Type import Permissions
 from Products.ERP5Type.Utils import fill_args_from_request
 from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
     _ViewEmulator
+import warnings
 
 _MARKER = []
 
@@ -108,15 +109,28 @@ class DownloadableMixin:
     return str(data)
 
   security.declareProtected(Permissions.AccessContentsInformation,
-                            'getStandardFileName')
-  def getStandardFileName(self, format=None):
+                            'getStandardFilename')
+  def getStandardFilename(self, format=None):
     """Returns the document coordinates as a standard file name. This
     method is the reverse of getPropertyDictFromFileName.
     """
-    method = self._getTypeBasedMethod('getStandardFileName',
+    method = self._getTypeBasedMethod('getStandardFilename',
+                             fallback_script_id='Document_getStandardFilename')
+    if method is None:
+      # backward compatibility: fall back to the legacy script name
+      method = self._getTypeBasedMethod('getStandardFileName',
                              fallback_script_id='Document_getStandardFileName')
     return method(format=format)
 
+  # backward compatibility
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getStandardFileName')
+  def getStandardFileName(self, format=None):
+    """(deprecated) use getStandardFilename() instead."""
+    warnings.warn('getStandardFileName() is deprecated. '
+                  'use getStandardFilename() instead.')
+    return self.getStandardFilename(format=format)
+
   def manage_FTPget(self):
     """Return body for ftp. and WebDAV
     """
diff --git a/product/ERP5/tests/testBase.py b/product/ERP5/tests/testBase.py
index 7b2c474ed9..3ad11386aa 100644
--- a/product/ERP5/tests/testBase.py
+++ b/product/ERP5/tests/testBase.py
@@ -43,6 +43,7 @@ from zExceptions import BadRequest
 from Products.ERP5Type.tests.backportUnittest import skip
 from Products.ERP5Type.Tool.ClassTool import _aq_reset
 from Products.ERP5Type.Workflow import addWorkflowByType
+from Products.CMFCore.WorkflowCore import WorkflowException
 
 def getDummyTypeBaseMethod(self):
   """ Use a type Base method
@@ -1248,6 +1249,43 @@ class TestBase(ERP5TypeTestCase, ZopeTestCase.Functional):
     self.assertFalse(person.isIndexable)
     self.assertEquals(0, len(self.portal.portal_catalog(uid=person.getUid())))
 
+  def test_metaWorkflowTransition(self):
+    """Test Meta Transition: jump from one state to another without an
+    explicitly defined transition.
+    """
+    module = self.portal.person_module
+    person = module.newContent(portal_type='Person')
+    self.assertEquals(person.getValidationState(), 'draft')
+    self.assertFalse(self.portal.portal_workflow.isTransitionPossible(person,
+                                                                 'invalidate'))
+    # test low-level implementation
+    self.portal.portal_workflow.validation_workflow._executeMetaTransition(
+                                                         person, 'invalidated')
+    self.assertEquals(person.getValidationState(), 'invalidated')
+    validation_history = person.workflow_history['validation_workflow']
+    self.assertEquals(len(validation_history), 2)
+    self.assertEquals(validation_history[-1]['comment'],
+                                      'Jump from \'draft\' to \'invalidated\'')
+    person = module.newContent(portal_type='Person')
+    self.assertEquals(person.getValidationState(), 'draft')
+
+    # test high-level implementation
+    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated')
+    self.assertEquals(person.getValidationState(), 'invalidated')
+
+    person = module.newContent(portal_type='Person')
+    self.assertEquals(person.getValidationState(), 'draft')
+    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated',
+                                               wf_id='validation_workflow')
+    self.assertEquals(person.getValidationState(), 'invalidated')
+    person = module.newContent(portal_type='Person')
+    self.assertEquals(person.getValidationState(), 'draft')
+    self.assertRaises(WorkflowException,
+                      self.portal.portal_workflow._jumpToStateFor,
+                      person, 'invalidated', wf_id='edit_workflow')
+    self.assertEquals(person.getValidationState(), 'draft')
+
+
 class TestERP5PropertyManager(unittest.TestCase):
   """Tests for ERP5PropertyManager.
   """
diff --git a/product/ERP5/tests/testCRM.py b/product/ERP5/tests/testCRM.py
index 27feaee8ea..29a74531e1 100644
--- a/product/ERP5/tests/testCRM.py
+++ b/product/ERP5/tests/testCRM.py
@@ -36,7 +36,7 @@ from Products.CMFCore.WorkflowCore import WorkflowException
 from Products.ERP5Type.tests.utils import DummyMailHost, FileUpload
 from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
                                                        _getConversionServerDict
-from Products.ERP5OOo.tests.testIngestion import FILE_NAME_REGULAR_EXPRESSION
+from Products.ERP5OOo.tests.testIngestion import FILENAME_REGULAR_EXPRESSION
 from Products.ERP5OOo.tests.testIngestion import REFERENCE_REGULAR_EXPRESSION
 from Products.ERP5Type.tests.backportUnittest import expectedFailure
 
@@ -443,7 +443,7 @@ class TestCRMMailIngestion(BaseTestCRM):
       data=self._readTestData(filename)
     return self.portal.portal_contributions.newContent(
                     container_path='event_module',
-                    file_name='postfix_mail.eml',
+                    filename='postfix_mail.eml',
                     data=data)
 
   def test_findTypeByName_MailMessage(self):
@@ -451,7 +451,7 @@ class TestCRMMailIngestion(BaseTestCRM):
     self.assertEquals(
       'Mail Message',
       self.portal.portal_contribution_registry.findPortalTypeName(
-      file_name='postfix_mail.eml', mime_type='message/rfc822', data='Test'
+      filename='postfix_mail.eml', content_type='message/rfc822', data='Test'
       ))
 
   def test_Base_getEntityListFromFromHeader(self):
@@ -767,7 +767,7 @@ class TestCRMMailSend(BaseTestCRM):
     conversion_dict = _getConversionServerDict()
     default_pref.setPreferredOoodocServerAddress(conversion_dict['hostname'])
     default_pref.setPreferredOoodocServerPortNumber(conversion_dict['port'])
-    default_pref.setPreferredDocumentFileNameRegularExpression(FILE_NAME_REGULAR_EXPRESSION)
+    default_pref.setPreferredDocumentFileNameRegularExpression(FILENAME_REGULAR_EXPRESSION)
     default_pref.setPreferredDocumentReferenceRegularExpression(REFERENCE_REGULAR_EXPRESSION)
     if default_pref.getPreferenceState() == 'disabled':
       default_pref.enable()
diff --git a/product/ERP5/tests/testContributionRegistryTool.py b/product/ERP5/tests/testContributionRegistryTool.py
index 92102549c1..351ce22030 100644
--- a/product/ERP5/tests/testContributionRegistryTool.py
+++ b/product/ERP5/tests/testContributionRegistryTool.py
@@ -120,36 +120,36 @@ return predicate.getDestinationPortalType()
     tool = self.portal.portal_contribution_registry
 
     # Test extension matching
-    self.assertEqual(tool.findPortalTypeName(file_name='test.txt'), 'Text')
-    self.assertEqual(tool.findPortalTypeName(file_name='test.odt'), 'Text')
-    self.assertEqual(tool.findPortalTypeName(file_name='001.jpg'), 'Image')
-    self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
-    self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
-    self.assertEqual(tool.findPortalTypeName(file_name='index.html'), 'Web Page')
+    self.assertEqual(tool.findPortalTypeName(filename='test.txt'), 'Text')
+    self.assertEqual(tool.findPortalTypeName(filename='test.odt'), 'Text')
+    self.assertEqual(tool.findPortalTypeName(filename='001.jpg'), 'Image')
+    self.assertEqual(tool.findPortalTypeName(filename='002.png'), 'Image')
+    self.assertEqual(tool.findPortalTypeName(filename='002.PNG'), 'Image')
+    self.assertEqual(tool.findPortalTypeName(filename='index.html'), 'Web Page')
     # Unknown extension
-    self.assertEqual(tool.findPortalTypeName(file_name='index.xxx'), 'File')
+    self.assertEqual(tool.findPortalTypeName(filename='index.xxx'), 'File')
 
     # Test mimetype matching
-    self.assertEqual(tool.findPortalTypeName(mime_type='text/html'), 'Web Page')
+    self.assertEqual(tool.findPortalTypeName(content_type='text/html'), 'Web Page')
 
     # Unknown mimetype
-    self.assertEqual(tool.findPortalTypeName(mime_type='application/octet-stream'), 'File')
+    self.assertEqual(tool.findPortalTypeName(content_type='application/octet-stream'), 'File')
 
     # Test both of extension and mimetype
-    self.assertNotEqual(tool.findPortalTypeName(file_name='message.eml'),
+    self.assertNotEqual(tool.findPortalTypeName(filename='message.eml'),
                         'Mail Message')
-    self.assertNotEqual(tool.findPortalTypeName(mime_type='message/rfc822'),
+    self.assertNotEqual(tool.findPortalTypeName(content_type='message/rfc822'),
                         'Mail Message')
-    self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
-                                             mime_type='message/rfc822'),
+    self.assertEqual(tool.findPortalTypeName(filename='message.eml',
+                                             content_type='message/rfc822'),
                      'Mail Message')
 
     # Test test script
     data = """\
 Subject: Fax
 """
-    self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
-                                             mime_type='message/rfc822',
+    self.assertEqual(tool.findPortalTypeName(filename='message.eml',
+                                             content_type='message/rfc822',
                                              data=data),
                      'Fax Message')
 
diff --git a/product/ERP5/tests/testERP5WebWithDms.py b/product/ERP5/tests/testERP5WebWithDms.py
index 130adf6b2c..a43d23b927 100644
--- a/product/ERP5/tests/testERP5WebWithDms.py
+++ b/product/ERP5/tests/testERP5WebWithDms.py
@@ -37,7 +37,8 @@ from AccessControl.SecurityManagement import newSecurityManager
 from Testing import ZopeTestCase
 from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
      _getConversionServerDict
-from Products.ERP5Type.tests.utils import FileUpload
+from Products.ERP5Type.tests.utils import FileUpload, createZODBPythonScript
+
 
 LANGUAGE_LIST = ('en', 'fr', 'de', 'bg',)
 
@@ -568,8 +569,21 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
 
   def test_PreviewOOoDocumentWithEmbeddedImage(self):
     """Tests html preview of an OOo document with images as extensible content.
+    For this test, Presentation_checkConversionFormatPermission does not allow
+    access to original format for Unauthenticated users.
+    Check that the user can still access other formats.
     """
     portal = self.portal
+    script_id = 'Presentation_checkConversionFormatPermission'
+    python_code = """from AccessControl import getSecurityManager
+user = getSecurityManager().getUser()
+if (not user or not user.getId()) and not format:
+  return False
+return True
+"""
+    createZODBPythonScript(portal.portal_skins.custom, script_id,
+                           'format, **kw', python_code)
+    
     request = portal.REQUEST
     request['PARENTS'] = [self.app]
     self.getPortalObject().aq_parent.acl_users._doAddUser(
@@ -611,7 +625,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
     # then publish the document and access it anonymously by reference through
     # the web site
     document.publish()
-    
+
     transaction.commit()
     self.tic()
 
@@ -620,7 +634,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
     self.assertTrue(response.getHeader('content-type').startswith('text/html'))
     html = response.getBody()
     self.assertTrue('<img' in html, html)
-    
+
     # find the img src
     img_list = etree.HTML(html).findall('.//img')
     self.assertEquals(1, len(img_list))
@@ -633,6 +647,22 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
     png = response.getBody()
     self.assertTrue(png.startswith('\x89PNG'))
 
+    # Now purge the cache and let the Anonymous user convert the document.
+    self.login()
+    document.edit() # Reset cache key
+    transaction.commit()
+    self.tic()
+    response = self.publish('%s/%s/asEntireHTML' % (
+                            website.absolute_url_path(), document_reference))
+    self.assertTrue(response.getHeader('content-type').startswith('text/html'))
+    html = response.getBody()
+    self.assertTrue('<img' in html, html)
+    
+    # find the img src
+    img_list = etree.HTML(html).findall('.//img')
+    self.assertEquals(1, len(img_list))
+    src = img_list[0].get('src')
+
   def test_ImageConversionThroughWebSite(self):
     """Check that conversion parameters pass in url
     are hounoured to display an image in context of a website
diff --git a/product/ERP5/tests/testWebCrawler.py b/product/ERP5/tests/testWebCrawler.py
new file mode 100644
index 0000000000..39c5f370c6
--- /dev/null
+++ b/product/ERP5/tests/testWebCrawler.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Nicolas Delaby <nicolas@erp5.org>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+import unittest
+from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
+     _getConversionServerDict
+
+import transaction
+
+# regular expressions set as filename / reference preferences by the test
+FILENAME_REGULAR_EXPRESSION = "(?P<reference>[A-Z&Ã©@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})"
+REFERENCE_REGULAR_EXPRESSION = "(?P<reference>[A-Z&Ã©@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?"
+
+class TestWebCrawler(ERP5TypeTestCase):
+  """
+    Test Crawling mechanism
+  """
+
+  _path_to_delete_list = []
+  system_pref_id = 'my_preference'
+
+  def getTitle(self):
+    """
+      Return the title of the current test set.
+    """
+    return "ERP5 Live DMS - Web Crawling"
+
+  def getBusinessTemplateList(self):
+    """
+      Return the list of required business templates.
+    """
+    return ('erp5_base',
+            'erp5_ingestion',
+            'erp5_ingestion_mysql_innodb_catalog',
+            'erp5_web',
+            'erp5_dms')
+
+  def afterSetUp(self):
+    """
+      Initialize the ERP5 site.
+    """
+    self.login()
+    self.portal = self.getPortal()
+    self.setSystemPreference()
+    self.bootstrapWebSite()
+    transaction.commit()
+    self.tic()
+
+  def beforeTearDown(self):
+    portal = self.portal
+    module_id_list = [
+      'web_page_module',
+      'web_site_module',
+      'external_source_module',
+      'document_module',
+      ]
+    # delete created documents by test
+    for module_id in module_id_list:
+      module = portal[module_id]
+      module.manage_delObjects(list(module.objectIds()))
+    # Unindex deleted documents
+    transaction.commit()
+    self.tic()
+
+  def setSystemPreference(self):
+    portal_preferences = self.portal.portal_preferences
+    system_preference = portal_preferences._getOb(self.system_pref_id, None)
+    if system_preference is None:
+      system_preference = portal_preferences.newContent(id=self.system_pref_id,
+                                               portal_type='System Preference')
+    conversion_dict = _getConversionServerDict()
+    system_preference.\
+                   setPreferredOoodocServerAddress(conversion_dict['hostname'])
+    system_preference.\
+                    setPreferredOoodocServerPortNumber(conversion_dict['port'])
+    system_preference.setPreferredDocumentFilenameRegularExpression(
+                                                   FILENAME_REGULAR_EXPRESSION)
+    system_preference.setPreferredDocumentReferenceRegularExpression(
+                                                  REFERENCE_REGULAR_EXPRESSION)
+    if system_preference.getPreferenceState() != 'global':
+      system_preference.enable()
+
+
+  def bootstrapWebSite(self):
+    """Create 1 Website
+    live_test_web_site/section1/section1a
+                      /section2
+    create 2 web pages
+      W-REFERENCE.PAGE
+      W-REFERENCE.HOMEPAGE
+
+    The website uses a light version of erp5_web_layout.
+    It only displays sections, subsections
+    and the default Web Page.
+    """
+    web_site_portal_type = 'Web Site'
+    web_section_portal_type = 'Web Section'
+    web_page_portal_type = 'Web Page'
+    web_site_module = self.portal.getDefaultModule(web_site_portal_type)
+    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
+
+    text_content = """<p><a href="W-REFERENCE.PAGE">Page</a></p>"""
+    web_page_id = 'live_test_home'
+    home_page = web_page_module.newContent(portal_type=web_page_portal_type,
+                                          title='Home Page',
+                                          text_content=text_content,
+                                          reference='W-REFERENCE.HOMEPAGE',
+                                          version='001',
+                                          language='en',
+                                          id=web_page_id)
+    home_page.submit()
+    home_page.publish()
+
+    web_site_id = 'live_test_web_site'
+    web_site = web_site_module.newContent(portal_type=web_site_portal_type,
+                      id=web_site_id,
+                      title='Live Test Web Site',
+                      visible=True,
+                      default_page_displayed=True,
+                      site_map_section_parent=True,
+                      authorization_forced=True,
+                      aggregate_value=home_page,
+                      available_language_set=['en'],
+                      container_layout='erp5_web_layout_test',
+                      content_layout='erp5_web_content_layout_test')
+    web_site.publish()
+
+    text_content = """<p>
+    <a href="%s/W-REFERENCE.HOMEPAGE">absolute link to HOME PAGE</a>
+    </p>""" % web_site.absolute_url()
+    section1a_page = web_page_module.newContent(
+                                              portal_type=web_page_portal_type,
+                                              title='Home Page',
+                                              text_content=text_content,
+                                              reference='W-REFERENCE.PAGE',
+                                              version='001',
+                                              language='en')
+    section1a_page.submit()
+    section1a_page.publish()
+    web_section1 = web_site.newContent(portal_type=web_section_portal_type,
+                                      title='Section 1',
+                                      id='section1',
+                                      aggregate_value=section1a_page)
+    web_section2 = web_site.newContent(portal_type=web_section_portal_type,
+                                      title='Section 2',
+                                      id='section2',
+                                      aggregate_value=section1a_page)
+    web_section1a = web_section1.newContent(
+                                          portal_type=web_section_portal_type,
+                                          title='Section 1a',
+                                          id='section 1a', #add a space in id
+                                          aggregate_value=section1a_page)
+
+  def test_01_check_URLTransformations(self):
+    """Check crawlable functionalities regarding URL handling
+
+    getContentBaseURL
+    asNormalisedURL
+    getContentNormalisedURLList
+    """
+    web_page_portal_type = 'Web Page'
+    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
+    web_page = web_page_module.newContent(portal_type=web_page_portal_type)
+    self.assertEquals(web_page.getContentBaseURL(), '')
+    web_page.fromURL('http://www.example.com')
+    self.assertEquals(web_page.getContentBaseURL(), 'http://www.example.com')
+    web_page.fromURL('http://www.example.com/section/sub_section')
+    self.assertEquals(web_page.getContentBaseURL(),
+                      'http://www.example.com/section')
+    text_content = """<html>
+    <head>
+      <base href="http://www.example.com"/>
+    </head>
+    <body>
+      <p><a href="http://www.notexample.com/">External link</a></p>
+      <p><a href="http://www.example.com//I don't care I put what/ I want/">
+          Funny link</a></p>
+      <p><a href="http://www.example.com/section">Internal link</a></p>
+      <p><a href="section2">Relative Internal link</a></p>
+      <p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
+      This link will be discarded</a></p>
+      <img src="my_image_link"/>
+      <script src="should_not_be_followed.js"/>
+      <p><a href="http://http://www.example.com/section">Not a link</a></p>
+    </body>
+    </html>"""
+    web_page.edit(text_content=text_content)
+    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
+    self.assertEquals(web_page.getContentNormalisedURLList(),
+                    ["http://www.example.com/I don't care I put what/ I want/",
+                     'http://www.example.com/section',
+                     'http://www.example.com/section2',])
+    # relative links without base tag
+    text_content = """<html>
+    <head>
+    </head>
+    <body>
+      <p><a href="section2">Relative Internal link</a></p>
+    </body>
+    </html>"""
+    web_page.edit(text_content=text_content)
+    web_page.fromURL('http://www.example.com/#fffff')
+    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
+    self.assertEquals(web_page.getContentNormalisedURLList(),
+                      ['http://www.example.com/section2',])
+    self.assertEquals(web_page.asNormalisedURL(),
+                      'http://www.example.com/#fffff')
+
+  def test_02_crawlWebSite(self):
+    """Call portal_contribution to crawl website hosted by itself.
+    """
+    web_site = self.portal.web_site_module.live_test_web_site
+    external_source_portal_type = 'URL Crawler'
+    web_crawler_module = self.portal.getDefaultModule(
+                                                   external_source_portal_type)
+    web_crawler = web_crawler_module.newContent(
+                                       portal_type=external_source_portal_type,
+                                       crawling_depth=5)
+    web_crawler.fromURL(web_site.absolute_url())
+    transaction.commit()
+    self.tic()
+    web_crawler.crawlContent()
+    transaction.commit()
+    self.tic()
+
+    # 6 = 1 website
+    #     + 3 Web Sections
+    #     + 1 absolute link to home_page
+    #     + 1 relative link from home_page to another web page
+    self.assertEquals(len(web_crawler), 6)
+    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
+                      6)
+    date_before = web_crawler.getModificationDate()
+    web_crawler.crawlContent()
+    transaction.commit()
+    self.tic()
+    # Nothing happens: portal_url_registry prevents crawling
+    # the same URL twice
+    self.assertEquals(len(web_crawler), 6)
+    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
+                      6)
+    # not modified
+    self.assertEquals(date_before, web_crawler.getModificationDate())
+
+    new_web_crawler = web_crawler_module.newContent(
+                                       portal_type=external_source_portal_type,
+                                       crawling_depth=5)
+    new_web_crawler.fromURL(web_site.absolute_url())
+    transaction.commit()
+    self.tic()
+    new_web_crawler.crawlContent()
+    transaction.commit()
+    self.tic()
+    # check that portal_url_registry
+    # blocks contribution of existing content
+    self.assertFalse(len(new_web_crawler))
+
+    # set another namespace on preference
+    preference = self.portal.portal_preferences[self.system_pref_id]
+    preference.setPreferredIngestionNamespace('NEW')
+    transaction.commit()
+    self.tic()
+    new_web_crawler.crawlContent()
+    transaction.commit()
+    self.tic()
+    self.assertEquals(len(web_crawler), 6)
+
+
+def test_suite():
+  suite = unittest.TestSuite()
+  suite.addTest(unittest.makeSuite(TestWebCrawler))
+  return suite
-- 
2.30.9