Commit 02287fad authored by Jean-Paul Smets

Various fixes for portal type discovery. The crawler has some methods commented out to prevent excessive ZODB usage (until this is optimised).

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@13635 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4f9101c1
@@ -29,6 +29,8 @@
 import cStringIO
 import re
 import string
+import socket
+import md5
 import urllib2, urllib
 from AccessControl import ClassSecurityInfo, getSecurityManager
@@ -42,7 +44,18 @@ from zLOG import LOG
 from DateTime import DateTime
 from Acquisition import aq_base
+# Install openers
+import ContributionOpener
+opener = urllib2.build_opener(ContributionOpener.DirectoryFileHandler)
+urllib2.install_opener(opener)
+# A temporary hack until urllib2 supports timeout setting - XXX
+import socket
+socket.setdefaulttimeout(60) # 1 minute timeout
+# Global parameters
 TEMP_NEW_OBJECT_KEY = '_v_new_object'
+MAX_REPEAT = 10
 _marker = [] # Create a new marker object.
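The socket.setdefaulttimeout(60) call added above works around the lack of a per-request timeout in this version of urllib2 by changing the process-wide socket default, which affects every other socket opened afterwards in the same Zope process. A minimal sketch, not part of this commit, of how the same workaround could be scoped to a single fetch:

    import socket
    import urllib2

    def fetch_with_timeout(url, timeout=60):
      # urllib2 offers no timeout argument here, so temporarily override the
      # process-wide socket default and restore it once the download is done.
      previous = socket.getdefaulttimeout()
      socket.setdefaulttimeout(timeout)
      try:
        return urllib2.urlopen(url).read()
      finally:
        socket.setdefaulttimeout(previous)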
@@ -111,47 +124,53 @@ class ContributionTool(BaseTool):
     # types share the same constructor. However, if Memo has
     # same constructor as Text and Memo is not in content_type_registry
     # then it should be considered.
-    valid_portal_type_list = []
+    extra_valid_portal_type_list = []
     content_registry_type_dict = getContentTypeRegistryTypeDict()
     portal_type_tool = self.portal_types
     for pt in portal_type_tool.objectValues():
       if hasattr(pt, 'factory') and pt.factory == portal_type_tool[document.getPortalType()].factory:
         if not content_registry_type_dict.has_key(pt.id):
-          valid_portal_type_list.append(pt.id)
+          extra_valid_portal_type_list.append(pt.id)
+    if not extra_valid_portal_type_list:
+      # There is really no ambiguity here
+      # The portal_type set by PUT_factory is appropriate
+      # This is the best case we can get
+      # LOG('findTypeName no ambiguity', 0, document.portal_type)
+      return document.portal_type
+    valid_portal_type_list = [document.portal_type] + extra_valid_portal_type_list
     # Check if the filename tells which portal_type this is
     portal_type_list = self.getPropertyDictFromFileName(file_name).get('portal_type', [])
+    if isinstance(portal_type_list, str): portal_type_list = [portal_type_list]
+    portal_type_list = filter(lambda x: x in valid_portal_type_list, portal_type_list)
+    if not portal_type_list:
+      portal_type_list = valid_portal_type_list
     if len(portal_type_list) == 1:
       # if we have only one, then this is it
+      # LOG('findTypeName single portal_type_list', 0, portal_type_list[0])
       return portal_type_list[0]
     # If it is still None, we need to read the document
     # to check which of the candidates is suitable
-    if portal_type is None:
-      # The document is now responsible of telling all its properties
-      portal_type = document.getPropertyDictFromContent().get('portal_type', None)
-    if portal_type is not None:
-      # we check if it matches the candidate list, if there were any
-      if len(portal_type_list)>1 and portal_type not in portal_type_list:
-        raise TypeError('%s not in the list of %s' % (portal_type, str(portal_type_list)))
-      return portal_type
-    else:
-      # if not found but the candidate list is there, return the first
-      if len(portal_type_list)>0:
-        return portal_type_list[0]
-    if portal_type is None:
-      # We can not do anything anymore
-      #return document.portal_type # XXX Wrong or maybe right ?
-      return None
-    if portal_type not in valid_portal_type_list:
-      # We will not be able to migrate ob to portal_type
-      #return ob.portal_type
-      return None
-    return portal_type
+    # Let us give a chance to getPropertyDictFromContent to
+    # tell us what is the portal type of this document
+    content_portal_type_list = document.getPropertyDictFromContent().get('portal_type', None)
+    if content_portal_type_list:
+      if isinstance(portal_type, str):
+        content_portal_type_list = [content_portal_type_list]
+      # Filter valid candidates
+      content_portal_type_list = filter(lambda x: x in portal_type_list, content_portal_type_list)
+      if content_portal_type_list:
+        # if we have more than one, then return the first one
+        # LOG('findTypeName from content', 0, content_portal_type_list[0])
+        return content_portal_type_list[0]
+    # If portal_type_list is not empty, return the first one
+    # LOG('findTypeName from first portal_type_list', 0, portal_type_list[0])
+    return portal_type_list[0]
   security.declareProtected(Permissions.AddPortalContent, 'newContent')
   def newContent(self, id=None, portal_type=None, url=None, container=None,
                  container_path=None,
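In prose, the reworked findTypeName logic above resolves the portal type in a fixed order: if no other portal type shares the PUT_factory constructor, keep the type PUT_factory chose; otherwise narrow the candidate list with hints from the file name, then with hints from the document content, and finally fall back to the first remaining candidate. A condensed sketch of that selection order, with invented names and plain lists standing in for the portal_types and content_type_registry lookups:

    def pick_portal_type(put_factory_type, extra_candidates,
                         filename_types, content_types):
      # extra_candidates: portal types sharing the same constructor but not
      # registered in content_type_registry (illustrative inputs only)
      if not extra_candidates:
        return put_factory_type                  # no ambiguity at all
      candidates = [put_factory_type] + extra_candidates
      from_filename = [t for t in filename_types if t in candidates]
      candidates = from_filename or candidates
      if len(candidates) == 1:
        return candidates[0]                     # the file name resolved it
      from_content = [t for t in content_types if t in candidates]
      if from_content:
        return from_content[0]                   # the content resolved it
      return candidates[0]                       # fall back to the first candidate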
@@ -209,28 +228,37 @@ class ContributionTool(BaseTool):
       del kw['file_name']
     else:
       # build a new file from the url
-      data = urllib2.urlopen(url).read()
+      url_file = urllib2.urlopen(url)
+      data = url_file.read() # time out must be set or ... too long XXX
       file = cStringIO.StringIO()
       file.write(data)
       file.seek(0)
+      # Create a file name based on the URL and quote it
       file_name = url.split('/')[-1] or url.split('/')[-2]
-      file_name = self._encodeURL(file_name)
-      if hasattr(file, 'headers'):
-        headers = file.headers
+      file_name = urllib.quote(file_name, safe='')
+      file_name = file_name.replace('%', '')
+      # For URLs, we want an id by default equal to the encoded URL
+      if id is None: id = self._encodeURL(url)
+      if hasattr(url_file, 'headers'):
+        headers = url_file.headers
         if hasattr(headers, 'type'):
           mime_type = headers.type
       kw['file'] = file
     # If the portal_type was provided, we can go faster
-    if portal_type is not None and portal_type != '':
-      # We know the portal_type, let us find the module
-      module = self.getDefaultModule(portal_type)
+    if portal_type and container is None:
+      # We know the portal_type, let us find the default module
+      # and use it as container
+      container = self.getDefaultModule(portal_type)
-      # And return a document
+    if portal_type and container is not None:
+      # We could simplify things here and return a document immediately
       # NOTE: we use the module ID generator rather than the provided ID
-      document = module.newContent(portal_type=portal_type, **kw)
-      if discover_metadata: document.discoverMetadata(file_name=file_name, user_login=user_login)
-      return document
+      #document = module.newContent(portal_type=portal_type, **kw)
+      #if discover_metadata:
+      # document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
+      #return document
+      pass # XXX - This needs to be implemented once the rest is stable
     # From here, there is no hope unless a file was provided
     if file is None:
@@ -239,6 +267,7 @@ class ContributionTool(BaseTool):
     # So we will simulate WebDAV to get an empty object
     # with PUT_factory - we provide the mime_type as
     # parameter
+    # LOG('new content', 0, "%s -- %s" % (file_name, mime_type))
     ob = self.PUT_factory(file_name, mime_type, None)
     # Raise an error if we could not guess the portal type
@@ -250,7 +279,6 @@ class ContributionTool(BaseTool):
     document = BaseTool._getOb(self, file_name)
     # Then edit the document contents (so that upload can happen)
-    kw.setdefault('source_reference', file_name) # XXX redundant with discoverMetadata
     document._edit(**kw)
     if url: document.fromURL(url)
@@ -260,15 +288,15 @@ class ContributionTool(BaseTool):
     # Move the document to where it belongs
     if container_path is not None:
       container = self.getPortalObject().restrictedTraverse(container_path)
-    document = self._setObject(file_name, ob, user_login=user_login, container=container, id=id)
+    document = self._setObject(file_name, ob, user_login=user_login,
+                               container=container, id=id, discover_metadata=discover_metadata)
     document = self._getOb(file_name) # Call _getOb to purge cache
     # Notify workflows
-    document.notifyWorkflowCreated()
+    #document.notifyWorkflowCreated()
     # Reindex it and return the document
     document.reindexObject()
-    if document.getCrawlingDepth() > 0: document.activate().crawlContent()
     return document
   security.declareProtected( Permissions.AddPortalContent, 'newXML' )
@@ -315,7 +343,7 @@ class ContributionTool(BaseTool):
     return property_dict
   # WebDAV virtual folder support
-  def _setObject(self, name, ob, user_login=None, container=None, id=None):
+  def _setObject(self, name, ob, user_login=None, container=None, id=None, discover_metadata=1):
     """
       The strategy is to let NullResource.PUT do everything as
       usual and at the last minute put the object in a different
@@ -368,11 +396,27 @@ class ContributionTool(BaseTool):
     else:
       new_id = id
     ob.id = new_id
-    module._setObject(new_id, ob)
-    # We can now discover metadata
-    document = module[new_id]
-    document.discoverMetadata(file_name=name, user_login=user_login)
+    existing_document = module.get(new_id, None)
+    if existing_document is None:
+      # There is no preexisting document - we can therefore
+      # set the new object
+      module._setObject(new_id, ob)
+      # We can now discover metadata
+      document = module[new_id]
+      if discover_metadata:
+        # Metadata disovery is done as an activity by default
+        # If we need to discoverMetadata synchronously, it must
+        # be for user interface and should thus be handled by
+        # ZODB scripts
+        document.activate().discoverMetadata(file_name=name, user_login=user_login)
+    else:
+      document = existing_document
+      if document.isExternalDocument():
+        # If this is an external document, update its content
+        document.activate().updateContentFromURL()
+      else:
+        # This is where we may have to implement revision support
+        raise NotImplementedError
     # Keep the document close to us - this is only useful for
     # file upload from webdav
@@ -465,13 +509,30 @@ class ContributionTool(BaseTool):
       we must anyway insert objects in btrees and this
       is simimar in cost to accessing them.
     """
+    # Produce an MD5 from the URL
+    hex_md5 = md5.md5(url).hexdigest()
+    # Take the first part in the URL which is not empty
+    # LOG("_encodeURL", 0, url)
+    url_segment = url.split(':')[1]
+    url_segment_list = url_segment.split('/')
+    url_domain = None
+    for url_part in url_segment_list:
+      if url_part:
+        url_domain = url_part
+        break
+    # Return encoded url
+    if url_domain:
+      url_domain = urllib.quote(url_domain, safe='')
+      url_domain = url_domain.replace('%', '')
+      return "%s-%s" % (url_domain, hex_md5)
+    return hex_md5
     url = urllib.quote(url, safe='')
     url = url.replace('_', '__')
     url = url.replace('%', '_')
     return url
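The added _encodeURL body above replaces the old quote-and-escape scheme with a compact id built from the first non-empty segment after the protocol (normally the host name) plus an MD5 of the full URL. A standalone rewrite of the same steps, for illustration only:

    import md5
    import urllib

    def encode_url(url):
      # e.g. 'http://www.example.com/page' -> 'www.example.com-<32 hex digits>'
      hex_md5 = md5.md5(url).hexdigest()
      url_segment = url.split(':')[1]            # '//www.example.com/page'
      url_domain = None
      for part in url_segment.split('/'):
        if part:
          url_domain = part                      # first non-empty segment
          break
      if url_domain:
        url_domain = urllib.quote(url_domain, safe='').replace('%', '')
        return "%s-%s" % (url_domain, hex_md5)
      return hex_md5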
   security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
-  def crawlContent(self, content):
+  def crawlContent(self, content, container=None):
     """
       Analyses content and download linked pages
@@ -485,9 +546,11 @@ class ContributionTool(BaseTool):
     base_url = content.getContentBaseURL()
     url_list = map(lambda url: self._normaliseURL(url, base_url), set(content.getContentURLList()))
     for url in set(url_list):
+      # LOG('trying to crawl', 0, url)
       # Some url protocols should not be crawled
       if url.split(':')[0] in no_crawl_protocol_list:
         continue
-      #if content.getParentValue()
-      # in place of not ?
-      container = content.getParentValue()
+      if container is None:
+        #if content.getParentValue()
+        # in place of not ?
+        container = content.getParentValue()
@@ -499,6 +562,7 @@ class ContributionTool(BaseTool):
         # XXX - This call is not working due to missing group_method_id
         # therefore, multiple call happen in parallel and eventually fail
         # (the same URL is created multiple times)
+        # LOG('activate newContentFromURL', 0, url)
         self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
                                                              id=id, url=url, crawling_depth=depth - 1)
       else:
@@ -506,28 +570,35 @@ class ContributionTool(BaseTool):
         new_depth = max(depth - 1, document.getCrawlingDepth())
         document._setCrawlingDepth(new_depth)
         # And activate updateContentFromURL on existing document
-        next_date = document.getNextAlarmDate()
-        document.activate(at_date=next_date).updateContentFromURL()
+        next_date = document.getNextAlarmDate() # This should prevent doing the update too often
+        # LOG('activate updateContentFromURL', 0, url)
+        document.activate(at_date=next_date).updateContentFromURL(crawling_depth=depth - 1)
   security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
-  def updateContentFromURL(self, content):
+  def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
     """
       Updates an existing content.
     """
+    # Step 0: update crawling_depth if required
+    if crawling_depth > content.getCrawlingDepth():
+      content._setCrawlingDepth(crawling_depth)
     # Step 1: download new content
-    url = content.asURL()
-    data = urllib2.urlopen(url).read()
-    file = cStringIO.StringIO()
-    file.write(data)
-    file.seek(0)
+    try:
+      url = content.asURL()
+      data = urllib2.urlopen(url).read()
+      file = cStringIO.StringIO()
+      file.write(data)
+      file.seek(0)
+    except socket.error, msg: # repeat multiple times in case of socket error
+      content.updateContentFromURL(repeat=repeat - 1)
     # Step 2: compare and update if necessary (md5)
     # do here some md5 stuff to compare contents...
     if 1:
-      content._edit(file=file)
+      # content._edit(file=file) # Commented for testing
       # Step 3: convert to base format
-      content.convertToBaseFormat()
+      # content.convertToBaseFormat() # Commented for testing
       # Step 4: activate populate (unless interaction workflow does it)
-      content.activate().populateContent()
+      # content.activate().populateContent() # Commented for testing
      # Step 5: activate crawlContent
      content.activate().crawlContent()
    else:
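The new except clause above retries a failed download by calling updateContentFromURL again with repeat - 1, so a transient network error does not stop the crawl. A minimal loop-based sketch of the same bounded-retry idea, not taken from the commit:

    import socket
    import urllib2

    def download_with_retries(url, repeat=10):
      # Mirrors MAX_REPEAT: try the download up to `repeat` times, keeping the
      # last socket error and re-raising it once the budget is exhausted.
      last_error = None
      for attempt in range(max(repeat, 1)):
        try:
          return urllib2.urlopen(url).read()
        except socket.error, error:
          last_error = error
      raise last_error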
@@ -539,7 +610,7 @@ class ContributionTool(BaseTool):
       content.activate(at_date=next_date).updateContentFromURL()
   security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
-  def newContentFromURL(self, **kw):
+  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, **kw):
     """
       A wrapper method for newContent which provides extra safety
       in case or errors (ie. download, access, conflict, etc.).
@@ -550,6 +621,33 @@ class ContributionTool(BaseTool):
       NOTE: implementation needs to be done.
     """
-    return self.newContent(**kw)
+    # First of all, make sure do not try to create an existing document
+    if container_path is not None and id is not None:
+      container = self.restrictedTraverse(container_path)
+      document = container.get(id, None)
+      if document is not None:
+        # Document aleardy exists: no need to keep on crawling
+        return
+    try:
+      document = self.newContent(container_path=container_path, id=id, **kw)
+      if document.getCrawlingDepth() > 0: document.activate().crawlContent()
+      document.activate(at_date=document.getNextAlarmDate()).updateContentFromURL()
+    except urllib2.HTTPError, error:
+      # Catch any HTTP error
+      self.activate(at_date=DateTime() + 1).newContentFromURL(
+                        container_path=container_path, id=id,
+                        repeat=repeat - 1, **kw)
+    except urllib2.URLError, error:
+      if error.reason.args[0] == -3:
+        # Temporary failure in name resolution - try again in 1 day
+        self.activate(at_date=DateTime() + 1).newContentFromURL(
+                          container_path=container_path, id=id,
+                          repeat=repeat - 1, **kw)
+      else:
+        # Unknown errror - to be extended
+        raise
+    except:
+      # Pass exception to Zope (ex. conflict errors)
+      raise
 InitializeClass(ContributionTool)
\ No newline at end of file
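The error handling added to newContentFromURL defers failures instead of aborting the crawl: HTTP errors and temporary DNS failures (reason code -3 corresponds to socket.EAI_AGAIN on Linux, "Temporary failure in name resolution") re-activate the same call one day later with a decremented repeat budget. A sketch of that reschedule idiom, with an explicit stop once the budget runs out added here only for illustration:

    from DateTime import DateTime

    def reschedule_crawl(tool, container_path, id, repeat, **kw):
      # Re-run newContentFromURL through CMFActivity one day from now,
      # spending one unit of the retry budget; give up once it is exhausted.
      if repeat <= 0:
        return
      tool.activate(at_date=DateTime() + 1).newContentFromURL(
                        container_path=container_path, id=id,
                        repeat=repeat - 1, **kw)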