Commit 667c91df authored by Vincent Pelletier's avatar Vincent Pelletier

ZSQLCatalog: Make _reindexObjectList tolerate duplicate documents

When _reindexObjectList receives a document more than once (which may
happen when multiple different-tag indexation activities are spawned for
the same document), it would emit a meaningless error, saying that
document /foo/bar stole the uid of document /foo/bar.
Instead, fix duplicate tracking and skip such dulicates.

Also, simplify & make _reindexObjectList more robust:
- Always check path length.
- Allocate uids before looking for duplicates in catalog (it may not be
  actually needed at this level nowadays).
- Always check both uid-to-path and path-to-uid mappings.
- Reuse existing mappings to detect duplicates among objects being reindexed,
  removing the need for assigned_uid_dict.
- Avoid computing path more than once, as it's expensive.
parent 69aefdff
...@@ -1312,108 +1312,69 @@ class Catalog(Folder, ...@@ -1312,108 +1312,69 @@ class Catalog(Folder,
def _catalogObjectList(self, object_list, method_id_list=None, def _catalogObjectList(self, object_list, method_id_list=None,
disable_cache=0, check_uid=1, idxs=None): disable_cache=0, check_uid=1, idxs=None):
"""This is the real method to catalog objects.""" """This is the real method to catalog objects."""
LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_list))
if idxs not in (None, []): if idxs not in (None, []):
LOG('ZSLQCatalog.SQLCatalog:catalogObjectList', WARNING, LOG('ZSLQCatalog.SQLCatalog:catalogObjectList', WARNING,
'idxs is ignored in this function and is only provided to be compatible with CMFCatalogAware.reindexObject.') 'idxs is ignored in this function and is only provided to be compatible with CMFCatalogAware.reindexObject.')
if not self.getPortalObject().isIndexable(): if not self.getPortalObject().isIndexable():
return return
path_uid_dict = {} object_path_dict = {}
uid_path_dict = {} uid_list = []
uid_list_append = uid_list.append
if check_uid:
path_list = []
path_list_append = path_list.append
uid_list = []
uid_list_append = uid_list.append
for object in object_list:
path = object.getPath()
if path is not None:
path_list_append(path)
uid = object.uid
if uid is not None:
uid_list_append(uid)
path_uid_dict = self.getUidDictForPathList(path_list=path_list)
uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
# This dict will store uids and objects which are verified below.
# The purpose is to prevent multiple objects from having the same
# uid inside object_list.
assigned_uid_dict = {}
for object in object_list: for object in object_list:
if object in object_path_dict:
continue
path = object.getPath()
if len(path) > MAX_PATH_LEN:
raise ValueError('path too long (%i): %r' % (len(path), path))
object_path_dict[object] = path
try: try:
uid = aq_base(object).uid uid = aq_base(object).uid
except AttributeError: except AttributeError:
uid = None uid = None
if uid is None or uid == 0: if uid is None or uid == 0:
try: object.uid = uid = self.newUid()
object.uid = self.newUid() uid_list_append(uid)
except ConflictError: LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_path_dict))
raise if check_uid:
except: path_uid_dict = self.getUidDictForPathList(path_list=object_path_dict.values())
raise RuntimeError, 'could not set missing uid for %r' % (object,) uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
elif check_uid: for object, path in object_path_dict.iteritems():
if uid in assigned_uid_dict: uid = object.uid
error_message = 'uid of %r is %r and ' \ if path_uid_dict.setdefault(path, uid) != uid:
'is already assigned to %s in catalog !!! This can be fatal.' % \ error_message = 'path %r has uids %r (catalog) and %r (being indexed) ! This can break relations' % (
(object, uid, assigned_uid_dict[uid]) path,
if not self.sql_catalog_raise_error_on_uid_check: path_uid_dict[path],
LOG("SQLCatalog.catalogObjectList", ERROR, error_message) uid,
else: )
raise ValueError(error_message) if self.sql_catalog_raise_error_on_uid_check:
raise ValueError(error_message)
path = object.getPath() LOG("SQLCatalog._catalogObjectList", ERROR, error_message)
index = path_uid_dict.get(path) catalog_path = uid_path_dict.setdefault(uid, path)
if index is not None: if catalog_path != path and catalog_path != 'deleted':
if index < 0: # Two possible cases if catalog_path == 'deleted':
raise CatalogError, 'A negative uid %d is used for %s. Your catalog is broken. Recreate your catalog.' % (index, path) # - Reindexed object's path changed (ie, it or at least one of its
if uid != index or isinstance(uid, int): # parents was renamed) but unindexObject was not called yet.
error_message = 'uid of %r changed from %r (property) to %r '\ # Reindexing is harmelss: unindexObject and then an
'(catalog, by path) !!! This can be fatal' % (object, uid, index) # immediateReindexObject will be called.
if not self.sql_catalog_raise_error_on_uid_check: # - Reindexed object was deleted by a concurrent transaction, which
LOG("SQLCatalog.catalogObjectList", ERROR, error_message) # committed after we got our ZODB snapshot of this object.
else: # Reindexing is harmless: unindexObject will be called, and
raise ValueError(error_message) # cannot be executed in parallel thanks to activity's
else: # serialisation_tag (so we cannot end up with a fantom object in
if uid in uid_path_dict: # catalog).
catalog_path = uid_path_dict.get(uid) # So we index object.
else: # We could also not index it to save the time needed to index, but
catalog_path = self.getPathForUid(uid) # this would slow down all regular case to slightly improve an
if catalog_path == 'deleted': # exceptional case.
# Two possible cases: error_message = 'uid %r is shared between %r (catalog) and %r (being indexed) ! This can break relations' % (
# - Reindexed object's path changed (ie, it or at least one of its uid,
# parents was renamed) but unindexObject was not called yet. uid_path_dict[uid],
# Reindexing is harmelss: unindexObject and then an path,
# immediateReindexObject will be called. )
# - Reindexed object was deleted by a concurrent transaction, which if self.sql_catalog_raise_error_on_uid_check:
# committed after we got our ZODB snapshot of this object. raise ValueError(error_message)
# Reindexing is harmless: unindexObject will be called, and LOG('SQLCatalog._catalogObjectList', ERROR, error_message)
# cannot be executed in parallel thanks to activity's
# serialisation_tag (so we cannot end up with a fantom object in
# catalog).
# So we index object.
# We could also not index it to save the time needed to index, but
# this would slow down all regular case to slightly improve an
# exceptional case.
pass
elif catalog_path is not None:
# An uid conflict happened... Why?
# can be due to path length
if len(path) > MAX_PATH_LEN:
LOG('SQLCatalog', ERROR, 'path of object %r is too long for catalog. You should use a shorter path.' %(object,))
error_message = 'uid of %r is %r and ' \
'is already assigned to %s in catalog !!! This can be fatal.' \
% (object, uid, catalog_path)
if not self.sql_catalog_raise_error_on_uid_check:
LOG('SQLCatalog', ERROR, error_message)
else:
raise ValueError(error_message)
uid = object.uid
assigned_uid_dict[uid] = object
if method_id_list is None: if method_id_list is None:
method_id_list = self.getSqlCatalogObjectListList() method_id_list = self.getSqlCatalogObjectListList()
...@@ -1449,7 +1410,7 @@ class Catalog(Folder, ...@@ -1449,7 +1410,7 @@ class Catalog(Folder,
except KeyError: except KeyError:
pass pass
if expression is None: if expression is None:
catalogged_object_list = object_list catalogged_object_list = object_path_dict.keys()
else: else:
text = expression.text text = expression.text
catalogged_object_list = catalogged_object_list_cache.get(text) catalogged_object_list = catalogged_object_list_cache.get(text)
...@@ -1463,7 +1424,7 @@ class Catalog(Folder, ...@@ -1463,7 +1424,7 @@ class Catalog(Folder,
% (method_name, text), DeprecationWarning) % (method_name, text), DeprecationWarning)
expression_cache_key_list = filter.get('expression_cache_key', ()) expression_cache_key_list = filter.get('expression_cache_key', ())
expression_result_cache = {} expression_result_cache = {}
for object in object_list: for object in object_path_dict:
if expression_cache_key_list: if expression_cache_key_list:
# Expressions are slow to evaluate because they are executed # Expressions are slow to evaluate because they are executed
# in restricted environment. So we try to save results of # in restricted environment. So we try to save results of
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment