ZSQLCatalog: Make _reindexObjectList tolerate duplicate documents

When _reindexObjectList receives a document more than once (which may happen when multiple different-tag indexation activities are spawned for the same document), it would emit a meaningless error, saying that document /foo/bar stole the uid of document /foo/bar. Instead, fix duplicate tracking and skip such dulicates. Also, simplify & make _reindexObjectList more robust: - Always check path length. - Allocate uids before looking for duplicates in catalog (it may not be actually needed at this level nowadays). - Always check both uid-to-path and path-to-uid mappings. - Reuse existing mappings to detect duplicates among objects being reindexed, removing the need for assigned_uid_dict. - Avoid computing path more than once, as it's expensive.

ZSQLCatalog: Make _reindexObjectList tolerate duplicate documents
When _reindexObjectList receives a document more than once (which may happen when multiple different-tag indexation activities are spawned for the same document), it would emit a meaningless error, saying that document /foo/bar stole the uid of document /foo/bar. Instead, fix duplicate tracking and skip such dulicates. Also, simplify & make _reindexObjectList more robust: - Always check path length. - Allocate uids before looking for duplicates in catalog (it may not be actually needed at this level nowadays). - Always check both uid-to-path and path-to-uid mappings. - Reuse existing mappings to detect duplicates among objects being reindexed, removing the need for assigned_uid_dict. - Avoid computing path more than once, as it's expensive.
667c91df · Vincent Pelletier · 69aefdff · 667c91df
Commit 667c91df authored Sep 18, 2017 by Vincent Pelletier
Hide whitespace changes
Inline Side-by-side

Showing with 53 additions and 92 deletions

product/ZSQLCatalog/SQLCatalog.py product/ZSQLCatalog/SQLCatalog.py +53 -92

No files found.
--- a/product/ZSQLCatalog/SQLCatalog.py
+++ b/product/ZSQLCatalog/SQLCatalog.py
@@ -1312,108 +1312,69 @@ class Catalog(Folder,
  def _catalogObjectList(self, object_list, method_id_list=None,
                         disable_cache=0, check_uid=1, idxs=None):
    """This is the real method to catalog objects."""
-    LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_list))
    if idxs not in (None, []):
      LOG('ZSLQCatalog.SQLCatalog:catalogObjectList', WARNING,
          'idxs is ignored in this function and is only provided to be compatible with CMFCatalogAware.reindexObject.')
    if not self.getPortalObject().isIndexable():
      return
-    path_uid_dict = {}
+    object_path_dict = {}
-    uid_path_dict = {}
+    uid_list = []
+    uid_list_append = uid_list.append
-    if check_uid:
-      path_list = []
-      path_list_append = path_list.append
-      uid_list = []
-      uid_list_append = uid_list.append
-      for object in object_list:
-        path = object.getPath()
-        if path is not None:
-          path_list_append(path)
-        uid = object.uid
-        if uid is not None:
-          uid_list_append(uid)
-      path_uid_dict = self.getUidDictForPathList(path_list=path_list)
-      uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
-    # This dict will store uids and objects which are verified below.
-    # The purpose is to prevent multiple objects from having the same
-    # uid inside object_list.
-    assigned_uid_dict = {}
    for object in object_list:
+      if object in object_path_dict:
+        continue
+      path = object.getPath()
+      if len(path) > MAX_PATH_LEN:
+        raise  ValueError('path too long (%i): %r' % (len(path), path))
+      object_path_dict[object] = path
      try:
        uid = aq_base(object).uid
      except AttributeError:
        uid = None
      if uid is None or uid == 0:
-        try:
+        object.uid = uid = self.newUid()
-          object.uid = self.newUid()
+      uid_list_append(uid)
-        except ConflictError:
+    LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_path_dict))
-          raise
+    if check_uid:
-        except:
+      path_uid_dict = self.getUidDictForPathList(path_list=object_path_dict.values())
-          raise RuntimeError, 'could not set missing uid for %r' % (object,)
+      uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
-      elif check_uid:
+      for object, path in object_path_dict.iteritems():
-        if uid in assigned_uid_dict:
+        uid = object.uid
-            error_message = 'uid of %r is %r and ' \
+        if path_uid_dict.setdefault(path, uid) != uid:
-                  'is already assigned to %s in catalog !!! This can be fatal.' % \
+          error_message = 'path %r has uids %r (catalog) and %r (being indexed) ! This can break relations' % (
-                  (object, uid, assigned_uid_dict[uid])
+            path,
-            if not self.sql_catalog_raise_error_on_uid_check:
+            path_uid_dict[path],
-                LOG("SQLCatalog.catalogObjectList", ERROR, error_message)
+            uid,
-            else:
+          )
-                raise ValueError(error_message)
+          if self.sql_catalog_raise_error_on_uid_check:
+            raise ValueError(error_message)
-        path = object.getPath()
+          LOG("SQLCatalog._catalogObjectList", ERROR, error_message)
-        index = path_uid_dict.get(path)
+        catalog_path = uid_path_dict.setdefault(uid, path)
-        if index is not None:
+        if catalog_path != path and catalog_path != 'deleted':
-          if index < 0:
+          # Two possible cases if catalog_path == 'deleted':
-            raise CatalogError, 'A negative uid %d is used for %s. Your catalog is broken. Recreate your catalog.' % (index, path)
+          # - Reindexed object's path changed (ie, it or at least one of its
-          if uid != index or isinstance(uid, int):
+          #   parents was renamed) but unindexObject was not called yet.
-            error_message = 'uid of %r changed from %r (property) to %r '\
+          #   Reindexing is harmelss: unindexObject and then an
-	                    '(catalog, by path) !!! This can be fatal' % (object, uid, index)
+          #   immediateReindexObject will be called.
-            if not self.sql_catalog_raise_error_on_uid_check:
+          # - Reindexed object was deleted by a concurrent transaction, which
-              LOG("SQLCatalog.catalogObjectList", ERROR, error_message)
+          #   committed after we got our ZODB snapshot of this object.
-            else:
+          #   Reindexing is harmless: unindexObject will be called, and
-              raise ValueError(error_message)
+          #   cannot be executed in parallel thanks to activity's
-        else:
+          #   serialisation_tag (so we cannot end up with a fantom object in
-          if uid in uid_path_dict:
+          #   catalog).
-            catalog_path = uid_path_dict.get(uid)
+          # So we index object.
-          else:
+          # We could also not index it to save the time needed to index, but
-            catalog_path = self.getPathForUid(uid)
+          # this would slow down all regular case to slightly improve an
-          if catalog_path == 'deleted':
+          # exceptional case.
-            # Two possible cases:
+          error_message = 'uid %r is shared between %r (catalog) and %r (being indexed) ! This can break relations' % (
-            # - Reindexed object's path changed (ie, it or at least one of its
+            uid,
-            #   parents was renamed) but unindexObject was not called yet.
+            uid_path_dict[uid],
-            #   Reindexing is harmelss: unindexObject and then an
+            path,
-            #   immediateReindexObject will be called.
+          )
-            # - Reindexed object was deleted by a concurrent transaction, which
+          if self.sql_catalog_raise_error_on_uid_check:
-            #   committed after we got our ZODB snapshot of this object.
+            raise ValueError(error_message)
-            #   Reindexing is harmless: unindexObject will be called, and
+          LOG('SQLCatalog._catalogObjectList', ERROR, error_message)
-            #   cannot be executed in parallel thanks to activity's
-            #   serialisation_tag (so we cannot end up with a fantom object in
-            #   catalog).
-            # So we index object.
-            # We could also not index it to save the time needed to index, but
-            # this would slow down all regular case to slightly improve an
-            # exceptional case.
-            pass
-          elif catalog_path is not None:
-            # An uid conflict happened... Why?
-            # can be due to path length
-            if len(path) > MAX_PATH_LEN:
-              LOG('SQLCatalog', ERROR, 'path of object %r is too long for catalog. You should use a shorter path.' %(object,))
-            error_message = 'uid of %r is %r and ' \
-                            'is already assigned to %s in catalog !!! This can be fatal.' \
-                            % (object, uid, catalog_path)
-            if not self.sql_catalog_raise_error_on_uid_check:
-                LOG('SQLCatalog', ERROR, error_message)
-            else:
-                raise ValueError(error_message)
-            uid = object.uid
-        assigned_uid_dict[uid] = object
    if method_id_list is None:
      method_id_list = self.getSqlCatalogObjectListList()
@@ -1449,7 +1410,7 @@ class Catalog(Folder,
        except KeyError:
          pass
        if expression is None:
-          catalogged_object_list = object_list
+          catalogged_object_list = object_path_dict.keys()
        else:
          text = expression.text
          catalogged_object_list = catalogged_object_list_cache.get(text)
@@ -1463,7 +1424,7 @@ class Catalog(Folder,
                            % (method_name, text), DeprecationWarning)
            expression_cache_key_list = filter.get('expression_cache_key', ())
            expression_result_cache = {}
-            for object in object_list:
+            for object in object_path_dict:
              if expression_cache_key_list:
                # Expressions are slow to evaluate because they are executed
                # in restricted environment. So we try to save results of