Fix poor performance: implement a dummy cache of relevant_paragraph_list

Refactor getParagraphItem so that try/except wraps only the code that needs it. Added a few log entries for errors. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils@41666 20353a03-c40f-0410-a6d1-a30d3c3de9de

Fix poor performance: implement a dummy cache of relevant_paragraph_list
Refactor getParagraphItem so that try/except wraps only the code that needs it. Added a few log entries for errors. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils@41666 20353a03-c40f-0410-a6d1-a30d3c3de9de
c9dba102 · Rafael Monnerat · d17e1a73 · c9dba102
Commit c9dba102 authored Dec 22, 2010 by Rafael Monnerat
Hide whitespace changes
Inline Side-by-side

Showing with 25 additions and 12 deletions

cloudooo/granulate/oogranulate.py cloudooo/granulate/oogranulate.py +25 -12

No files found.
--- a/cloudooo/granulate/oogranulate.py
+++ b/cloudooo/granulate/oogranulate.py
@@ -53,8 +53,6 @@ def getTemplatePath(format):
  """
  return path.join(path.dirname(__file__), 'template.%s' % format)
 class OOGranulate(object):
  """Granulate an OpenOffice document into tables, images, chapters and
  paragraphs."""
@@ -147,32 +145,47 @@ class OOGranulate(object):
    path = 'Pictures/%s' % id
    return self.document.getFile(path)
+  def _getRelevantParagraphList(self):
+    """ This should use memcache or another cache infrastructure.
+    """
+    RELEVANT_PARAGRAPH_CACHE = getattr(self, "RELEVANT_PARAGRAPH_CACHE", None)
+    if RELEVANT_PARAGRAPH_CACHE is None:
+      relevant_paragraph_list = self.document.parsed_content.xpath(
+                                 RELEVANT_PARAGRAPH_XPATH_QUERY,
+                                 namespaces=self.document.parsed_content.nsmap)
+      setattr(self, "RELEVANT_PARAGRAPH_CACHE", relevant_paragraph_list)
+    return self.RELEVANT_PARAGRAPH_CACHE
  def getParagraphItemList(self):
    """Returns the list of paragraphs in the form of (id, class) where class
    may have special meaning to define TOC/TOI."""
-    relevant_paragraph_list = self.document.parsed_content.xpath(
-                                RELEVANT_PARAGRAPH_XPATH_QUERY,
-                                namespaces=self.document.parsed_content.nsmap)
    id = 0
    paragraph_list = []
-    for p in relevant_paragraph_list:
+    for p in self._getRelevantParagraphList():
      paragraph_list.append((id, p.attrib[TEXT_STYLENAME_NAMESPACE]))
      id += 1
    return paragraph_list
  def getParagraphItem(self, paragraph_id):
    """Returns the paragraph in the form of (text, class)."""
+    relevant_paragraph_list = self._getRelevantParagraphList()
    try:
-      relevant_paragraph_list = self.document.parsed_content.xpath(
-                                RELEVANT_PARAGRAPH_XPATH_QUERY,
-                                namespaces=self.document.parsed_content.nsmap)
      paragraph = relevant_paragraph_list[paragraph_id]
-      text = ''.join(paragraph.xpath('.//text()', namespaces=paragraph.nsmap))
-      p_class = paragraph.attrib[TEXT_STYLENAME_NAMESPACE]
-      return (text, p_class)
    except IndexError:
+      logger.error("Unable to find paragraph %s at paragraph list." % paragraph_id)
+      return None
+    text = ''.join(paragraph.xpath('.//text()', namespaces=paragraph.nsmap))
+    if TEXT_STYLENAME_NAMESPACE not in paragraph.attrib.keys():
+      logger.error("Unable to find %s attribute at paragraph %s " % \
+                              (TEXT_STYLENAME_NAMESPACE, paragraph_id))
      return None
+    p_class = paragraph.attrib[TEXT_STYLENAME_NAMESPACE]
+    return (text, p_class)
  def getChapterItemList(self, file):
    """Returns the list of chapters in the form of (id, level)."""
    raise NotImplementedError