Commit c9dba102 authored by Rafael Monnerat

Fix poor performance: implement a dummy cache of the relevant_paragraph list.

Refactor getParagraphItem so that try/except wraps only the code that needs it.
Added a few log entries for errors.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils@41666 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent d17e1a73
...@@ -53,8 +53,6 @@ def getTemplatePath(format): ...@@ -53,8 +53,6 @@ def getTemplatePath(format):
""" """
return path.join(path.dirname(__file__), 'template.%s' % format) return path.join(path.dirname(__file__), 'template.%s' % format)
class OOGranulate(object): class OOGranulate(object):
"""Granulate an OpenOffice document into tables, images, chapters and """Granulate an OpenOffice document into tables, images, chapters and
paragraphs.""" paragraphs."""
...@@ -147,32 +145,47 @@ class OOGranulate(object): ...@@ -147,32 +145,47 @@ class OOGranulate(object):
path = 'Pictures/%s' % id path = 'Pictures/%s' % id
return self.document.getFile(path) return self.document.getFile(path)
def _getRelevantParagraphList(self):
    """Return the nodes matched by RELEVANT_PARAGRAPH_XPATH_QUERY.

    The XPath query is evaluated against self.document.parsed_content on
    the first call only; the result is stored on the instance as
    RELEVANT_PARAGRAPH_CACHE and handed back unchanged on later calls.
    This method never refreshes the stored list.

    This should use memcache or another cache infrastructure.
    """
    # A missing attribute and an explicit None both mean "not computed yet".
    if getattr(self, "RELEVANT_PARAGRAPH_CACHE", None) is None:
        self.RELEVANT_PARAGRAPH_CACHE = self.document.parsed_content.xpath(
            RELEVANT_PARAGRAPH_XPATH_QUERY,
            namespaces=self.document.parsed_content.nsmap)
    return self.RELEVANT_PARAGRAPH_CACHE
def getParagraphItemList(self):
    """Returns the list of paragraphs in the form of (id, class) where class
    may have special meaning to define TOC/TOI.

    The id is the zero-based position of the paragraph in the list returned
    by self._getRelevantParagraphList(); the class is the value of the
    paragraph's TEXT_STYLENAME_NAMESPACE attribute. A paragraph without
    that attribute makes the attribute lookup fail, and the error is not
    caught here.
    """
    # enumerate() replaces the hand-maintained counter, which also shadowed
    # the builtin `id`.
    return [(index, paragraph.attrib[TEXT_STYLENAME_NAMESPACE])
            for index, paragraph
            in enumerate(self._getRelevantParagraphList())]
def getParagraphItem(self, paragraph_id):
    """Returns the paragraph in the form of (text, class).

    paragraph_id is used as an index into the list returned by
    self._getRelevantParagraphList().

    Returns None, after logging an error, when the index is out of range
    or when the paragraph has no TEXT_STYLENAME_NAMESPACE attribute.
    Otherwise returns a tuple (text, p_class) where text is the
    concatenation of every text node found under the paragraph.
    """
    relevant_paragraph_list = self._getRelevantParagraphList()
    # Only the index lookup is guarded, so unrelated errors are not hidden.
    try:
        paragraph = relevant_paragraph_list[paragraph_id]
    except IndexError:
        logger.error("Unable to find paragraph %s at paragraph list.",
                     paragraph_id)
        return None

    # Single lookup instead of a membership test followed by indexing.
    p_class = paragraph.attrib.get(TEXT_STYLENAME_NAMESPACE)
    if p_class is None:
        logger.error("Unable to find %s attribute at paragraph %s ",
                     TEXT_STYLENAME_NAMESPACE, paragraph_id)
        return None

    # Text is extracted only once the paragraph is known to be usable.
    text = ''.join(paragraph.xpath('.//text()', namespaces=paragraph.nsmap))
    return (text, p_class)
def getChapterItemList(self, file): def getChapterItemList(self, file):
"""Returns the list of chapters in the form of (id, level).""" """Returns the list of chapters in the form of (id, level)."""
raise NotImplementedError raise NotImplementedError
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment