Commit 7b951d3f authored by Hugo H. Maia Vieira

Implement getParagraphItemList and getParagraphItem


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils@41241 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent c4e63930
1.0.10 (unreleased)
===================
- Add getImage for OOGranulate
- Add getImageItemList for OOGranulate
- Add getParagraphItemList and getParagraphItem for OOGranulate
- Add getImageItemList and getImage for OOGranulate
- Add OdfDocument
- Add granulate interface.
......
......@@ -42,6 +42,17 @@ class OOGranulate(object):
def __init__(self, file, source_format):
    """Build the OdfDocument for the given file and source format and keep it
    on the instance as self.document."""
    document = OdfDocument(file, source_format)
    self.document = document
def _relevantParagraphList(self):
    """Return the relevant 'text:p' elements of self.document.parsed_content.

    Paragraphs located anywhere inside a 'draw:frame' element are excluded.
    The remaining lxml.etree._Element objects are returned as a list, in
    document order.
    """
    parsed_content = self.document.parsed_content
    # One XPath query with an ancestor test replaces the former pair of
    # queries plus a per-element list-membership check, so the tree is
    # searched once instead of twice.
    return parsed_content.xpath('//text:p[not(ancestor::draw:frame)]',
                                namespaces=parsed_content.nsmap)
def getTableItemList(self, file):
    """Return the table IDs as a list of (id, title) pairs.

    Not implemented yet: calling it always raises NotImplementedError.
    """
    raise NotImplementedError()
......@@ -73,14 +84,27 @@ class OOGranulate(object):
path = 'Pictures/%s' % id
return self.document.getFile(path)
def getParagraphItemList(self):
    """Returns the list of paragraphs in the form of (id, class) where class
    may have special meaning to define TOC/TOI.

    The id is the zero-based position of the paragraph in the list returned by
    self._relevantParagraphList(). The class is the value of the paragraph's
    text:style-name attribute, or None when the paragraph has no such
    attribute.
    """
    key = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name'
    # enumerate() supplies the running id, so no counter named after the
    # builtin id() is needed. attrib.get() keeps one unstyled paragraph from
    # aborting the whole listing with a KeyError.
    return [(index, paragraph.attrib.get(key))
            for index, paragraph in enumerate(self._relevantParagraphList())]
def getParagraphItem(self, paragraph_id):
    """Returns the paragraph in the form of (text, class).

    paragraph_id is the zero-based position of the paragraph in the list
    returned by self._relevantParagraphList(). The text is the concatenation
    of every text node found under the paragraph. The class is the value of
    its text:style-name attribute, or None when that attribute is absent.

    Returns None when there is no paragraph at the given position.
    """
    # Ids are zero-based positions. A negative value would otherwise be taken
    # as an index from the end of the list and return an unrelated paragraph.
    if paragraph_id < 0:
        return None
    # Only the lookup can legitimately fail with IndexError, so the try block
    # is limited to it.
    try:
        paragraph = self._relevantParagraphList()[paragraph_id]
    except IndexError:
        return None
    text = ''.join(paragraph.xpath('.//text()', namespaces=paragraph.nsmap))
    key = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name'
    return (text, paragraph.attrib.get(key))
def getChapterItemList(self, file):
"""Returns the list of chapters in the form of (id, level)."""
......
......@@ -55,11 +55,11 @@ class IImageGranulator(Interface):
class ITextGranulator(Interface):
"""Provides methods to granulate a document into chapters and paragraphs."""
def getParagraphItemList(file):
def getParagraphItemList():
"""Returns the list of paragraphs in the form of (id, class) where class may
have special meaning to define TOC/TOI."""
def getParagraphItem(file, paragraph_id):
def getParagraphItem(paragraph_id):
"""Returns the paragraph in the form of (text, class)."""
def getChapterItemList(file):
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
......@@ -83,17 +84,40 @@ class TestOOGranulate(cloudoooTestCase):
obtained_image = self.oogranulate.getImage('anything.png')
self.assertEquals('', obtained_image)
def testGetParagraphItemList(self):
"""Test if getParagraphItemList() returns the right paragraphs list"""
self.assertRaises(NotImplementedError,
self.oogranulate.getParagraphItemList,
'file')
def testRelevantParagraphList(self):
    """Test if _relevantParagraphList returns a list with 'p' excluding the 'p'
    inside 'draw:frame'"""
    parsed_content = self.oogranulate.document.parsed_content
    draw_p_list = parsed_content.xpath('//draw:frame//text:p',
                                       namespaces=parsed_content.nsmap)
    relevant_p_list = self.oogranulate._relevantParagraphList()
    # Check paragraph by paragraph. Testing "draw_p_list not in ..." compares
    # the whole list object against single elements and can never fail.
    for draw_p in draw_p_list:
        self.assertTrue(draw_p not in relevant_p_list)
def testGetParagraphItem(self):
def testGetParagraphItemList(self):
    """Test if getParagraphItemList() returns the right paragraphs list, with
    the ids always in the same order"""
    # The ODT fixture is binary, so it is read in 'rb' mode. It is read once,
    # outside the loop, and the handle is closed explicitly instead of being
    # left to the garbage collector.
    odt_file = open('./data/granulate_test.odt', 'rb')
    try:
        data = odt_file.read()
    finally:
        odt_file.close()
    # Granulate the same content several times to check the ids are stable.
    for i in range(5):
        oogranulate = OOGranulate(data, 'odt')
        paragraph_list = oogranulate.getParagraphItemList()
        self.assertEquals((0, 'P3'), paragraph_list[0])
        self.assertEquals((1, 'P1'), paragraph_list[1])
        self.assertEquals((2, 'P12'), paragraph_list[2])
        self.assertEquals((8, 'P13'), paragraph_list[8])
        self.assertEquals((19, 'Standard'), paragraph_list[19])
def testGetParagraphItemSuccessfully(self):
    """Test if getParagraphItem() returns the right paragraph"""
    # getParagraphItem() takes only the paragraph id and returns (text, class),
    # so no NotImplementedError is expected here.
    self.assertEquals(('Some images without title', 'P13'),
                      self.oogranulate.getParagraphItem(8))
    big_paragraph = self.oogranulate.getParagraphItem(5)
    self.assertEquals('P8', big_paragraph[1])
    self.assertTrue(big_paragraph[0].startswith(u'A prática cotidiana prova'))
    self.assertTrue(big_paragraph[0].endswith(u'corresponde às necessidades.'))
def testGetParagraphItemWithoutSuccess(self):
    """Check that getParagraphItem() gives None for an id that does not
    exist"""
    missing_paragraph = self.oogranulate.getParagraphItem(200)
    self.assertEquals(None, missing_paragraph)
def testGetChapterItemList(self):
"""Test if getChapterItemList() returns the right chapters list"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment