diff --git a/product/ERP5OOo/OOoUtils.py b/product/ERP5OOo/OOoUtils.py
index 52ecdd4214d4dca6b824ac82180645526a48448e..c06a97c88be3a48a2b95324b82bfecff81d23769 100644
--- a/product/ERP5OOo/OOoUtils.py
+++ b/product/ERP5OOo/OOoUtils.py
@@ -428,10 +428,45 @@ class OOoParser(Implicit):
               xpath = '@*[local-name()="%s"]' % attribute_type_mapping[value_type]
               cell_data = str(cell.xpath(xpath)[0])
             else: # read text nodes
-              text_tags = cell.findall('./{%s}p' % cell.nsmap['text'])
-              if len(text_tags):
-                cell_data = ''.join([text.xpath('string(.)')
-                                     for text in text_tags])
+              # A cell holds one <text:p> per line of text. Inside a
+              # paragraph, whitespace is encoded with dedicated elements:
+              # <text:s/> for a space (<text:s text:c="3"/> for three of
+              # them), <text:tab/> for a tab and <text:line-break/> for a
+              # new line. <text:span> and <text:a> wrap nested content.
+              text_ns = cell.nsmap['text']
+              table_cell_tag = '{%s}table-cell' % cell.nsmap['table']
+              space_tag = '{%s}s' % text_ns
+              space_count_attribute = '{%s}c' % text_ns
+              container_tag_list = ['{%s}%s' % (text_ns, name)
+                                    for name in ('p', 'span', 'a')]
+              entity_mapping = {'{%s}tab' % text_ns: '\t',
+                                '{%s}line-break' % text_ns: '\n'}
+              def format_content(node):
+                # Text of the node, then each child followed by its tail.
+                # The tail is kept even when the child itself is ignored,
+                # so text following an unsupported element is not lost.
+                part_list = [node.text]
+                for child in node.iterchildren():
+                  part_list.append(format_node(child))
+                  part_list.append(child.tail)
+                return ''.join([part for part in part_list if part])
+              def format_node(node):
+                # Return the text of the node, or None if it is ignored
+                if node.tag == table_cell_tag:
+                  # one line per paragraph; table:annotation and other
+                  # unsupported children return None and are skipped
+                  line_list = [format_node(child)
+                               for child in node.iterchildren()]
+                  return '\n'.join([line for line in line_list
+                                    if line is not None])
+                elif node.tag in container_tag_list:
+                  return format_content(node)
+                elif node.tag == space_tag:
+                  return ' ' * int(node.get(space_count_attribute, 1))
+                elif node.tag in entity_mapping:
+                  return entity_mapping[node.tag]
+                return None
+              cell_data = format_node(cell)
 
             # Add the cell to the line
             table_line.append(cell_data)
diff --git a/product/ERP5OOo/tests/testOOoParser.py b/product/ERP5OOo/tests/testOOoParser.py
index c243d410270633bca0e6d2b425e9345057817399..70546f94e2d2a09432820e8cf01a4a6051a0ce40 100644
--- a/product/ERP5OOo/tests/testOOoParser.py
+++ b/product/ERP5OOo/tests/testOOoParser.py
@@ -106,6 +106,16 @@ class TestOOoParser(unittest.TestCase):
     if not_ok:
       self.fail('Spreadsheet not read!')
 
+  def test_getSpreadSheetMappingText(self):
+    parser = OOoParser()
+    parser.openFile(open(makeFilePath('complex_text.ods'), 'rb'))
+    mapping = parser.getSpreadsheetsMapping()
+    self.assertEquals(['Feuille1'], mapping.keys())
+    self.assertEquals(mapping['Feuille1'][0], [' leading space'])
+    self.assertEquals(mapping['Feuille1'][1], [' leading space'])
+    self.assertEquals(mapping['Feuille1'][2], ['tab\t'])
+    self.assertEquals(mapping['Feuille1'][3], ['New\nLine'])
+
 
 def test_suite():
   suite = unittest.TestSuite()
diff --git a/product/ERP5OOo/tests/test_document/complex_text.ods b/product/ERP5OOo/tests/test_document/complex_text.ods
new file mode 100644
index 0000000000000000000000000000000000000000..0a521e364ab0717b91dc8cb82d381dcfbe7b4cfc
Binary files /dev/null and b/product/ERP5OOo/tests/test_document/complex_text.ods differ