Commit 2e4e5cc0 authored by Jean-Paul Smets's avatar Jean-Paul Smets

Use mixin.CachedConvertableMixin. getConvertedSize removed from Document class...

Use mixin.CachedConvertableMixin. getConvertedSize removed from the Document class (redundant with getConversionSize). Docstring put back where it belongs (i.e. the Document class). Preparing the removal of the document interface.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@29844 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 3146edb0
......@@ -52,9 +52,11 @@ import zope.interface
import cStringIO
import string
from OFS.Image import Pdata
import md5
from Products.PythonScripts.Utility import allow_class
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
_MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
# these property ids are unchangeable
......@@ -113,136 +115,6 @@ class DocumentProxyError(Exception):pass
class NotConvertedError(Exception):pass
allow_class(NotConvertedError)
class ConversionCacheMixin:
"""
This class provides a generic API to store in the ZODB
various converted versions of a file or of a string.
Versions are stored in dictionaries; the class also stores the
generation time of every format and its MIME type string.
Format can be a string or a tuple (e.g. format, resolution).
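Illustrative usage (a minimal sketch; the 'html' format and MIME type
below are assumptions, not values mandated by this API):
  document.setConversion(data, mime='text/html', format='html')
  try:
    mime, data = document.getConversion(format='html')
  except KeyError:
    pass # not converted yet, or invalidated by a content change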
"""
# Declarative security
security = ClassSecurityInfo()
security.declareObjectProtected(Permissions.AccessContentsInformation)
def _getCacheFactory(self):
"""
"""
if self.isTempObject():
return
cache_tool = getToolByName(self, 'portal_caches')
preference_tool = getToolByName(self, 'portal_preferences')
cache_factory_name = preference_tool.getPreferredConversionCacheFactory('document_cache_factory')
cache_factory = cache_tool.getRamCacheRoot().get(cache_factory_name)
#XXX This conditional statement should be removed as soon as
#broadcasting is enabled among all ZEO clients.
#Interactions which update portal_caches should interact with all nodes.
if cache_factory is None and getattr(cache_tool, cache_factory_name, None) is not None:
#ram_cache_root is not up to date for current node
cache_tool.updateCache()
return cache_tool.getRamCacheRoot().get(cache_factory_name)
security.declareProtected(Permissions.ModifyPortalContent, 'clearConversionCache')
def clearConversionCache(self):
"""
"""
if self.isTempObject():
self.temp_conversion_data = {}
return
for cache_plugin in self._getCacheFactory().getCachePluginList():
cache_plugin.delete(self.getPath(), DEFAULT_CACHE_SCOPE)
security.declareProtected(Permissions.View, 'hasConversion')
def hasConversion(self, **kw):
"""
If you want to get the conversion cache value when it exists, write
the code like:
try:
  mime, data = self.getConversion(**kw)
except KeyError:
  ...
instead of:
if self.hasConversion(**kw):
  mime, data = self.getConversion(**kw)
else:
  ...
for better performance.
"""
try:
self.getConversion(**kw)
return True
except KeyError:
return False
security.declareProtected(Permissions.ModifyPortalContent, 'setConversion')
def setConversion(self, data, mime=None, calculation_time=None, **kw):
"""
"""
cache_id = self.generateCacheId(**kw)
if self.isTempObject():
if getattr(aq_base(self), 'temp_conversion_data', None) is None:
self.temp_conversion_data = {}
self.temp_conversion_data[cache_id] = (mime, aq_base(data))
return
cache_factory = self._getCacheFactory()
cache_duration = cache_factory.cache_duration
if data is not None:
for cache_plugin in cache_factory.getCachePluginList():
try:
cache_entry = cache_plugin.get(self.getPath(), DEFAULT_CACHE_SCOPE)
cache_dict = cache_entry.getValue()
except KeyError:
cache_dict = {}
cache_dict.update({cache_id: (self.getContentMd5(), mime, aq_base(data))})
cache_plugin.set(self.getPath(), DEFAULT_CACHE_SCOPE,
cache_dict, calculation_time=calculation_time,
cache_duration=cache_duration)
security.declareProtected(Permissions.View, 'getConversion')
def getConversion(self, **kw):
"""
"""
cache_id = self.generateCacheId(**kw)
if self.isTempObject():
return getattr(aq_base(self), 'temp_conversion_data', {})[cache_id]
for cache_plugin in self._getCacheFactory().getCachePluginList():
cache_entry = cache_plugin.get(self.getPath(), DEFAULT_CACHE_SCOPE)
data_list = cache_entry.getValue().get(cache_id)
if data_list:
md5sum, mime, data = data_list
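# the md5 stored with the cached conversion is compared with the
# current content md5, so conversions cached before a content change
# are reported as missing rather than served stale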
if md5sum != self.getContentMd5():
raise KeyError, 'Conversion cache key is compromised for %r' % cache_id
return mime, data
raise KeyError, 'Conversion cache key does not exist for %r' % cache_id
security.declareProtected(Permissions.View, 'getConversionSize')
def getConversionSize(self, **kw):
"""
"""
try:
return len(self.getConversion(**kw))
except KeyError:
return 0
def generateCacheId(self, **kw):
Generate a proper cache id based on **kw.
Function inspired by ERP5Type.Cache
"""
return str(makeSortedTuple(kw)).translate(string.maketrans('', ''), '[]()<>\'", ')
security.declareProtected(Permissions.ModifyPortalContent, 'updateContentMd5')
def updateContentMd5(self):
"""Update md5 checksum from the original file
"""
data = self.getData()
self._setContentMd5(md5.new(data).hexdigest()) #reindex is useless
class PermanentURLMixIn(ExtensibleTraversableMixIn):
"""
Provides access to documents through their permanent URL.
......@@ -455,8 +327,168 @@ class UpdateMixIn:
return method()
class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, SnapshotMixin, UpdateMixIn):
"""
class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
"""Document is an abstract class with all methods related to document
management in ERP5. This includes searchable text, explicit relations,
implicit relations, metadata, versions, languages, etc.
Documents may either store their content directly or cache content
which is retrieved from a specified URL. The second case is often
referred to as "External Document". Standalone "External Documents" may
be created by specifying a URL to the contribution tool which is in
charge of initiating the download process and selecting the appropriate
document type. Groups of "External Documents" may also be generated from
a so-called "External Source" (refer to the ExternalSource class for more
information).
External Documents may be downloaded once or updated at regular intervals.
The latter can be useful to update the content of an external source.
Previous versions may be stored in place or kept in a separate file.
This feature is known as the crawling API. It is mostly implemented
in ContributionTool with wrappers in the Document class. It can be useful
for creating a small search engine.
There are currently two types of Document subclasses:
* File for binary file based documents. File has subclasses such as Image,
OOoDocument, PDFDocument, etc. to implement specific conversion methods.
* TextDocument for text based documents. TextDocument has subclasses such
as Wiki to implement specific methods.
TextDocument itself has a subclass (XSLTDocument) which provides
XSLT based analysis and transformation of XML content based on XSLT
templates.
Conversion should be achieved through the convert method and other methods
of the conversion API (convertToBaseFormat, etc.).
Moreover, any Document subclass must be able to convert documents to text
(asText method) and HTML (asHTML method). Text is required for full text
indexing. HTML is required for crawling.
Instances can be created directly, or via the portal_contributions tool,
which manages the document ingestion process whereby a file can be uploaded
by HTTP, sent in by email, dropped in by WebDAV, or contributed in some
other way as yet unknown. The ingestion process has the following steps:
(1) portal type detection
(2) object creation and upload of data
(3) metadata discovery (optionally with conversion of data to another format)
(4) other possible actions to finalise the ingestion (e.g. by assigning
a reference)
This class handles (3) and calls a ZMI script to do (4).
Metadata can be drawn from various sources:
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
file_name - data which might be encoded in file name
user_login - information about user who is contributing the file
content - data which might be derived from document content
If a certain property is defined in more than one source, it is set according
to the preference order returned by the script
Document_getPreferredDocumentMetadataDiscoveryOrderList
(or any type-based version of it, since discovery is type dependent),
as sketched after the method list below.
Methods for discovering metadata are:
getPropertyDictFromInput
getPropertyDictFromFileName
getPropertyDictFromUserLogin
getPropertyDictFromContent
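For illustration only, discovery then roughly amounts to merging the
dictionaries returned by these methods, the order (and therefore the
overwrite priority) being decided by the preference script above. A
simplified sketch, with argument lists omitted and an arbitrary order:
  kw = {}
  for method in (self.getPropertyDictFromUserLogin,
                 self.getPropertyDictFromInput,
                 self.getPropertyDictFromFileName,
                 self.getPropertyDictFromContent):
    kw.update(method() or {}) # later sources overwrite earlier ones
  self._edit(**kw)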
Methods for processing content are implemented either in Document class
or in Base class:
getSearchableReferenceList (Base)
getSearchableText (Base)
index_html (overridden in Document subclasses)
Methods for handling relations are implemented either in Document class
or in Base class:
getImplicitSuccessorValueList (Base)
getImplicitPredecessorValueList (Base)
getImplicitSimilarValueList (Base)
getSimilarCloudValueList (Document)
Implicit relations consist in finding document references inside
searchable text (e.g. INV-23456) and deducing relations from them.
Two customisable methods are required: one to find a list of implicit
references inside the content (getSearchableReferenceList) and one to
convert a given document reference into a list of reference strings which
could be present in other content (asSearchableReferenceList). Both are
illustrated below.
document.getSearchableReferenceList() returns
[
{'reference': 'INV-12367'},
{'reference': 'INV-1112', 'version': '012'},
{'reference': 'AB-CC-DRK', 'version': '011', 'language': 'en'}
]
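Symmetrically, a call such as:
  document.asSearchableReferenceList()
could return, for a document referenced as INV-12367 in version 012,
something like ['INV-12367', 'INV-12367-012'] (illustrative values only;
the exact strings depend on the reference naming policy in use).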
The Document class behaviour can be extended / customized through scripts
(which are type-based and so can be adjusted per portal type); a tiny
illustrative example follows the list below.
* Document_getPropertyDictFromUserLogin - finds a user (by user_login or
from session) and returns properties which should be set on the document
* Document_getPropertyDictFromContent - analyzes document content and returns
properties which should be set on the document
* Base_getImplicitSuccessorValueList - finds all appropriate documents
referenced in the current content
* Base_getImplicitPredecessorValueList - finds document predecessors based on
the document coordinates (can use only complete coordinates, or also partial)
* Document_getPreferredDocumentMetadataDiscoveryOrderList - returns an order
in which metadata should be set/overwritten
* Document_finishIngestion - called by portal_activities after all the ingestion
is completed (and after the document has been converted, so text_content
is available if the document has it)
* Document_getNewRevision - calculates revision number which should be set
on this document. Implementation depends on revision numbering policy which
can be very different. Interaction workflow should call setNewRevision method.
* Document_populateContent - analyses the document content and produces
subcontent based on it (e.g. images, news, etc.). This script can
involve, for example, an XSLT transformation to process XML.
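For illustration only, a very small script of this family could look like
the following (hypothetical Document_getPropertyDictFromContent body for a
Zope Python Script; real scripts are customised per deployment):
  property_dict = {}
  lines = context.asText().strip().splitlines()
  if lines:
    property_dict['title'] = lines[0][:100]
  return property_dict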
Subcontent: documents may include subcontent (files, images, etc.)
so that publication of rich content can be path independent. Subcontent
can also be used to help the rendering in HTML of complex documents
such as ODF documents.
Consistency checking:
Default implementation uses DocumentReferenceConstraint to check if the
reference/language/version triplet is unique. Additional constraints
can be added if necessary.
NOTE: Document.py supports a notion of revision which is very specific.
The underlying concept is that, as soon as a document has a reference,
the association of (reference, version, language) must be unique across
the whole system. This means that a given document in a given version in a
given language is unique. The underlying idea is similar to the one in a Wiki
system in which each page is unique and acts as the atom of collaboration.
In the case of ERP5, if a team collaborates on a Text document written with
an offline word processor, all updates should be placed inside the same object.
A Contribution will thus modify an existing document, if allowed from a
security point of view, and increase the revision number. The same goes for
properties (e.g. the title). Each change generates a new revision.
conversion API - not same as document - XXX BAD
XXX make multiple interfaces
TODO:
- move all implementation bits to MixIn classes
- in the end, Document class should have zero code
and only serve as a quick and easy way to create
new types of documents (one could even consider
that this class should be trashed)
-
"""
meta_type = 'ERP5 Document'
......@@ -467,7 +499,14 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
isDocument = 1
__dav_collection__=0
zope.interface.implements( interfaces.IDocument, )
zope.interface.implements(interfaces.IConvertable,
interfaces.ITextConvertable,
interfaces.IHtmlConvertable,
interfaces.ICachedConvertable,
interfaces.IVersionable,
interfaces.IDownloadable,
interfaces.ICrawlable,
)
# Regular expressions
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
......@@ -1051,13 +1090,13 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
# Try not to invoke an automatic transition here
self._edit(**kw)
# Finish ingestion by calling method
self.finishIngestion()
self.reindexObject()
self.finishIngestion() # XXX - is this really the right place ?
self.reindexObject() # XXX - is this really the right place ?
# Revision merge is tightly coupled
# to metadata discovery - refer to the documentation of mergeRevision method
merged_doc = self.mergeRevision()
merged_doc.reindexObject()
return merged_doc
merged_doc = self.mergeRevision() # XXX - is this really the right place ?
merged_doc.reindexObject() # XXX - is this really the right place ?
return merged_doc # XXX - is this really the right place ?
security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
def finishIngestion(self):
......@@ -1092,13 +1131,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
return 'text/plain', '' # XXX - Why ?
raise NotImplementedError
def getConvertedSize(self, format):
"""
Returns the size of the converted document
"""
format, data = self.convert(format)
return len(data)
security.declareProtected(Permissions.View, 'asSubjectText')
def asSubjectText(self, **kw):
"""
......@@ -1272,7 +1304,7 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
"""
return False
def convertFile(self, **kw):
def convertFile(self, **kw): # XXX - Is it really useful to define this explicitly?
"""
Workflow transition invoked when conversion occurs.
"""
......
......@@ -35,13 +35,15 @@ from Products.ERP5Type.Base import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, interfaces
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Document import Document
from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.ERP5.Document.Document import ConversionError
from Products.ERP5Type.Base import Base
from Products.CMFDefault.File import File as CMFFile
from zLOG import LOG
from DateTime import DateTime
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
mimetypes.init()
def _unpackData(data):
......@@ -51,7 +53,7 @@ def _unpackData(data):
"""
return str(data)
class File(Document, CMFFile, ConversionCacheMixin):
class File(Document, CMFFile, CachedConvertableMixin):
"""
A File can contain raw data which can be uploaded and downloaded.
It is the root class of Image, OOoDocument (ERP5OOo product),
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
......@@ -34,11 +35,12 @@ from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
from Products.ERP5Type import Permissions, PropertySheet, Constraint, interfaces
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionCacheMixin, ConversionError
from Products.ERP5.Document.Document import ConversionError
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
from zLOG import LOG, WARNING
class PDFDocument(Image, ConversionCacheMixin):
class PDFDocument(Image, CachedConvertableMixin):
"""
PDFDocument is a subclass of Image which is able to
extract text content from a PDF file either as text
......
......@@ -47,11 +47,13 @@ from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5Type.UnrestrictedMethod import UnrestrictedMethod
from Products.ERP5.Document.File import File
from Products.ERP5.Document.Document import PermanentURLMixIn
from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.ERP5.Document.Document import ConversionError
from Products.ERP5.Document.Document import NotConvertedError
from zLOG import LOG, ERROR
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
enc=base64.encodestring
dec=base64.decodestring
......@@ -80,7 +82,7 @@ class TimeoutTransport(SafeTransport):
return SafeTransport.make_connection(self, h)
class OOoDocument(PermanentURLMixIn, File, ConversionCacheMixin):
class OOoDocument(PermanentURLMixIn, File, CachedConvertableMixin):
"""
A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to
......