##############################################################################
#
# Copyright (c) 2002-2007 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################

from AccessControl import ClassSecurityInfo
from Products.CMFCore.utils import getToolByName
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5.Document.Url import UrlMixIn

import mimetypes
import re
import urllib
from htmlentitydefs import name2codepoint
from DateTime import DateTime

class ExternalSource(XMLObject, UrlMixIn):
  """
  An External Source consists of single URL which defines the
  root of a collection of documents, each of which can be accessed
  individually. The URL can be an http site, an ftp site, a local
  repository, a samba server, etc.

  The main purpose of External Sources is to group related documents
  and define shared security policies, shared update policies, etc.
  For example, all pages of a wiki with restricted access rights
  share the same security policy (ex. team, project, etc.).

  Another purpose of the External Source class is to make it easy
  to manage external sources of knowledge (adding them, removing
  them, etc.).

  The second purpose of an external source is to provide a way
  to search contents stored externally in a system which is not
  compatible with ERP5 Catalog.

  Example of external sources:

  - a Web Site

  - a SAMBA share

  - an FTP server

  - a backup server

  - a mail directory

  - a mailing list archive

  ExternalSource may be subclassed to provide more automation
  features. This is useful for example to manage the creation of
  a mailing list, the deletion of mailing list and the definition
  of the members of a mailing list in a centralised way.

  NOTE: RSS feeds are not external sources but standard Text documents
  with transformation and update policy. They use the populateContent
  method to create subcontent from a root content. This is different
  from crawling.

  NOTE2: access to filesystems through URL requires to extend
  urllib2 so that directories are handled as if they were web pages
  OR RSS feed with a list of files (and associated URL). Complete
  implementation of external sources will require major extensions
  to urllib2 (or equivalent).

  NOTE3: it is possible to make external search sources persistent
  by triggering an activity with newContent for every displayed result.
  This can be done by wrapping the results in a generator (yield).
  The interest of this approach is to make it possible to search
  already searched contents without having to go through the
  external source search (ie. with the front page search).
  """
  # CMF Type Definition
  meta_type = 'ERP5 External Source'
  portal_type = 'External Source'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.TextDocument
                    , PropertySheet.Url
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Periodicity
                    )

  # Crawling API
  security.declareProtected(Permissions.ModifyPortalContent, 'crawlContent')
  def crawlContent(self):
    """
    Creates the initial content from the URL by crawling the root.

    The work is delegated to portal_contributions.crawlContent, with
    this external source passed both as the document to crawl and as
    the container.
    """
    self.portal_contributions.crawlContent(self, container=self)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentURLList')
  def getContentURLList(self):
    """
    Returns the root of the crawling process, as a list which
    contains the single URL returned by asURL.
    """
    return [self.asURL()]

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentBaseURL')
  def getContentBaseURL(self):
    """
    Returns None to force crawler to ignore this parameter
    """
    return None

  security.declareProtected(Permissions.View, 'isIndexContent')
  def isIndexContent(self, content=None):
    """
    This method is able to tell if content object is an index
    or "real" content. Sometimes (though not often) we want to define
    a content as index (e.g. if it is only a list of mailing list
    messages), so that we do not index it for searching etc).

    content -- the object to test, or None to test the external
               source itself

    Returns True if content is None. Otherwise the decision is
    delegated to the type based method 'isIndexContent', and False is
    returned if no such method is defined.
    """
    if content is None:
      # this means that we are called directly, and external source
      # is an index by definition
      return True
    method = self._getTypeBasedMethod('isIndexContent')
    if method is None:
      return False
    return method(content)

  # Search API
  security.declareProtected(Permissions.SearchCatalog, 'searchResults')
  def searchResults(self, **kw):
    """
    Search results. There is no notion of security here since
    the source is external.

    All keyword parameters are passed unchanged to the type based
    method 'searchResults', whose result is returned. TypeError is
    raised if no such method is defined.

    NOTE: implementation is delegated to a script so that
    different kinds of sources may be implemented using
    different portal types.

    NOTE2: a typical implementation consists in creating
    a specific SQL method with a dedicated connector then
    force the SQL catalog to use that method instead of the
    standard ones, yet delegate the SQL generation to
    the catalog.
    """
    method = self._getTypeBasedMethod('searchResults')
    if method is None:
      # Calling None would raise an obscure "'NoneType' object is not
      # callable" TypeError. Keep the exception type, but tell which
      # script is missing.
      raise TypeError(
        "No type based method 'searchResults' is defined for "
        "portal type %r" % (self.portal_type,))
    return method(**kw)

  security.declareProtected(Permissions.SearchCatalog, 'countResults')
  def countResults(self, **kw):
    """
    Count results. There is no notion of security here since
    the source is external.

    All keyword parameters are passed unchanged to the type based
    method 'countResults', whose result is returned. TypeError is
    raised if no such method is defined.

    NOTE: implementation is delegated to a script so that
    different kinds of sources may be implemented using
    different portal types.
    """
    method = self._getTypeBasedMethod('countResults')
    if method is None:
      # Same guard as in searchResults: fail with an explicit message
      # instead of trying to call None.
      raise TypeError(
        "No type based method 'countResults' is defined for "
        "portal type %r" % (self.portal_type,))
    return method(**kw)