external web page, supports recursive spidering

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10447 20353a03-c40f-0410-a6d1-a30d3c3de9de

external web page, supports recursive spidering
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10447 20353a03-c40f-0410-a6d1-a30d3c3de9de
d392b1df · Bartek Górny · 1c5d04a6 · d392b1df · d392b1df
Commit d392b1df authored Oct 01, 2006 by Bartek Górny
Showing with 216 additions and 0 deletions

product/ERP5OOo/Document/ExternalWebPage.py product/ERP5OOo/Document/ExternalWebPage.py +207 -0

product/ERP5OOo/PropertySheet/ExternalDocument.py product/ERP5OOo/PropertySheet/ExternalDocument.py +9 -0

No files found.
--- a/product/ERP5OOo/Document/ExternalWebPage.py
+++ b/product/ERP5OOo/Document/ExternalWebPage.py
+
+
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5OOo.Document.DMSFile import stripHtml
+from Products.ERP5OOo.Document.ExternalDocument import ExternalDocument, SpiderException
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+
+# Patterns applied in sequence by clearHtml, each match being replaced by a
+# single space. Comments are listed before tags because JavaScript code
+# inside comments sometimes contains '>' characters.
+rx=[
+  re.compile('<!--.*?-->',re.DOTALL|re.MULTILINE), # comments
+  re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE), # tags
+  re.compile('\s+'), # runs of whitespace
+]
+
+def clearHtml(s):
+  """
+  Return s with every match of the module-level rx patterns replaced by
+  a single space; the patterns are applied one after another, in list order.
+  """
+  cleaned=s
+  for pattern in rx:
+    cleaned=pattern.sub(" ",cleaned)
+  return cleaned
+
+
+# encoding every spidered text is converted to
+tgtencoding='utf-8'
+# fallback encodings tried, in this order, by recode
+encodings=['iso-8859-2','iso-8859-15','windows-1250']
+# Captures the charset declared inside a <meta ...> tag.
+# The search is confined to the tag itself ([^>]) - a greedy '.*' would run
+# up to the last 'charset=' found anywhere in the page. The value may be
+# unquoted or wrapped in single or double quotes, and must not be empty.
+rx_charset=re.compile(r'''<meta[^>]*?charset=["']?([\w-]+)''',re.IGNORECASE)
+
+class CanNotDecode(Exception):
+  """
+  raised by recode when none of the candidate encodings can decode the text
+  """
+
+def recode(s):
+  """
+  Convert the byte string s to tgtencoding.
+
+  The encoding declared in the text (as found by rx_charset) is tried first,
+  then tgtencoding, then the fallback list in encodings. If the text
+  declares tgtencoding itself, it is returned unchanged without being checked.
+
+  Raises CanNotDecode if no candidate encoding can decode the text.
+
+  maybe it can be useful system-wide
+  """
+  _encodings=encodings[:] # local copy
+  _encodings.insert(0,tgtencoding) # if not declared or declared wrongly, we try
+  m=rx_charset.search(s)
+  if m and m.group(1):
+    enc=m.group(1).lower()
+    if enc==tgtencoding:
+      return s
+    if enc in _encodings:
+      _encodings.remove(enc)
+    _encodings.insert(0,enc) # we'll start from what we've found
+  for enc in _encodings:
+    try:
+      return s.decode(enc).encode(tgtencoding)
+    except (UnicodeDecodeError, LookupError):
+      # the tuple is required: "except A, B" would catch only A and bind it
+      # to the name B, letting the LookupError raised for an unknown
+      # encoding name escape
+      pass
+  raise CanNotDecode('sorry')
+
+def _convertEntities(txt,rx,mapper=None):
+  """
+  Replace every match of rx in txt by the character it stands for,
+  encoded in tgtencoding.
+
+  rx must contain exactly one capturing group. The captured string is
+  either the decimal code point itself (mapper is None) or a key looked up
+  in mapper to obtain the code point. Entities unknown to mapper, and code
+  points that unichr() can not represent, are replaced by an empty string.
+  """
+  def repl(code):
+    if mapper:
+      code=mapper.get(code)
+    if code is None:
+      return ''
+    try:
+      return unichr(int(code)).encode(tgtencoding)
+    except (ValueError, OverflowError):
+      # code point outside of the range supported by unichr
+      return ''
+  # with one capturing group, re.split returns plain text at even positions
+  # and the captured codes at odd positions
+  res=re.split(rx,txt)
+  res[1::2]=[repl(code) for code in res[1::2]]
+  return ''.join(res)
+
+# decimal character references of any length, like &#38; or &#8217;
+rx_chars=re.compile(r'&#(\d+);')
+# named entities; the longest name in name2codepoint (thetasym) has 8 letters
+rx_ents=re.compile(r'&(\w{1,8});')
+
+def convertEntities(txt):
+  """
+  Convert decimal character references first, then named entities,
+  into characters encoded in tgtencoding.
+  """
+  txt=_convertEntities(txt,rx_chars)
+  txt=_convertEntities(txt,rx_ents, name2codepoint)
+  return txt
+
+class ExternalWebPage(ExternalDocument):
+  """
+  caching sources from outside
+
+  When recursive spidering is requested, _processData creates one
+  'External Web Page' subobject for every link found below the same base
+  url. Urls already spidered are recorded in the urldict attribute of the
+  top object (see _findTopObject).
+  """
+  # CMF Type Definition
+  meta_type = 'ERP5 External Web Page'
+  portal_type = 'External Web Page'
+  isPortalContent = 1
+  isRADContent = 1
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  # Default Properties
+  property_sheets = ( PropertySheet.Base
+                    , PropertySheet.CategoryCore
+                    , PropertySheet.DublinCore
+                    , PropertySheet.Version
+                    , PropertySheet.Reference
+                    , PropertySheet.DMSFile
+                    , PropertySheet.Document
+                    , PropertySheet.Url
+                    , PropertySheet.ExternalDocument
+                    )
+
+  def _findTopObject(self):
+    '''
+    find the top object from which the spidering began
+    we search upwards until we find an object with a non-empty urldict
+    or reach the portal object (in which case self is returned)
+    the top object is the one that is maintaining the dictionary
+    I think we have to do it instead of using simple acquisition
+    because we have to find a non-empty one
+    '''
+    if hasattr(self,'urldict') and len(self.urldict)>0:
+      return self
+    portal=self.getPortalObject()
+    ob=self
+    while 1:
+      ob=ob.aq_parent
+      if ob==portal:
+        return self
+      if hasattr(ob,'urldict') and len(ob.urldict)>0:
+        return ob
+
+  security.declareProtected(Permissions.ModifyPortalContent,'addUrl')
+  def addUrl(self,url):
+    '''
+    record url that has already been spidered
+    '''
+    self.urldict[url]=1
+    # urldict is mutated in place, so the change has to be flagged by hand
+    self._p_changed=1
+
+  security.declareProtected(Permissions.ModifyPortalContent,'checkUrl')
+  def checkUrl(self,url):
+    '''
+    check if the url has already been spidered
+    '''
+    return url in self.urldict
+
+  security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+  def resetTopObject(self):
+    '''
+    reset the url dictionary
+    remember to do it before you start recursive spidering
+    '''
+    self.urldict={}
+    self._p_changed=1
+
+  def _processData(self,s):
+    '''
+    process the source s of the spidered page
+
+    records the url of this page in the top object, deletes the current
+    'External Web Page' subobjects and, if recursive spidering is on and
+    the recursion depth is not exhausted, creates a subobject for every
+    not yet spidered link below the base url of this page and schedules
+    its spidering
+
+    returns the text converted to utf-8 with markup removed and entities
+    converted, or False (after setting a status message) if the source
+    could not be decoded
+    '''
+    top=self._findTopObject()
+    # record my url in top object
+    top.addUrl(self.getQualifiedUrl())
+    # remove current subobjects
+    self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
+    if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
+      # first find links in text
+      rx_link=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+      # the base url is the same for every link, compute it once
+      baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
+      for ref in rx_link.findall(s):
+        if ref.startswith('javascript'):
+          continue
+        # XXX not sure where to store those already spidered
+        # for now, the only precaution against infinite loop is recursion depth
+        # select internal links
+        if not ref.startswith('http'):
+          # complete relative paths
+          ref=baseref+'/'+ref
+        # create subobjects
+        if ref.startswith(baseref) and not top.checkUrl(ref):
+          n=self.newContent(portal_type='External Web Page')
+          # set coordinates
+          n.setUrlProtocol('http')
+          n.setUrlString(ref)
+          n.setOptionRecursively(1)
+          n.setRecursionDepth(self.getRecursionDepth()-1)
+          # copy attributes
+          for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+            n.setProperty(atr,self.getProperty(atr))
+          n.activate(activity='SQLQueue').ExternalDocument_spiderAndSetState()
+    # process self
+    # here we check encoding and convert to UTF8
+    try:
+      s=recode(s)
+    except CanNotDecode:
+      # s is still the raw source here, so its length is what was spidered
+      self.setStatusMessage("Spidered on %s, %i chars, but could not decode" % (self._time(), len(s)))
+      return False
+    s=stripHtml(s) # remove headers, doctype and the like
+    s=clearHtml(s) # remove tags
+    s=convertEntities(s) # convert charrefs and named entities
+    return s
+
+
+# vim: filetype=python syntax=python shiftwidth=2 
--- a/product/ERP5OOo/PropertySheet/ExternalDocument.py
+++ b/product/ERP5OOo/PropertySheet/ExternalDocument.py
@@ -7,5 +7,14 @@ class ExternalDocument:
            'description' : 'message about status',
            'type'        : 'string',
            'mode'        : 'w' },
+        {   'id'          : 'option_recursively',
+            'description' : 'do we want recursive spidering (meaningless in some classes)',
+            'type'        : 'int',
+            'mode'        : 'w'},
+        {   'id'          : 'recursion_depth',
+            'description' : 'how deep should recursive spidering be (0 - no recursion) (meaningless in some classes)',
+            'type'        : 'int',
+            'default'     : 5,
+            'mode'        : 'w'},
        )