# ExternalDocument.py

##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

import mimetypes, re, urllib
import sys
from htmlentitydefs import name2codepoint

from AccessControl import ClassSecurityInfo
from DateTime import DateTime
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5OOo.Document.DMSFile import DMSFile


class SpiderException(Exception):
  """
  Error carrying a numeric status code and its explanation.

  The exception message is formatted as "<code>: <msg>".
  """

  def __init__(self, code, msg):
    # code is rendered with %i, so it has to be numeric
    Exception.__init__(self, "%i: %s" % (code, msg))

class Opener(urllib.FancyURLopener):
  """
  URL opener which turns unhandled HTTP error codes into SpiderException.
  """

  def http_error_default(self, url, fp, code, msg, headers):
    """
    Fallback handler for HTTP error codes.

    url, fp, code, msg, headers -- as passed in by urllib; only fp, code
    and msg are used here.

    Always raises SpiderException(code, msg).
    """
    # Release the response before raising (the stock URLopener handler
    # does the same); otherwise the connection stays open until it is
    # garbage collected.
    if fp is not None:
      fp.close()
    raise SpiderException(code, msg)

class ExternalDocument(DMSFile):
  """
  Caches content coming from an outside source.

  This is basically an abstract class: classes deriving from it should
  override _processData, which is the method that does something with
  the character data obtained from the source.

  The spidering method supports http, ftp and file protocols, and
  possibly many others.
  """
  # CMF Type Definition
  meta_type = 'ERP5 External Document'
  portal_type = 'External Document'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.DMSFile
                    , PropertySheet.Document
                    , PropertySheet.Url
                    , PropertySheet.ExternalDocument
                    )

  # (title, url scheme) pairs offered as source protocols
  protocols=(('Web page','http'),('FTP site','ftp'),('Local file','file'),)

  searchable_attrs=DMSFile.searchable_attrs+('text_content',)

  security.declareProtected(Permissions.View, 'getProtocolList')
  def getProtocolList(self):
    """
    Return the list of supported url schemes ('http', 'ftp', ...).
    """
    return [x[1] for x in self.protocols]

  security.declareProtected(Permissions.View, 'getProtocolItemList')
  def getProtocolItemList(self):
    """
    Return the supported protocols as (title, url scheme) pairs.
    """
    return self.protocols

  # declarePrivate only takes names; it does not accept a permission
  security.declarePrivate('_spiderSource')
  def _spiderSource(self):
    """
    Fetch the content found at getQualifiedUrl().

    FancyURLopener can open various protocols.

    Returns a (data, info) tuple: what read() returned and the object
    returned by info() on the opened source. Errors raised while
    opening or reading are propagated to the caller.
    """
    opener = Opener()
    source = opener.open(self.getQualifiedUrl())
    try:
      return source.read(), source.info()
    finally:
      # do not leave the connection / file handle open
      source.close()

  security.declarePrivate('_processData')
  def _processData(self,s, inf):
    """
    Turn the spidered data into the text to be recorded.

    s -- data returned by _spiderSource
    inf -- info object returned by _spiderSource

    Must be implemented in subclasses; the base version always raises.
    """
    raise NotImplementedError('this should be implemented in subclass')

  security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
  def resetTopObject(self):
    '''
    abstract function for maintaining interface
    call before beginning recursive spidering
    used mostly in web pages
    '''
    pass

  # NOTE(review): this method modifies the document; the permission is
  # kept as originally written, confirm whether ModifyPortalContent
  # would be more appropriate.
  security.declareProtected(Permissions.View, 'spiderSource')
  def spiderSource(self):
    """
    Spiders the external datasource, processes the data and records
    the result as text content.

    Sets the external processing status message in every case.
    Returns True if it succeeded, False if it failed.
    """
    # sys.exc_info() is used instead of "except Exception, e" so that the
    # code does not depend on a version specific except syntax.
    # NOTE(review): "except Exception" may also swallow ZODB conflict
    # errors - confirm whether those should be re-raised here.
    try:
      s, inf = self._spiderSource()
    except Exception:
      e = sys.exc_info()[1]
      self.log(e,level=1)
      self.setExternalProcessingStatusMessage("Tried on %s: %s" % (self._time(),str(e)))
      return False
    chars = len(s)
    if chars == 0:
      self.setExternalProcessingStatusMessage("Tried on %s: got empty string" % self._time())
      return False
    try:
      s = self._processData(s,inf)
    except Exception:
      e = sys.exc_info()[1]
      self.log(e,level=1)
      self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not process; reason: %s" % (self._time(), chars, str(e)))
      return False
    self.setTextContent(s)
    self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, recorded %i chars" % (self._time(), chars, len(s)))
    return True

  security.declareProtected(Permissions.View, 'getQualifiedUrl')
  def getQualifiedUrl(self):
    """
    Return '<url protocol>://<url string>'; a missing part is replaced
    by an empty string.

    this should be in the Url, not here
    otherwise why does the url have a property 'url_protocol'?
    """
    return (self.getUrlProtocol() or '')+'://'+(self.getUrlString() or '')

  def _time(self):
    """
    Return the current date and time formatted as YYYY/MM/DD HH:MM:SS.
    """
    return DateTime().strftime('%Y/%m/%d %H:%M:%S')


# vim: syntax=python shiftwidth=2