ContributionTool.py 27.1 KB
Newer Older
1
# -*- coding: utf-8 -*-
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
##############################################################################
#
# Copyright (c) 2007 Nexedi SARL and Contributors. All Rights Reserved.
#                    Jean-Paul Smets <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

30
import cStringIO
31 32
import re
import string
33 34
import socket
import md5
Jean-Paul Smets's avatar
Jean-Paul Smets committed
35
import urllib2, urllib
36

Bartek Górny's avatar
Bartek Górny committed
37
from AccessControl import ClassSecurityInfo, getSecurityManager
38
from Products.ERP5Type.Globals import InitializeClass, DTMLFile
39
from Products.CMFCore.utils import getToolByName, _checkPermission
40 41 42
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir
Jean-Paul Smets's avatar
Jean-Paul Smets committed
43 44
from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list

45 46 47
from zLOG import LOG
from DateTime import DateTime
from Acquisition import aq_base
48
from zExceptions import BadRequest
49

50 51 52 53 54 55 56
# Install openers
import ContributionOpener
opener = urllib2.build_opener(ContributionOpener.DirectoryFileHandler)
urllib2.install_opener(opener)

# A temporary hack until urllib2 supports timeout setting - XXX
import socket
57
socket.setdefaulttimeout(600) # 10 minute timeout (value is in seconds)
58 59

# Global parameters
60
TEMP_NEW_OBJECT_KEY = '_v_new_object'
61
MAX_REPEAT = 10
62 63

_marker = []  # Create a new marker object.
64 65 66 67

class ContributionTool(BaseTool):
  """
    ContributionTool provides an abstraction layer to unify the contribution
68
    of documents into an ERP5 Site.
69

70 71
    ContributionTool needs to be configured in portal_types (allowed contents) so
    that it can store Text, Spreadsheet, PDF, etc. 
72

73 74 75
    The main method of ContributionTool is newContent. This method can
    be provided various parameters from which the portal type and document
    metadata can be derived. 
76 77

    Configuration Scripts:
Jean-Paul Smets's avatar
Jean-Paul Smets committed
78

79 80 81
      - ContributionTool_getPropertyDictFromFileName: receives file name and a 
        dict derived from filename by regular expression, and does any necessary
        operations (e.g. mapping document type id onto a real portal_type).
Jean-Paul Smets's avatar
Jean-Paul Smets committed
82 83 84 85 86 87

    Problems which are not solved

      - handling of relative links in HTML contents (or others...)
        some text rewriting is necessary.

88 89 90 91 92 93
  """
  title = 'Contribution Tool'
  id = 'portal_contributions'
  meta_type = 'ERP5 Contribution Tool'
  portal_type = 'Contribution Tool'

Jean-Paul Smets's avatar
Jean-Paul Smets committed
94 95 96
  # Regular expressions
  simple_normaliser = re.compile('#.*')

97 98 99 100 101 102 103
  # Declarative Security
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.ManagePortal, 'manage_overview' )
  manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )

  security.declareProtected(Permissions.AddPortalContent, 'newContent')
104
  def newContent(self, id=None, portal_type=None, url=None, container=None,
                       container_path=None,
                       discover_metadata=1, temp_object=0,
                       user_login=None, data=None, file_name=None, **kw):
    """
      The newContent method is overridden to implement smart content
      creation by detecting the portal type based on whatever information
      was provided and finding out the most appropriate module to store
      the content.

      id -- id of the new document. For URL contributions it defaults
      to the encoded URL (see encodeURL).

      portal_type -- if not provided, it is guessed from the file name,
      the mime type and the data (see _guessPortalType).

      user_login is the name under which the content will be created
      XXX - this is a security hole which needs to be fixed by
      making sure only Manager can use this parameter

      container -- if specified, it is possible to define
      where to contribute the content. Else, ContributionTool
      tries to guess.

      container_path -- if specified, defines the container path
      and has precedence over container

      url -- if specified, content is downloaded from the URL.

      data, file_name -- raw content and its file name, for channels
      which cannot provide a file object (ex. email ingestion).

      discover_metadata -- passed to _setObject, which uses it to decide
      whether metadata discovery must be activated.

      temp_object -- if true, the standard BaseTool.newContent is used.

      Raises ValueError if no file could be obtained, and Unauthorized if
      an existing document with the same reference, version and language
      was found but can not be modified by the current user.

      NOTE:
        We always generate ID. So, we must prevent using the one
        which we were provided.
    """
    # Imported here because the module header does not import Unauthorized
    # (the name was previously undefined and raised a NameError).
    from AccessControl import Unauthorized

    if file_name is not None:
      kw['file_name'] = file_name
    if data is not None:
      # This is only used to make sure
      # we can pass file as parameter to ZPublisher
      # whenever we ingest email
      kw['data'] = data

    document = None

    # Try to find the file_name
    mime_type = None
    if not url:
      # check if file was provided
      file = kw.get('file', None)
      if file is not None and file_name is None:
        file_name = file.filename
      else:
        # some channels supply data and file-name separately
        # this is the case for example for email ingestion
        # in this case, we build a file wrapper for it
        data = kw.get('data', None)
        if data is not None:
          file_name = kw.get('file_name', None)
          if file_name is not None:
            file = cStringIO.StringIO()
            file.write(data)
            file.seek(0)
            kw['file'] = file
            del kw['data']
            del kw['file_name']
    else:
      # build a new file from the url
      url_file = urllib2.urlopen(url)
      try:
        data = url_file.read() # time out must be set or ... too long XXX
        # Read the headers before closing the response
        if hasattr(url_file, 'headers'):
          headers = url_file.headers
          if hasattr(headers, 'type'):
            mime_type = headers.type
            kw['content_type'] = mime_type
      finally:
        # Always release the connection, even if the download failed.
        # Custom openers may return objects without a close method.
        close = getattr(url_file, 'close', None)
        if close is not None:
          close()
      file = cStringIO.StringIO()
      file.write(data)
      file.seek(0)
      # Create a file name based on the URL and quote it
      file_name = url.split('/')[-1] or url.split('/')[-2]
      file_name = urllib.quote(file_name, safe='')
      file_name = file_name.replace('%', '')
      # For URLs, we want an id by default equal to the encoded URL
      if id is None:
        id = self.encodeURL(url)
      kw['file'] = file

    # If the portal_type was provided, we can go faster
    if portal_type and container is None:
      # We know the portal_type, let us find the default module
      # and use it as container
      try:
        container = self.getDefaultModule(portal_type)
      except ValueError:
        container = None

    # XXX - Once the rest is stable, a document could be created and
    # returned immediately here when both portal_type and container
    # are known (using the module ID generator rather than the provided ID).

    # From here, there is no hope unless a file was provided
    if file is None:
      raise ValueError("could not determine portal type")

    #
    # Check if the same file already exists. If it exists, then update it.
    #
    if portal_type is None:
      portal_type = self._guessPortalType(file_name, mime_type, data)
      property_dict = self.getMatchedFileNamePatternDict(file_name)
      reference = property_dict.get('reference', None)
      version = property_dict.get('version', None)
      language = property_dict.get('language', None)
      if portal_type and reference and version and language:
        portal_catalog = getToolByName(self, 'portal_catalog')
        document = portal_catalog.getResultValue(portal_type=portal_type,
                                                  reference=reference,
                                                  version=version,
                                                  language=language)
        if document is not None:
          # document is already uploaded. So overrides file.
          if not _checkPermission(Permissions.ModifyPortalContent, document):
            raise Unauthorized("[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId())
          document.edit(file=kw['file'])
          return document

    # Temp objects use the standard newContent from Folder
    if temp_object:
      return BaseTool.newContent(self, id=id, portal_type=portal_type,
                                 temp_object=temp_object, **kw)

    # Then put the file inside ourselves for a short while
    if container_path is not None:
      container = self.getPortalObject().restrictedTraverse(container_path)
    document = self._setObject(file_name, None, portal_type=portal_type,
                               user_login=user_login, id=id,
                               container=container,
                               discover_metadata=discover_metadata,
                               )
    object_id = document.getId()
    document = self._getOb(object_id) # Call _getOb to purge cache
    # Give a type based script the chance to rewrite the ingestion parameters
    rewrite_method = document._getTypeBasedMethod('rewriteIngestionData')
    if rewrite_method is not None:
      modified_kw = rewrite_method(**kw.copy())
      if modified_kw is not None:
        kw.update(modified_kw)

    # Then edit the document contents (so that upload can happen)
    document._edit(**kw)
    # if no content_type has been set, guess it
    if 'content_type' not in kw and getattr(document, 'guessMimeType', None) is not None:
      # For File force to setup the mime_type
      document.guessMimeType(fname=file_name)
    if url:
      document.fromURL(url)

    # Notify workflows
    #document.notifyWorkflowCreated()

    # Allow reindexing, reindex it and return the document
    try:
      delattr(document, 'isIndexable')
    except AttributeError:
      # Document does not have such attribute
      pass
    document.reindexObject()
    return document

269
  security.declareProtected( Permissions.AddPortalContent, 'newXML' )
270 271 272 273 274 275 276
  def newXML(self, xml):
    """
      Placeholder for the creation of content out of XML data, meant
      for contributions coming from another application.

      Not implemented: the xml parameter is ignored and None is returned.
    """
    return None

277 278
  security.declareProtected(Permissions.ModifyPortalContent,'getMatchedFileNamePatternDict')
  def getMatchedFileNamePatternDict(self, file_name):
    """
      Return the named groups obtained by matching file_name against
      the regular expression defined in preferences
      (getPreferredDocumentFileNameRegularExpression).

      An empty dict is returned if file_name is None, if no regular
      expression is configured or if file_name does not match.
    """
    if file_name is None:
      return {}

    preference_tool = self.portal_preferences
    regex_text = preference_tool.getPreferredDocumentFileNameRegularExpression()
    if not regex_text:
      # Nothing configured
      return {}

    matched = re.compile(regex_text).match(file_name)
    if matched is None:
      # no match
      return {}
    return matched.groupdict()

  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
  def getPropertyDictFromFileName(self, file_name):
    """
      Derive document properties from a file name.

      The named groups matched by the regular expression set in
      preferences are handed over, together with the file name, to the
      type based method 'getPropertyDictFromFileName' (fallback script:
      ContributionTool_getPropertyDictFromFileName).

      In the returned dict, 'portal_type' (when present) is a collection
      of candidate portal types.
    """
    if file_name is None:
      return {}
    matched_dict = self.getMatchedFileNamePatternDict(file_name)
    method = self._getTypeBasedMethod(
        'getPropertyDictFromFileName',
        fallback_script_id = 'ContributionTool_getPropertyDictFromFileName')
    property_dict = method(file_name, matched_dict)
    portal_type = property_dict.get('portal_type', None)
    if portal_type is not None:
      # portal_type is returned as a tuple because
      # multiple candidate types must be allowed
      property_dict['portal_type'] = (portal_type,)
    elif '.' in file_name:
      # no type found: look for candidates by file extension
      extension = file_name.split('.')[-1]
      property_dict['portal_type'] = \
          self.ContributionTool_getCandidateTypeListByExtension(extension)
    return property_dict

323
  # WebDAV virtual folder support
324 325
  def _setObject(self, name, ob, portal_type=None, user_login=None,
                 container=None, id=None, discover_metadata=1):
    """
      Create (or retrieve) the document which will hold a contribution
      and return it.

      name -- file name of the contribution; it is used as id when
      no portal_type is known and is passed to discoverMetadata.

      ob -- when not None (call from webdav API), portal_type,
      container and id are taken from this object.

      The ContributionTool instance must be configured in such
      way that _verifyObjectPaste will return TRUE.
    """
    # _setObject is called by constructInstance at a time
    # when the object has no portal_type defined yet. It
    # will be removed later on. We can safely store the
    # document inside us at this stage. Else we
    # must find out where to store it.
    if ob is not None:
      # Call from webdav API: redefine parameters
      portal_type = ob.getPortalType()
      container = ob.getParentValue()
      id = ob.getId()

    if not portal_type:
      # No portal type: keep the document inside ourselves
      return BaseTool.newContent(self, id=name,
                                 portal_type=portal_type,
                                 is_indexable=0)

    # The portal_type is known: find the module
    # in which the document must be stored
    target = container
    if target is None:
      target = self.getDefaultModule(portal_type)
    document_id = id
    if document_id is None:
      document_id = target.generateNewId()

    document = target._getOb(document_id, None)
    if document is None:
      # There is no preexisting document - create it
      document = target.newContent(id=document_id,
                                   portal_type=portal_type,
                                   is_indexable=0)
      if discover_metadata:
        # Metadata discovery is done as an activity by default.
        # If we need to discoverMetadata synchronously, it must
        # be for user interface and should thus be handled by
        # ZODB scripts
        conversion_method_id_list = ('convertToBaseFormat',
                                     'Document_tryToConvertToBaseFormat')
        document.activate(
            after_path_and_method_id=(document.getPath(),
                                      conversion_method_id_list)
          ).discoverMetadata(file_name=name, user_login=user_login)

    # Keep the document close to us - this is only useful for
    # file upload from webdav
    if hasattr(self, '_v_document_cache'):
      document_cache = self._v_document_cache
    else:
      document_cache = self._v_document_cache = {}
    document_cache[document.getId()] = document.getRelativeUrl()

    # Return document to newContent method
    return document
391

392 393 394 395 396
  def _getOb(self, id, default=_marker):
    """
    Look up a document by id, in this order:
      1. the volatile document cache filled by _setObject
         (the entry is removed once it has been used),
      2. the objects really stored inside the tool,
      3. the catalog, using the uid prefix of ids built by listDAVObjects.

    Returns default if it was provided and nothing was found by the
    catalog lookup, else raises AttributeError.
    """
    # Use the document cache if possible and return result immediately
    # this is only useful for webdav
    if hasattr(self, '_v_document_cache'):
      cached_url = self._v_document_cache.get(id, None)
      if cached_url is not None:
        del self._v_document_cache[id]
        return self.getPortalObject().unrestrictedTraverse(cached_url)

    # Try first to return the real object inside
    # This is much safer than trying to access objects displayed by listDAVObjects
    # because the behaviour of catalog is unpredictable if a string is passed
    # for a UID. For example
    #   select path from catalog where uid = "001193.html";
    # will return the same as
    #   select path from catalog where uid = 1193;
    # This was the source of an error in which the contribution tool
    # was creating a web page and was returning a Base Category
    # when
    #   o = folder._getOb(id)
    # was called in DocumentConstructor
    if default is _marker:
      stored = BaseTool._getOb(self, id)
    else:
      stored = BaseTool._getOb(self, id, default=default)
    if stored is not None:
      # a None result is ignored at this stage:
      # we can be more lucky with portal_catalog
      return stored

    # Return an object listed by listDAVObjects
    # ids are concatenation of uid + '-' + standard file name of documents
    uid = str(id).split('-', 1)[0]
    catalog = self.getPortalObject().portal_catalog
    found = catalog.unrestrictedGetResultValue(uid=uid)
    if found is not None:
      return found.getObject() # Make sure this does not break security. XXX
    if default is _marker:
      # Raise an AttributeError the same way as in OFS.ObjectManager._getOb
      raise AttributeError(id)
    return default

438

Bartek Górny's avatar
Bartek Górny committed
439
  def listDAVObjects(self):
    """
      Return a generator over the contents listed for WebDAV.

      The list is provided by the ContributionTool_getMyContentList
      script if it can be found (which helps customisation), else by a
      catalog search on the portal types returned by
      getPortalMyDocumentTypeList, with the current user as owner.

      Each document is yielded as a context whose id is
      '<uid>-<standard file name>'.
    XXX Killer feature, it is not scalable
    """
    custom_lister = getattr(self, 'ContributionTool_getMyContentList', None)
    if custom_lister is None:
      current_user = getSecurityManager().getUser()
      result_list = self.portal_catalog(
          portal_type=self.getPortalMyDocumentTypeList(),
          owner=str(current_user))
    else:
      result_list = custom_lister()

    def iterDAVContext(entry_list):
      # Lazily wrap every result with its WebDAV id
      for entry in entry_list:
        document = entry.getObject()
        dav_id = '%s-%s' % (document.getUid(), document.getStandardFileName(),)
        yield document.asContext(id=dav_id)

    return iterDAVContext(result_list)
Bartek Górny's avatar
Bartek Górny committed
461

Jean-Paul Smets's avatar
Jean-Paul Smets committed
462
  # Crawling methods
463 464
  security.declareProtected(Permissions.View, 'normaliseURL')
  def normaliseURL(self, url, base_url=None):
    """
      Returns a normalised version of the url so
      that we do not download twice the same content.

      Everything from the first '#' is removed. Then, unless the
      protocol is listed in no_host_protocol_list, a url without
      any ':' is considered relative and is prefixed with base_url
      (if base_url is provided).

      URL normalisation is an important part in crawlers.
      The current implementation is obviously simplistic.
      Refer to http://en.wikipedia.org/wiki/Web_crawler
      and study Harvestman for more ideas.
    """
    url = self.simple_normaliser.sub('', url)
    part_list = url.split(':')
    is_hostless = part_list[0] in no_host_protocol_list
    if not is_hostless and base_url and len(part_list) == 1:
      # Make relative URL absolute
      return '%s/%s' % (base_url, url)
    return url

483 484
  security.declareProtected(Permissions.View, 'encodeURL')
  def encodeURL(self, url):
    """
    Returns the URL as an ID. ID should be chosen in such
    way that it is optimal with HBTreeFolder (ie. so that
    distribution of access time on a cluster is possible)

    The ID is '<domain>-<md5 of url>' where domain is the first non
    empty path segment found after the protocol, quoted and stripped
    of '%' characters. If no such segment exists, the ID is the
    md5 hex digest alone.

    A URL without any ':' (no protocol) is accepted: the domain is
    then searched in the whole URL.

    NOTE: alternate approach is based on a url table
    and catalog lookup. Is it faster ? Not sure. Since
    we must anyway insert objects in btrees and this
    is similar in cost to accessing them.
    """
    # Produce an MD5 from the URL
    hex_md5 = md5.md5(url).hexdigest()
    # Take the first part in the URL which is not empty.
    # Only the part between the first and the second ':' is considered,
    # so that a port number never ends up in the ID.
    url_part_list = url.split(':')
    if len(url_part_list) > 1:
      url_segment = url_part_list[1]
    else:
      # No protocol in this URL
      url_segment = url_part_list[0]
    url_domain = None
    for url_part in url_segment.split('/'):
      if url_part:
        url_domain = url_part
        break
    # Return encoded url
    if url_domain:
      url_domain = urllib.quote(url_domain, safe='')
      url_domain = url_domain.replace('%', '')
      return "%s-%s" % (url_domain, hex_md5)
    return hex_md5

  security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
518
  def crawlContent(self, content, container=None):
    """
      Analyses content and activates the download of linked pages.

      content -- the document whose URLs (getContentURLList) are crawled

      container -- where new documents are created; defaults to the
      parent of content.

      XXX: missing is the conversion of content local href to something
      valid.
    """
    depth = content.getCrawlingDepth()
    if depth < 0:
      # Do nothing if crawling depth is reached
      # (this is not a duplicate code but a way to prevent
      # calling isIndexContent unnecessarily)
      return
    if not content.isIndexContent():
      # Decrement depth only if it is a content document
      depth = depth - 1
    if depth < 0:
      # Do nothing if crawling depth is reached
      return

    base_url = content.getContentBaseURL()
    normalised_url_list = [self.normaliseURL(raw_url, base_url)
                           for raw_url in set(content.getContentURLList())]
    for target_url in set(normalised_url_list):
      # Some url protocols should not be crawled
      protocol = target_url.split(':')[0]
      if protocol in no_crawl_protocol_list:
        continue
      if container is None:
        container = content.getParentValue()
      # Calculate the id under which content will be stored
      target_id = self.encodeURL(target_url)
      # Try to access the document if it already exists
      existing = container.get(target_id, None)
      if existing is None:
        # XXX - This call is not working due to missing group_method_id
        # therefore, multiple call happen in parallel and eventually fail
        # (the same URL is created multiple times)
        self.activate(activity="SQLQueue").newContentFromURL(
            container_path=container.getRelativeUrl(),
            id=target_id, url=target_url, crawling_depth=depth)
      elif depth and existing.getCrawlingDepth() < depth:
        # Update the crawling depth if necessary
        existing._setCrawlingDepth(depth)
        existing.activate().crawlContent()
Jean-Paul Smets's avatar
Jean-Paul Smets committed
562 563

  security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
564
  def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
    """
      Updates an existing content by downloading it again from its URL.

      content -- the document to update

      repeat -- remaining number of retries; when a download fails, the
      update is postponed through an activity, or the error is raised
      again once repeat reaches 0.

      crawling_depth -- set on content if greater than its current depth
    """
    # First, test if the document is updatable according to
    # its workflow states (if it has a workflow associated with)
    if not content.isUpdatable():
      return

    # Step 0: update crawling_depth if required
    if crawling_depth > content.getCrawlingDepth():
      content._setCrawlingDepth(crawling_depth)

    # Step 1: download new content
    try:
      url = content.asURL()
      data = urllib2.urlopen(url).read()
      new_file = cStringIO.StringIO()
      new_file.write(data)
      new_file.seek(0)
    except urllib2.URLError:
      # urllib2.HTTPError is a subclass of urllib2.URLError:
      # both are handled the same way
      if repeat == 0:
        # XXX - Call the extendBadURLList method,--NOT Implemented--
        # IDEA: add this url to a "bad_url_list"; then, while crawling,
        #       instead of only looping over the urls extracted from the
        #       web page, also check that the url is not part of
        #       bad_url_list
        raise
      content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
      return

    # Step 2: compare and update if necessary (md5)
    new_content_md5 = md5.md5(data).hexdigest()
    if content.getContentMd5() == new_content_md5:
      return
    # Please make sure that if content is the same we do not update it.
    # This feature must be implemented by Base or File,
    # not here (look at _edit in Base)
    content._edit(file=new_file)

    # Step 3: convert to base format
    content.convertToBaseFormat()
    # Step 4: activate populate (unless interaction workflow does it)
    content.activate().populateContent()
    # Step 5: activate crawlContent
    if content.getCrawlingDepth() > 0:
      content.activate().crawlContent()
    content.setContentMd5(new_content_md5)
Jean-Paul Smets's avatar
Jean-Paul Smets committed
616 617

  security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
618
  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, repeat_interval=1, batch_mode=True, **kw):
    """
      A wrapper method for newContent which provides extra safety
      in case of download errors (urllib2.HTTPError, urllib2.URLError).

      If the document already exists in the container, it is returned
      as is. Else newContent is called and crawling is activated on
      the new document if its crawling depth allows it.

      On error, the method postpones itself through an activity
      (at_date is now + repeat_interval) as long as repeat is positive.
      Once repeat reaches 0, the error is raised again if batch_mode
      is true.

      Returns the document, or None if it could not be created.

      NOTE: implementation needs to be done.
    """
    document = None
    # First of all, make sure do not try to create an existing document
    if container_path is not None and id is not None:
      container = self.restrictedTraverse(container_path)
      document = container.get(id, None)
      if document is not None:
        # Document already exists: no need to keep on crawling
        return document

    try:
      document = self.newContent(container_path=container_path, id=id, **kw)
      # An index document keeps on crawling even if crawling_depth is 0,
      # any other document stops crawling if crawling_depth is 0
      if (document.isIndexContent() and document.getCrawlingDepth() >= 0) \
          or document.getCrawlingDepth() > 0:
        document.activate().crawlContent()
    except urllib2.HTTPError:
      # Catch any HTTP error
      if repeat > 0:
        self.activate(at_date=DateTime() + repeat_interval).newContentFromURL(
                          container_path=container_path, id=id,
                          repeat=repeat - 1,
                          repeat_interval=repeat_interval, **kw)
      elif repeat == 0 and batch_mode:
        # here we must call the extendBadURLList method,--NOT Implemented--
        # which had to add this url to bad URL list, so next time we avoid
        # crawling bad URL
        raise
    except urllib2.URLError:
      #if getattr(error.reason,'args',None):
        #if error.reason.args[0] == socket.EAI_AGAIN:
          ## Temporary failure in name resolution - try again in 1 day
      if repeat > 0:
        self.activate(at_date=DateTime() + repeat_interval,
                      activity="SQLQueue").newContentFromURL(
                        container_path=container_path, id=id,
                        repeat=repeat - 1,
                        repeat_interval=repeat_interval, **kw)
      elif repeat == 0 and batch_mode:
        # XXX - Call the extendBadURLList method, --NOT Implemented--
        raise
    return document
Jean-Paul Smets's avatar
Jean-Paul Smets committed
671

672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
  def _guessPortalType(self, name, typ, body):
    """
       Ask the Portal Contribution Registry which portal_type
       should be used for the given name, typ and body.

       If portal_contribution_registry can not be found,
       content_type_registry is used instead (backward compatibility).
       Returns None if none of these tools can be found.
    """
    registry = getToolByName(self, 'portal_contribution_registry', None)
    if registry is not None:
      return registry.findPortalTypeName(name, typ, body)

    # Keep backward compatibility
    legacy_registry = getToolByName(self, 'content_type_registry', None)
    if legacy_registry is None:
      return None
    return legacy_registry.findTypeName(name, typ, body)

Ivan Tyagov's avatar
Ivan Tyagov committed
691
InitializeClass(ContributionTool)