# discoverable.py
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
#                    Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from AccessControl import ClassSecurityInfo, getSecurityManager
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import convertToUpperCase
from Products.CMFCore.utils import getToolByName
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
import os
import re

try:
  import magic
except ImportError:
  magic = None

# Source names accepted in the discovery order list read by
# DiscoverableMixin.discoverMetadata; any other name is rejected there.
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'file_name',
                        'input')

# Conversion format key under which getContentInformation stores its
# result in the conversion cache.
CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'

class DiscoverableMixin(CachedConvertableMixin):
  """
  Implements IDiscoverable.
  This class provides methods useful for metadata extraction.
  It inherits from CachedConvertableMixin to access the
  cache storage API, as computed data needs to be stored
  in the same backend.
  """
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromUserLogin')
  def getPropertyDictFromUserLogin(self, user_login=None):
    """
    Return the properties to set on the document, as found by the
    type based method 'getPropertyDictFromUserLogin' (fallback script:
    'Document_getPropertyDictFromUserLogin') for the given login.

    user_login - login string; when None, the string form of the user
                 held by the current security manager is used.
    """
    login = user_login
    if login is None:
      # No explicit login: fall back to the currently authenticated user
      current_user = getSecurityManager().getUser()
      login = str(current_user)
    return self._getTypeBasedMethod(
        'getPropertyDictFromUserLogin',
        fallback_script_id='Document_getPropertyDictFromUserLogin')(login)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromContent')
  def getPropertyDictFromContent(self):
    """
    Return the properties to set on the document, as found by the
    type based method 'getPropertyDictFromContent' (fallback script:
    'Document_getPropertyDictFromContent').
    An empty dictionary is returned when the document has no content.
    """
    # The data is read through the conversion API
    _content_type, data = self.convert(None)
    if data:
      return self._getTypeBasedMethod(
          'getPropertyDictFromContent',
          fallback_script_id='Document_getPropertyDictFromContent')()
    # Nothing can be discovered from an empty document
    return {}

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
    Return the properties to set on the document for the given file name.
    The lookup is delegated to portal_contributions.
    """
    contribution_tool = self.portal_contributions
    return contribution_tool.getPropertyDictFromFilename(filename)


  # Alias of getPropertyDictFromFilename under the 'FileName' spelling,
  # protected by the same permission.
  # NOTE(review): presumably this is what the 'file_name' order key of
  # discoverMetadata resolves to - confirm against convertToUpperCase.
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFileName')
  getPropertyDictFromFileName = getPropertyDictFromFilename

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromInput')
  def getPropertyDictFromInput(self, input_parameter_dict):
    """
    Pass input_parameter_dict to the type based method
    'getPropertyDictFromInput' and return its result.
    No fallback script is given for this lookup.
    """
    return self._getTypeBasedMethod('getPropertyDictFromInput')(
        input_parameter_dict)

  ### Metadata disovery and ingestion methods
  security.declareProtected(Permissions.ModifyPortalContent,
                            'discoverMetadata')
  def discoverMetadata(self, filename=None, user_login=None,
                       input_parameter_dict=None):
    """
    This is the main metadata discovery function - controls the process
    of discovering data from various sources. The discovery itself is
    delegated to scripts or uses preference-configurable regexps. The
    method returns either self or the document which has been
    merged in the discovery process.

    filename - this parameter is a file name of the form "AA-BBB-CCC-223-en"

    user_login - this is a login string of a person; can be None if the user is
                 currently logged in, then we'll get him from session
    input_parameter_dict - arguments provided to Create this content by user.

    Returns the result of migratePortalType when the resolved portal type
    differs from the current one, otherwise the result of mergeRevision.

    Raises AttributeError when the order list contains a key which is not
    in VALID_ORDER_KEY_LIST.
    """
    # Preference is made of a sequence of 'user_login', 'content', 'filename', 'input'
    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList')
    order_list = list(method())
    # Walk the list backwards: values found by sources listed earlier
    # overwrite the ones found by sources listed later.
    order_list.reverse()
    # build a dictionary according to the order
    kw = {}
    for order_id in order_list:
      result = None
      if order_id not in VALID_ORDER_KEY_LIST:
        # Prevent security attack or bad preferences
        raise AttributeError(
          "%s is not in valid order key list" % (order_id,))
      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
      method = getattr(self, method_id)
      # Sources which need an argument are skipped when it was not provided
      if order_id in ('filename', 'file_name',):
        if filename is not None:
          result = method(filename)
      elif order_id == 'user_login':
        if user_login is not None:
          result = method(user_login)
      elif order_id == 'input':
        if input_parameter_dict is not None:
          result = method(input_parameter_dict)
      else:
        result = method()
      if result is not None:
        for key, value in result.items():
          # Empty values never override an already discovered one
          if value not in (None, ''):
            kw[key] = value
    # Prepare the content edit parameters
    portal_type = None
    if input_parameter_dict is not None:
      # User decision take precedence, never try to change this value
      portal_type = input_parameter_dict.get('portal_type')
    if not portal_type:
      # Read discovered portal_type
      portal_type = kw.pop('portal_type', None)
    if portal_type and portal_type != self.getPortalType():
      # Reingestion is required to update portal_type
      return self.migratePortalType(portal_type)
    # Try not to invoke an automatic transition here
    self._edit(**kw)
    if not portal_type:
      # If no portal_type was discovered, pass self
      # through to portal_contribution_registry
      # to guess destination portal_type against all properties.
      # If returned portal_type is different, then reingest.
      registry = getToolByName(self.getPortalObject(),
                              'portal_contribution_registry')
      portal_type = registry.findPortalTypeName(context=self)
      if portal_type != self.getPortalType():
        return self.migratePortalType(portal_type)
    # Finish ingestion by calling method
    self.finishIngestion() # XXX - is this really the right place ?
    self.reindexObject() # XXX - is this really the right place ?
    # Revision merge is tightly coupled
    # to metadata discovery - refer to the documentation of mergeRevision method
    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
    merged_doc.reindexObject() # XXX - is this really the right place ?
    return merged_doc # XXX - is this really the right place ?

  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
  def finishIngestion(self):
    """
    Complete the ingestion by calling the type based method
    'finishIngestion' (fallback script: 'Document_finishIngestion')
    and return its result. Such a script can for example allocate
    a reference number automatically if no reference was defined.
    """
    finish_script = self._getTypeBasedMethod(
        'finishIngestion',
        fallback_script_id='Document_finishIngestion')
    return finish_script()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentTypeFromContent')
  def getContentTypeFromContent(self):
    """
    Return the content_type detected from the document content.
    None is returned when the document is empty or when the magic
    module could not be imported.
    This method is called by portal_contribution_registry
    """
    _content_type, data = self.convert(None)
    if not data or magic is None:
      return None
    # This will be delegated soon to external web service
    # like cloudooo
    # ERP5 will no longer handle data itself.
    return magic.Magic(mime=True).from_buffer(data)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getExtensionFromFilename')
  def getExtensionFromFilename(self, filename=None):
    """
    Return the lower-cased extension of filename, without its leading
    dot. An empty string is returned when there is no extension.
    When filename is empty or not given, getStandardFilename() is used.
    """
    name = filename or self.getStandardFilename()
    suffix = os.path.splitext(name)[1]
    # suffix is either empty or starts with the dot to strip
    return suffix[1:].lower()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
    Return the content information of the document through the
    conversion cache storage: the value stored under
    CONTENT_INFORMATION_FORMAT is returned when the lookup succeeds,
    otherwise the private implementation is called and its result
    is stored before being returned.
    """
    cache_key = CONTENT_INFORMATION_FORMAT
    try:
      _mime, information = self.getConversion(format=cache_key)
    except KeyError:
      # Lookup failed: compute the value and store it for next time
      information = self._getContentInformation()
      self.setConversion(information, format=cache_key)
    return information

  def _getContentInformation(self):
    """
    Returns the content information from the HTML conversion.
    The default implementation tries to build a dictionary
    from the HTML conversion of the document and extract
    the document title.
    """
    result = {}
    html = self.asEntireHTML()
    if not html:
      return result
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      result['title'] = title_list[0]
    return result