document.erp5.BigFile.py 14.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
#               2012 Nexedi SA and Contributors. All Rights Reserved.
#                    Romain Courteaud <romain@nexedi.com>
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

17
from six.moves import cStringIO as StringIO
18 19 20
from AccessControl import ClassSecurityInfo
from Products.ERP5Type import Permissions, PropertySheet
from Products.ERP5Type.Base import removeIContentishInterface
21
from erp5.component.document.File import File, _MARKER
22
from erp5.component.module.BTreeData import BTreeData
23 24 25 26
from ZPublisher.HTTPRequest import FileUpload
from ZPublisher import HTTPRangeSupport
from webdav.common import rfc1123_date
from mimetools import choose_boundary
27 28
from Products.CMFCore.utils import _setCacheHeaders, _ViewEmulator
from DateTime import DateTime
Romain Courteaud's avatar
Romain Courteaud committed
29
import re
30 31 32 33 34

class BigFile(File):
  """
  Support storing huge file.
  No conversion is allowed for now.


  NOTE BigFile maintains the following invariant:

    data property is either

      - BTreeData instance,  or
      - str(*),  or
      - None.

    (*) str has to be supported because '' is a default value for `data` field
        from Data property sheet.

        Even more - for

            a) compatibility reasons, and
            b) desire to support automatic migration of File-based documents
               from document_module to BigFiles

        non-empty str for data also have to be supported.

        XXX(kirr) I'm not sure supporting non-empty str is a good idea (it
            would be simpler if .data could be either BTreeData or "empty"),
            but I am neither experienced enough in erp5 nor do I know what
            the appropriate compatibility requirements are.

            We discussed with Romain and settled on "None or str or BTreeData"
            invariant for now.
  """

  meta_type = 'ERP5 Big File'
  portal_type = 'Big File'

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.Data
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
    )

  # OFS.File has an overloaded __str__ that returns the file content
  __str__ = object.__str__

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getData')
91
  def getData(self, default=None):
    """Return the whole content of the file.

    When the stored value is a BTreeData it is read in full, starting at
    offset 0. Any other stored value (str or None) is returned as is.

    NOTE `default` is accepted but is not used by this implementation.
    """
    stored = self._baseGetData()
    if not isinstance(stored, BTreeData):
      return stored
    return stored.read(0, len(stored))

  security.declareProtected(Permissions.ModifyPortalContent, 'updateContentMd5')
  def updateContentMd5(self):
    """Reset the stored md5 checksum.

    Despite its name, this method does not compute anything from the file
    content: it only sets the content md5 to None through _setContentMd5.
    NOTE(review): presumably hashing a huge file is considered too
    expensive - to be confirmed.
    """
    self._setContentMd5(None)

106
  def _read_data(self, file, data=None, serialize=True): # pylint: disable=redefined-builtin,arguments-differ
    """Copy the content of `file` into a BTreeData, chunk by chunk.

    file -- either a str, or a file-like object providing seek(), tell()
            and read() (for example a FileUpload).
    data -- content to append to:
              - None: start from a new empty BTreeData,
              - str: converted to a new BTreeData first,
              - anything else: used as is, and written to through
                write(chunk, offset).
    serialize -- when true, self.serialize() is called after copying.

    Returns a (btree, len(btree)) tuple.
    Raises ValueError when `file` is a FileUpload evaluating to false.
    """
    # We might need to make this value configurable. It is important to
    # consider the max quantity of object used in the cache. With a default
    # cache of 5000 objects, and with n = 64KB, this makes using about 330 MB
    # of memory.
    n = 1 << 16

    if isinstance(file, str):
      # Big string: cut it into smaller chunks
      file = StringIO(file)

    if isinstance(file, FileUpload) and not file:
      raise ValueError('File not specified')

    seek = file.seek
    read = file.read

    # Find out how many bytes have to be copied.
    seek(0, 2)
    end = file.tell()

    if data is None:
      btree = BTreeData()
    elif isinstance(data, str):
      # we'll want to append content to this file -
      # - automatically convert str (empty or not) to BTreeData
      btree = BTreeData()
      btree.write(data, 0)
    else:
      btree = data
    seek(0)
    pos = file.tell()
    # New content goes after whatever is already stored.
    offset = len(btree)

    while pos < end:
      chunk = read(min(n, end - pos))
      if not chunk:
        # Short read: the source ended before the size measured above.
        # `pos` would not advance any more, so stop instead of looping
        # forever.
        break
      btree.write(chunk, offset + pos)
      pos = file.tell()

    if serialize:
      self.serialize()
    return btree, len(btree)
151

152 153
  def _data_mtime(self):
    """Return the `_p_mtime` of the stored data, or of self as a fallback.

    The stored data has no `_p_mtime` attribute when it is None or a str;
    in that case self._p_mtime is returned instead.
    """
    return getattr(self._baseGetData(), '_p_mtime', self._p_mtime)

160 161 162
  def _range_request_handler(self, REQUEST, RESPONSE):
    """Serve the byte range(s) asked through the HTTP Range headers.

    The 'Range' header is used, unless the draft 'Request-Range' header is
    present, in which case the latter wins. When an 'If-Range' header tells
    that the client copy is outdated, ranges are ignored.

    Returns True when a response was produced here:
      - 416 when none of the requested ranges is satisfiable,
      - 206 with a single part when exactly one range remains,
      - 206 multipart/byteranges otherwise.
    Returns False when nothing was served and the caller has to produce a
    normal response (no range header, unparsable ranges, outdated If-Range,
    or no byte left to serve once ranges are expanded).
    """
    range_ = REQUEST.get_header('Range', None)
    request_range = REQUEST.get_header('Request-Range', None)
    if request_range is not None:
      # Netscape 2 through 4 and MSIE 3 implement a draft version
      # Later on, we need to serve a different mime-type as well.
      range_ = request_range
    if_range = REQUEST.get_header('If-Range', None)
    if range_ is None:
      return False

    ranges = HTTPRangeSupport.parseRange(range_)
    data = self._baseGetData()

    if if_range is not None and self._if_range_is_outdated(if_range):
      # Modified, so send a normal response. We delete
      # the ranges, which causes us to skip to the 200
      # response.
      ranges = None

    if not ranges:
      return False

    total_size = self.getSize()

    # Search for satisfiable ranges.
    if not any(start < total_size for start, _ in ranges):
      RESPONSE.setHeader('Content-Range', 'bytes */%d' % total_size)
      RESPONSE.setHeader('Accept-Ranges', 'bytes')
      RESPONSE.setHeader('Last-Modified', rfc1123_date(self._data_mtime()))
      RESPONSE.setHeader('Content-Type', self.content_type)
      RESPONSE.setHeader('Content-Length', total_size)
      RESPONSE.setStatus(416)
      return True

    expanded = []
    for start, end in HTTPRangeSupport.expandRanges(ranges, total_size):
      # A suffix range longer than the content (e.g. "bytes=-5" on a 3 bytes
      # file) may come back with a negative start: the whole content has to
      # be used in that case, so clamp to 0.
      start = max(start, 0)
      if start < end:
        expanded.append((start, end))
    if not expanded:
      # Nothing to slice (empty content): let the caller answer normally.
      return False
    ranges = expanded

    if len(ranges) == 1:
      # Easy case, set extra header and return partial set.
      start, end = ranges[0]

      RESPONSE.setHeader('Last-Modified', rfc1123_date(self._data_mtime()))
      RESPONSE.setHeader('Content-Type', self.content_type)
      RESPONSE.setHeader('Content-Length', end - start)
      RESPONSE.setHeader('Accept-Ranges', 'bytes')
      RESPONSE.setHeader('Content-Range',
          'bytes %d-%d/%d' % (start, end - 1, total_size))
      RESPONSE.setStatus(206) # Partial content

      self._write_data_range(RESPONSE, data, start, end)
      return True

    boundary = choose_boundary()

    # Calculate the content length
    size = (8 + len(boundary) + # End marker length
        len(ranges) * (         # Constant length per set
            49 + len(boundary) + len(self.content_type) +
            len('%d' % total_size)))
    for start, end in ranges:
      # Variable length per set
      size = (size + len('%d%d' % (start, end - 1)) +
          end - start)

    # Some clients implement an earlier draft of the spec, they
    # will only accept x-byteranges.
    draftprefix = 'x-' if request_range is not None else ''

    RESPONSE.setHeader('Content-Length', size)
    RESPONSE.setHeader('Accept-Ranges', 'bytes')
    RESPONSE.setHeader('Last-Modified', rfc1123_date(self._data_mtime()))
    RESPONSE.setHeader('Content-Type',
        'multipart/%sbyteranges; boundary=%s' % (
            draftprefix, boundary))
    RESPONSE.setStatus(206) # Partial content

    for start, end in ranges:
      RESPONSE.write('\r\n--%s\r\n' % boundary)
      RESPONSE.write('Content-Type: %s\r\n' %
          self.content_type)
      RESPONSE.write(
          'Content-Range: bytes %d-%d/%d\r\n\r\n' % (
              start, end - 1, total_size))
      self._write_data_range(RESPONSE, data, start, end)

    RESPONSE.write('\r\n--%s--\r\n' % boundary)
    return True

  def _if_range_is_outdated(self, if_range):
    # Tell if the If-Range validator (ETag or date) no longer matches.
    # Only send ranges if the data isn't modified, otherwise send
    # the whole object. Support both ETags and Last-Modified dates!
    if len(if_range) > 1 and if_range[:2] == 'ts':
      # ETag:
      return if_range != self.http__etag()
    # Date
    date = if_range.split( ';')[0]
    try:
      mod_since = long(DateTime(date).timeTime())
    except Exception:
      # Unparsable date: ranges are kept, as if no If-Range was sent.
      return False
    last_mod = long(self._data_mtime() or 0)
    return last_mod > mod_since

  def _write_data_range(self, RESPONSE, data, start, end):
    # Write bytes [start, end) of data (None, str or BTreeData) to RESPONSE.
    if data is None:
      return
    if isinstance(data, str):
      RESPONSE.write(data[start:end])
      return
    for chunk in data.iterate(start, end - start):
      RESPONSE.write(chunk)

  security.declareProtected(Permissions.View, 'index_html')
289
  def index_html(self, REQUEST, RESPONSE, format=_MARKER, inline=_MARKER, **kw): # pylint: disable=redefined-builtin
    """
      Publish the file content, with streaming support.

      Range requests are delegated to _range_request_handler. Without
      `format` nor any other keyword parameter, the view action is returned
      instead of the raw content. Otherwise the stored data is sent: str
      data is returned directly, BTreeData is written chunk by chunk to
      RESPONSE and an empty string is returned.
    """
    range_served = self._range_request_handler(REQUEST, RESPONSE)
    if range_served:
      # a chunk of content was already sent in response to a range request.
      return ''

    format_given = format is not _MARKER

    cache_parameter_dict = kw.copy()
    if format_given:
      cache_parameter_dict['format'] = format
    _setCacheHeaders(_ViewEmulator().__of__(self), cache_parameter_dict)

    if not (format_given or kw):
      # Conversion parameters are mandatory to download the content.
      # By default, always return the view action.
      # Raw content is returned for all WebDAV accesses.
      return self.view()

    if not format_given:
      format = None

    data = self._baseGetData()
    mime = self.getContentType()

    if data is None:
      content_length = 0
    else:
      content_length = len(data)
    RESPONSE.setHeader('Content-Length', content_length)
    RESPONSE.setHeader('Content-Type', mime)

    # `inline` defaults to False when not provided.
    if inline is _MARKER or not inline:
      # need to return it as attachment
      # (the computed filename is not used afterwards)
      self.getStandardFilename(format=format)
      RESPONSE.setHeader('Accept-Ranges', 'bytes')

    if data is None:
      return ''
    if isinstance(data, str):
      RESPONSE.setBase(None)
      return data
    for chunk in data.iterate():
      RESPONSE.write(chunk)
    return ''

Romain Courteaud's avatar
Romain Courteaud committed
334 335 336 337 338 339
  security.declareProtected(Permissions.ModifyPortalContent,'PUT')
  def PUT(self, REQUEST, RESPONSE):
    """Handle HTTP PUT requests

    Without a Content-Range header, the current content is dropped and
    replaced by the request body.

    With 'Content-Range: bytes */*', nothing is stored: a 308 response
    carrying a Range header built from the current size is returned.

    With 'Content-Range: bytes <first>-<last>/<total>', the body is appended
    provided that <first> equals the current size, <last>+1 equals <total>
    and the Content-Length header matches the announced range. Any other
    case is answered with status 400 and an X-Explanation header.

    Status 204 is set once data has been stored.
    """
    self.dav__init(REQUEST, RESPONSE)
    self.dav__simpleifhandler(REQUEST, RESPONSE, refresh=1)

    type_ = REQUEST.get_header('content-type', None)
    file_ = REQUEST['BODYFILE']
    content_range = REQUEST.get_header('Content-Range', None)

    if content_range is None:
      # truncate the file
      self._baseSetData(None)
    else:
      current_size = int(self.getSize())

      if re.match(r'bytes \*/\*', content_range):
        # The client only asks how much was received so far.
        RESPONSE.setHeader('X-Explanation', 'Resume incomplete')
        RESPONSE.setHeader('Range', 'bytes 0-%s' % (current_size-1))
        RESPONSE.setStatus(308)
        return RESPONSE

      append_match = re.match(r'bytes (?P<first_byte>[0-9]+)-' \
                               '(?P<last_byte>[0-9]+)/' \
                               '(?P<total_content_length>[0-9]+)',
                              content_range)
      if append_match is None:
        RESPONSE.setHeader('X-Explanation', 'Can not parse range')
        RESPONSE.setStatus(400)
        return RESPONSE

      range_dict = append_match.groupdict()
      first_byte = int(range_dict['first_byte'])
      last_byte = int(range_dict['last_byte'])
      total_content_length = int(range_dict['total_content_length'])
      content_length = int(REQUEST.get_header('Content-Length', '0'))

      if first_byte != current_size:
        explanation = 'Can only append data'
      elif last_byte + 1 != total_content_length:
        explanation = 'Total size unexpected'
      elif last_byte + 1 - first_byte != content_length:
        explanation = 'Content length unexpected'
      else:
        explanation = None

      if explanation is not None:
        RESPONSE.setHeader('X-Explanation', explanation)
        RESPONSE.setStatus(400)
        return RESPONSE

    self._appendData(file_, content_type=type_)

    RESPONSE.setStatus(204)
    return RESPONSE

391 392 393 394 395 396
  security.declareProtected(Permissions.ModifyPortalContent,'appendData')
  def appendData(self, data_chunk, content_type=None):
    """
    Append a data chunk to the end of the file, available in restricted
    environment.

    This is the security-declared entry point (ModifyPortalContent) in front
    of _appendData, to which both arguments are passed unchanged.

    data_chunk -- the content to append.
    content_type -- optional content type, forwarded to _appendData.

    Returns None.
    """
    self._appendData(data_chunk, content_type)
397 398 399 400 401 402 403 404 405 406 407 408 409

  def _appendData(self, data_chunk, content_type=None):
    """Append `data_chunk` after the currently stored content.

       The chunk is merged into the current data by _read_data, then the
       result is stored through update_data together with its size.

       NOTE a given content_type replaces the content type of the whole
            file; when it is empty or None, the current one is passed on
            to _get_content_type instead.
    """
    btree, new_size = self._read_data(data_chunk, data=self._baseGetData())
    resolved_type = self._get_content_type(
      data_chunk, btree, self.__name__, content_type or self.content_type)
    self.update_data(btree, resolved_type, new_size)


410 411 412
# CMFFile also brings in the IContentishInterface on CMF 2.2: remove it
# from BigFile once the class is defined.
removeIContentishInterface(BigFile)