Commit e4f4f6b6 authored by Jim Fulton's avatar Jim Fulton

Reimplemented the ZEO Blob protocol:

- Avoid more than one round-trip call when loading blobs via copy from
  the server.

- Avoid loading large amounts of blob data into memory.  The old
  storeBlob implementation was likely to queue blob data faster than
  it could be sent, leading to a large memory footprint for the
  queue. Now, iterators are used to read data from files only when the
  network layer is ready to send it.

- Fixed storeBlob to move the input file to the blob cache (when not
  sharing the blob directory with the server).

- Extended the loadBlob locking model to work with multiple processes
  by using file locks rather than threading locks.  A common
  configuration is to use a client process per core, so that a machine
  is likely to have many client processes and it should be possible
  for the client processes to share a common blob cache.
parent 61aeabc6
This diff is collapsed.
...@@ -60,3 +60,18 @@ class ClientStorage: ...@@ -60,3 +60,18 @@ class ClientStorage:
def info(self, arg): def info(self, arg):
self.rpc.callAsync('info', arg) self.rpc.callAsync('info', arg)
def storeBlob(self, oid, serial, blobfilename):
    """Send the blob file `blobfilename` as a sequence of messages.

    The messages are produced lazily by a generator that is handed to
    self.rpc.callAsyncIterator(), so the file is read one chunk at a
    time instead of being loaded into memory in one piece.

    Message sequence (names are part of the wire protocol; their
    spelling must not be changed):

      ('recieveBlobStart', (oid, serial))
      ('recieveBlobChunk', (oid, serial, chunk))   zero or more times
      ('recieveBlobStop',  (oid, serial))

    Each chunk holds at most 59000 bytes of the file.
    """
    def store():
        yield ('recieveBlobStart', (oid, serial))
        f = open(blobfilename, 'rb')
        try:
            while 1:
                chunk = f.read(59000)
                if not chunk:
                    break
                yield ('recieveBlobChunk', (oid, serial, chunk, ))
        finally:
            # Close the file on every exit path: normal end of data,
            # a failing read, or the generator being closed because
            # the consumer stopped iterating.  Without this the file
            # handle would leak in the latter two cases.
            f.close()
        yield ('recieveBlobStop', (oid, serial))

    self.rpc.callAsyncIterator(store())
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
############################################################################## ##############################################################################
"""RPC stubs for interface exported by StorageServer.""" """RPC stubs for interface exported by StorageServer."""
import os
import time import time
## ##
...@@ -219,11 +220,29 @@ class StorageServer: ...@@ -219,11 +220,29 @@ class StorageServer:
def storea(self, oid, serial, data, version, id): def storea(self, oid, serial, data, version, id):
self.rpc.callAsync('storea', oid, serial, data, version, id) self.rpc.callAsync('storea', oid, serial, data, version, id)
def storeBlobEnd(self, oid, serial, data, version, id): def storeBlob(self, oid, serial, data, blobfilename, version, txn):
self.rpc.callAsync('storeBlobEnd', oid, serial, data, version, id)
def storeBlob(self, oid, serial, chunk, version, id): # Store a blob to the server. We don't want to real all of
self.rpc.callAsync('storeBlob', oid, serial, chunk, version, id) # the data into memory, so we use a message iterator. This
# allows us to read the blob data as needed.
if blobfilename is None:
self.rpc.callAsync('storeEmptyBlob',
oid, serial, data, version, id(txn))
return
def store():
yield ('storeBlobStart', ())
f = open(blobfilename, 'rb')
while 1:
chunk = f.read(59000)
if not chunk:
break
yield ('storeBlobChunk', (chunk, ))
f.close()
yield ('storeBlobEnd', (oid, serial, data, version, id(txn)))
self.rpc.callAsyncIterator(store())
def storeBlobShared(self, oid, serial, data, filename, version, id): def storeBlobShared(self, oid, serial, data, filename, version, id):
self.rpc.callAsync('storeBlobShared', oid, serial, data, filename, self.rpc.callAsync('storeBlobShared', oid, serial, data, filename,
...@@ -271,8 +290,8 @@ class StorageServer: ...@@ -271,8 +290,8 @@ class StorageServer:
def load(self, oid, version): def load(self, oid, version):
return self.rpc.call('load', oid, version) return self.rpc.call('load', oid, version)
def loadBlob(self, oid, serial, version, offset): def sendBlob(self, oid, serial):
return self.rpc.call('loadBlob', oid, serial, version, offset) return self.rpc.call('sendBlob', oid, serial)
def getTid(self, oid): def getTid(self, oid):
return self.rpc.call('getTid', oid) return self.rpc.call('getTid', oid)
......
...@@ -25,6 +25,7 @@ import cPickle ...@@ -25,6 +25,7 @@ import cPickle
import logging import logging
import os import os
import sys import sys
import tempfile
import threading import threading
import time import time
import warnings import warnings
...@@ -103,9 +104,8 @@ class ZEOStorage: ...@@ -103,9 +104,8 @@ class ZEOStorage:
self.log_label = _label self.log_label = _label
self.authenticated = 0 self.authenticated = 0
self.auth_realm = auth_realm self.auth_realm = auth_realm
self.blob_transfer = {} self.blob_tempfile = None
self.blob_log = [] self.blob_log = []
self.blob_loads = {}
# The authentication protocol may define extra methods. # The authentication protocol may define extra methods.
self._extensions = {} self._extensions = {}
for func in self.extensions: for func in self.extensions:
...@@ -525,25 +525,22 @@ class ZEOStorage: ...@@ -525,25 +525,22 @@ class ZEOStorage:
self.stats.stores += 1 self.stats.stores += 1
self.txnlog.store(oid, serial, data, version) self.txnlog.store(oid, serial, data, version)
def storeBlobStart(self):
    """Begin receiving a blob: create the temporary file for its data.

    A new file is created with tempfile.mkstemp() inside the directory
    returned by self.storage.temporaryDirectory().  The resulting
    (file descriptor, path) pair is kept in self.blob_tempfile.

    Only one blob upload may be in progress at a time; this is
    checked with an assertion on self.blob_tempfile being None.
    """
    assert self.blob_tempfile is None
    scratch_dir = self.storage.temporaryDirectory()
    self.blob_tempfile = tempfile.mkstemp(dir=scratch_dir)
def storeBlobChunk(self, chunk):
    """Append `chunk` to the temporary file of the blob being received.

    self.blob_tempfile is the (file descriptor, path) pair stored by
    storeBlobStart(); the data is written through the descriptor.

    os.write() returns the number of bytes it actually wrote, which
    may be fewer than requested, so keep writing the remainder until
    the whole chunk is on file.  An empty chunk writes nothing.
    """
    fd = self.blob_tempfile[0]
    while chunk:
        written = os.write(fd, chunk)
        chunk = chunk[written:]
def storeBlobEnd(self, oid, serial, data, version, id): def storeBlobEnd(self, oid, serial, data, version, id):
key = (oid, id) fd, tempname = self.blob_tempfile
if key not in self.blob_transfer: self.blob_tempfile = None
raise Exception, "Can't finish a non-started Blob" os.close(fd)
tempname, tempfile = self.blob_transfer.pop(key)
tempfile.close()
self.blob_log.append((oid, serial, data, tempname, version)) self.blob_log.append((oid, serial, data, tempname, version))
def storeBlob(self, oid, serial, chunk, version, id): def storeEmptyBlob(self, oid, serial, data, version, id):
# XXX check that underlying storage supports blobs self.blob_log.append((oid, serial, data, None, version))
key = (oid, id)
if key not in self.blob_transfer:
tempname = mktemp()
tempfile = open(tempname, "wb")
# XXX Force close and remove them when Storage closes
self.blob_transfer[key] = (tempname, tempfile)
else:
tempname, tempfile = self.blob_transfer[key]
tempfile.write(chunk)
def storeBlobShared(self, oid, serial, data, filename, version, id): def storeBlobShared(self, oid, serial, data, filename, version, id):
# Reconstruct the full path from the filename in the OID directory # Reconstruct the full path from the filename in the OID directory
...@@ -551,17 +548,8 @@ class ZEOStorage: ...@@ -551,17 +548,8 @@ class ZEOStorage:
filename) filename)
self.blob_log.append((oid, serial, data, filename, version)) self.blob_log.append((oid, serial, data, filename, version))
def loadBlob(self, oid, serial, version, offset): def sendBlob(self, oid, serial):
key = (oid, serial) self.client.storeBlob(oid, serial, self.storage.loadBlob(oid, serial))
if not key in self.blob_loads:
self.blob_loads[key] = \
open(self.storage.loadBlob(oid, serial, version))
blobdata = self.blob_loads[key]
blobdata.seek(offset)
chunk = blobdata.read(4096)
if not chunk:
del self.blob_loads[key]
return chunk
# The following four methods return values, so they must acquire # The following four methods return values, so they must acquire
# the storage lock and begin the transaction before returning. # the storage lock and begin the transaction before returning.
......
...@@ -59,12 +59,14 @@ class TransactionBuffer: ...@@ -59,12 +59,14 @@ class TransactionBuffer:
self.closed = 0 self.closed = 0
self.count = 0 self.count = 0
self.size = 0 self.size = 0
self.blobs = []
# It's safe to use a fast pickler because the only objects # It's safe to use a fast pickler because the only objects
# stored are builtin types -- strings or None. # stored are builtin types -- strings or None.
self.pickler = cPickle.Pickler(self.file, 1) self.pickler = cPickle.Pickler(self.file, 1)
self.pickler.fast = 1 self.pickler.fast = 1
def close(self): def close(self):
self.clear()
self.lock.acquire() self.lock.acquire()
try: try:
self.closed = 1 self.closed = 1
...@@ -82,6 +84,9 @@ class TransactionBuffer: ...@@ -82,6 +84,9 @@ class TransactionBuffer:
finally: finally:
self.lock.release() self.lock.release()
def storeBlob(self, oid, blobfilename):
    """Remember that `blobfilename` holds blob data for `oid`.

    The pair is appended to self.blobs as a 2-tuple
    (oid, blobfilename).

    NOTE(review): the cleanup loop that pops entries from self.blobs
    unpacks three values (oid, serial, blobfilename) per entry, which
    does not match the 2-tuples stored here -- confirm which shape is
    intended.
    """
    entry = (oid, blobfilename)
    self.blobs.append(entry)
def _store(self, oid, version, data): def _store(self, oid, version, data):
"""Store oid, version, data for later retrieval""" """Store oid, version, data for later retrieval"""
if self.closed: if self.closed:
...@@ -113,6 +118,10 @@ class TransactionBuffer: ...@@ -113,6 +118,10 @@ class TransactionBuffer:
self.file.seek(0) self.file.seek(0)
self.count = 0 self.count = 0
self.size = 0 self.size = 0
while self.blobs:
oid, serial, blobfilename = self.blobs.pop()
if os.path.exists(blobfilename):
os.remove(blobfilename)
finally: finally:
self.lock.release() self.lock.release()
......
...@@ -22,6 +22,7 @@ import random ...@@ -22,6 +22,7 @@ import random
import signal import signal
import socket import socket
import tempfile import tempfile
import threading
import time import time
import unittest import unittest
import shutil import shutil
...@@ -520,91 +521,99 @@ class CommonBlobTests: ...@@ -520,91 +521,99 @@ class CommonBlobTests:
self._storage.tpc_abort(t) self._storage.tpc_abort(t)
raise raise
filename = self._storage.loadBlob(oid, serial, version) filename = self._storage.loadBlob(oid, serial)
self.assertEquals(somedata, open(filename, 'rb').read()) self.assertEquals(somedata, open(filename, 'rb').read())
class BlobAdaptedFileStorageTests(GenericTests, CommonBlobTests): class BlobAdaptedFileStorageTests(GenericTests, CommonBlobTests):
"""ZEO backed by a BlobStorage-adapted FileStorage.""" """ZEO backed by a BlobStorage-adapted FileStorage."""
def setUp(self): def setUp(self):
self.blobdir = tempfile.mkdtemp() # This is the blob directory on the ZEO server self.blobdir = tempfile.mkdtemp() # blob directory on ZEO server
self.filestorage = tempfile.mktemp() self.filestorage = tempfile.mktemp()
super(BlobAdaptedFileStorageTests, self).setUp() super(BlobAdaptedFileStorageTests, self).setUp()
def checkLoadBlobLocks(self): def checkStoreAndLoadBlob(self):
from ZODB.utils import oid_repr, tid_repr
from ZODB.Blobs.Blob import Blob from ZODB.Blobs.Blob import Blob
from ZODB.Blobs.BlobStorage import BLOB_SUFFIX
from ZODB.tests.StorageTestBase import zodb_pickle, ZERO, \ from ZODB.tests.StorageTestBase import zodb_pickle, ZERO, \
handle_serials handle_serials
import transaction import transaction
version = '' somedata_path = os.path.join(self.blob_cache_dir, 'somedata')
somedata = 'a' * 10 somedata = open(somedata_path, 'w+b')
for i in range(1000000):
somedata.write("%s\n" % i)
somedata.seek(0)
blob = Blob() blob = Blob()
bd_fh = blob.open('w') bd_fh = blob.open('w')
bd_fh.write(somedata) ZODB.utils.cp(somedata, bd_fh)
bd_fh.close() bd_fh.close()
tfname = bd_fh.name tfname = bd_fh.name
oid = self._storage.new_oid() oid = self._storage.new_oid()
data = zodb_pickle(blob) data = zodb_pickle(blob)
self.assert_(os.path.exists(tfname))
t = transaction.Transaction() t = transaction.Transaction()
try: try:
self._storage.tpc_begin(t) self._storage.tpc_begin(t)
r1 = self._storage.storeBlob(oid, ZERO, data, tfname, '', t) r1 = self._storage.storeBlob(oid, ZERO, data, tfname, '', t)
r2 = self._storage.tpc_vote(t) r2 = self._storage.tpc_vote(t)
serial = handle_serials(oid, r1, r2) revid = handle_serials(oid, r1, r2)
self._storage.tpc_finish(t) self._storage.tpc_finish(t)
except: except:
self._storage.tpc_abort(t) self._storage.tpc_abort(t)
raise raise
# The uncommitted data file should have been removed
self.assert_(not os.path.exists(tfname))
class Dummy: def check_data(path):
def __init__(self): self.assert_(os.path.exists(path))
self.acquired = 0 f = open(path, 'rb')
self.released = 0 somedata.seek(0)
def acquire(self): d1 = d2 = 1
self.acquired += 1 while d1 or d2:
def release(self): d1 = f.read(8096)
self.released += 1 d2 = somedata.read(8096)
self.assertEqual(d1, d2)
class statusdict(dict):
def __init__(self):
self.added = []
self.removed = []
def __setitem__(self, k, v):
self.added.append(k)
super(statusdict, self).__setitem__(k, v)
def __delitem__(self, k):
self.removed.append(k)
super(statusdict, self).__delitem__(k)
# ensure that we do locking properly
filename = self._storage.fshelper.getBlobFilename(oid, serial)
thestatuslock = self._storage.blob_status_lock = Dummy()
thebloblock = Dummy()
def getBlobLock():
return thebloblock
# override getBlobLock to test that locking is performed
self._storage.getBlobLock = getBlobLock
thestatusdict = self._storage.blob_status = statusdict()
filename = self._storage.loadBlob(oid, serial, version)
self.assertEqual(thestatuslock.acquired, 2)
self.assertEqual(thestatuslock.released, 2)
self.assertEqual(thebloblock.acquired, 1) # The file should have been copied to the server:
self.assertEqual(thebloblock.released, 1) filename = os.path.join(self.blobdir, oid_repr(oid),
tid_repr(revid) + BLOB_SUFFIX)
check_data(filename)
self.assertEqual(thestatusdict.added, [(oid, serial)]) # It should also be in the cache:
self.assertEqual(thestatusdict.removed, [(oid, serial)]) filename = os.path.join(self.blob_cache_dir, oid_repr(oid),
tid_repr(revid) + BLOB_SUFFIX)
check_data(filename)
# If we remove it from the cache and call loadBlob, it should
# come back. We can do this in many threads. We'll instrument
# the method that is used to request data from the server to
# verify that it is only called once.
sendBlob_org = ZEO.ServerStub.StorageServer.sendBlob
calls = []
def sendBlob(self, oid, serial):
calls.append((oid, serial))
sendBlob_org(self, oid, serial)
os.remove(filename)
returns = []
threads = [
threading.Thread(
target=lambda :
returns.append(self._storage.loadBlob(oid, revid))
)
for i in range(10)
]
[thread.start() for thread in threads]
[thread.join() for thread in threads]
[self.assertEqual(r, filename) for r in returns]
check_data(filename)
class BlobWritableCacheTests(GenericTests, CommonBlobTests): class BlobWritableCacheTests(GenericTests, CommonBlobTests):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment