Commit 5b50a2fe authored by Levin Zimmermann's avatar Levin Zimmermann Committed by Kirill Smelkov

ZBigFile: Add ZBlk format option 'h' (heuristic)

There are two formats to save data with a ZBigFile: ZBlk0 and ZBlk1.
They trade access time against database growth differently: ZBlk1 is
better regarding disk space, while ZBlk0 has a better access time.
Wendelin.core users may not always know, or care, which format fits
their data better. In this case it may be easier for users to just let
the program select the ZBlk format automatically. With this patch and
the new 'h' (for heuristic) option of the 'ZBlk' argument of ZBigFile,
this is now possible. The 'h' option isn't really a new ZBlk format in
itself; it just tries to automatically select the best ZBlk format
according to the characteristics of the changes that the user applies
to the ZBigFile.

In its current implementation, the heuristic tackles the use-case of
large arrays with many small append-only changes. In this case 'h' is
smaller in space than ZBlk0, but faster to read than ZBlk1. It does so
by initially using ZBlk1 until a blk is filled up. Once a blk is full,
it is switched to ZBlk0, as was recommended by @kirr in
nexedi/wendelin.core!20 (comment 196084).

With this patch comes a test (bigfile/tests/test-zblk-fmt) that creates
benchmarks for different combinations and zblk formats. The test aims
to check how the 'heuristic' format performs in contrast to 'ZBlk0'
and 'ZBlk1':

---

Run append tests
---------------------------------------------
---------------------------------------------
Set change_percentage_set to 0.15
Set change_count to 500
Set arrsize to 500000
Set change_type to append

Run tests with format h:

	ZODB storage size: 318.565101 MB
	Access time: 0.747 ms / blk  (initially cold; might get warmer during benchmark)

Run tests with format ZBlk0:

	ZODB storage size: 704.347196 MB
	Access time: 0.737 ms / blk  (initially cold; might get warmer during benchmark)

Run tests with format ZBlk1:

	ZODB storage size: 163.367072 MB
	Access time: 74.628 ms / blk  (initially cold; might get warmer during benchmark)
parent da765ef7
...@@ -484,7 +484,7 @@ ZBlk_fmt_registry = { ...@@ -484,7 +484,7 @@ ZBlk_fmt_registry = {
# format for updated blocks # format for updated blocks
ZBlk_fmt_write = os.environ.get('WENDELIN_CORE_ZBLK_FMT', 'ZBlk0') ZBlk_fmt_write = os.environ.get('WENDELIN_CORE_ZBLK_FMT', 'ZBlk0')
if ZBlk_fmt_write not in ZBlk_fmt_registry: if ZBlk_fmt_write != "h" and ZBlk_fmt_write not in ZBlk_fmt_registry:
raise RuntimeError('E: Unknown ZBlk format %r' % ZBlk_fmt_write) raise RuntimeError('E: Unknown ZBlk format %r' % ZBlk_fmt_write)
...@@ -546,7 +546,13 @@ class ZBigFile(LivePersistent): ...@@ -546,7 +546,13 @@ class ZBigFile(LivePersistent):
# store data dirty page -> ZODB obj # store data dirty page -> ZODB obj
def storeblk(self, blk, buf): def storeblk(self, blk, buf):
zblk = self.blktab.get(blk) zblk = self.blktab.get(blk)
zblk_type_write = ZBlk_fmt_registry[ZBlk_fmt_write] zblk_fmt = ZBlk_fmt_write
if zblk_fmt == "h": # apply heuristic
zblk_fmt = self._zblk_fmt_heuristic(zblk, blk, buf)
self._setzblk(blk, zblk, buf, zblk_fmt)
def _setzblk(self, blk, zblk, buf, zblk_fmt): # helper
zblk_type_write = ZBlk_fmt_registry[zblk_fmt or ZBlk_fmt_write]
# if zblk was absent or of different type - we (re-)create it anew # if zblk was absent or of different type - we (re-)create it anew
if zblk is None or \ if zblk is None or \
type(zblk) is not zblk_type_write: type(zblk) is not zblk_type_write:
...@@ -567,6 +573,27 @@ class ZBigFile(LivePersistent): ...@@ -567,6 +573,27 @@ class ZBigFile(LivePersistent):
zblk.bindzfile(self, blk) zblk.bindzfile(self, blk)
# Heuristically determine zblk format by optimizing
# storage-space/access-speed ratio. Both can't be ideal, see
# module docstring: "Due to weakness of current ZODB storage
# servers, wendelin.core cannot provide at the same time both
# fast reads and small database size growth ..."
def _zblk_fmt_heuristic(self, zblk, blk, buf):
    """Select the ZBlk format name to use for storing `buf` as block `blk`.

    `zblk` is the object currently stored for `blk` (or None if there is
    none yet) and `buf` is the new block data.

    The returned name is currently always "ZBlk1". As a side effect, when
    a new block with `blk` > 0 starts being written, the block right
    before it is considered filled-up and is re-stored as ZBlk0.
    """
    if _is_appending(zblk, buf):
        if not zblk and blk > 0:  # is new zblk?
            # Set previous filled-up ZBlk to ZBlk0 for fast reads
            previous_blk = blk - 1
            previous_zblk = self.blktab.get(previous_blk)
            # The previous block may be absent (blktab.get returns None,
            # e.g. when the file has a hole right before `blk`); there is
            # nothing to convert in that case.
            if previous_zblk is not None:
                self._setzblk(previous_blk, previous_zblk,
                              previous_zblk.loadblkdata(), "ZBlk0")
        return "ZBlk1"
    else:  # it's changing
        # kirr: "to support sporadic small changes over initial big fillup [...]
        # we could introduce e.g. a ZBlkδ object, which would refer to base
        # underlying ZBlk object and add "patch" information on top of that [...]."
        # See https://lab.nexedi.com/nexedi/wendelin.core/merge_requests/20#note_196084
        return 'ZBlk1'
# invalidate data .blktab[blk] invalidated -> invalidate page # invalidate data .blktab[blk] invalidated -> invalidate page
def invalidateblk(self, blk): def invalidateblk(self, blk):
for fileh in self._v_filehset: for fileh in self._v_filehset:
...@@ -829,3 +856,11 @@ class _ZBigFileH(object): ...@@ -829,3 +856,11 @@ class _ZBigFileH(object):
# and also more right - tpc_finish is there assumed as non-failing by # and also more right - tpc_finish is there assumed as non-failing by
# ZODB design) # ZODB design)
self.abort(txn) self.abort(txn)
# Utility functions for heuristic
def _is_appending(zblk, buf):
if not zblk:
return True
old_buf = bytes(zblk.loadblkdata())
return bytes(buf).rstrip(b'\0')[:len(old_buf)] == old_buf
# Copyright (C) 2023 Nexedi SA and Contributors.
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# Test to compare disk-space and access-speed of the different ZBlk format options:
#
# - ZBlk0
# - ZBlk1
# - h
import os
import random
import resource
import tempfile
import timeit
import sys
from time import time, sleep
# Add relative module path, to run tests on local code
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '.'))
from golang import defer, func
import numpy as np
import transaction
import ZODB, ZODB.FileStorage
from wendelin.bigarray.array_zodb import ZBigArray
# one millisecond expressed in seconds
ms = 1e-3
# fixed seed, so that the pseudo-random data and positions are reproducible
random.seed(10)
# Avoid error due to too many opened file descriptors:
# raise the soft RLIMIT_NOFILE limit up to the hard limit.
cur_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
new_limit = (cur_limit[1], cur_limit[1])
resource.setrlimit(resource.RLIMIT_NOFILE, new_limit)
# path of the FileStorage data file used by the benchmark
# NOTE(review): tempfile.mktemp only returns a name and does not create the
# file; nothing visible here removes the file afterwards - confirm intended.
storage_path = tempfile.mktemp(prefix='zblkbenchmark')
# Declare test parameters.
# All of them are read from environment variables of the same name.
# change_percentage_set: comma-separated fractions of a block; every change
# picks one of them at random as its size.
change_percentage_set = tuple(
float(n) for n in os.environ.get('change_percentage_set', '0.2').split(','))
# change_count: how many changes are applied
change_count = int(os.environ.get('change_count', '1000'))
# arrsize: number of rows used for the initial fillup and for random access
arrsize = int(os.environ.get('arrsize', '1000000'))
change_type = os.environ.get('change_type', 'setitem') # setitem or append
# Utility functions
def randarr(size=1000000):
    """Return an ndarray of `size` rows, each holding two random integers in [1, 1000]."""
    rows = []
    for _ in range(size):
        first = random.randint(1, 1000)
        second = random.randint(1, 1000)
        rows.append([first, second])
    return np.array(rows)
def setrand(A, blksize_length, change_percentage=1):
    """Overwrite a random part of one random block of `A` and commit.

    `blksize_length` is the number of rows per block; `change_percentage`
    is the fraction of a block that gets overwritten with random rows.
    """
    size = int(blksize_length * change_percentage)
    nblk = int(arrsize / blksize_length)
    blk_start = random.randint(0, nblk - 1) * blksize_length
    # Ensure we don't always only change the beginning of a block
    start = blk_start + random.randint(0, blksize_length - size)
    stop = start + size
    A[start:stop][:] = randarr(size)
    transaction.commit()
def accessrand(A):
    """Read one element of `A` at a random row in [0, arrsize]."""
    row = random.randint(0, arrsize)
    # force load of ZBlk data via reading ndarray element
    A[row, 0]
def fillup(root):
    """Append `arrsize` all-zero rows to `root.A` and commit."""
    zero_rows = [[0, 0] for _ in range(arrsize)]
    root.A.append(zero_rows)
    transaction.commit()
def change_setitem(root):
    """Apply `change_count` random in-place changes to `root.A`.

    The size of every change is a randomly chosen member of
    `change_percentage_set`, taken as a fraction of one block.
    """
    view = root.A[:]
    rows_per_blk = get_blksize_length(root)
    for _ in range(change_count):
        setrand(view, rows_per_blk, random.choice(change_percentage_set))
    transaction.commit()
def change_append(root):
    """Append random rows to `root.A` `change_count` times, committing each time.

    The number of rows of every append is a randomly chosen member of
    `change_percentage_set`, taken as a fraction of one block.
    """
    arr = root.A
    rows_per_blk = get_blksize_length(root)
    for _ in range(change_count):
        fraction = random.choice(change_percentage_set)
        nrows = int(rows_per_blk * fraction)
        arr.append(randarr(nrows))
        transaction.commit()
def get_blksize_length(root):
    """Return the block size of `root.A`'s file divided by 16, as an integer.

    The result is used by callers as a row count (slice bounds and
    `random.randint` arguments), so it must be an integer. Floor division
    guarantees that on Python 3 too, and gives the same value as the
    previous `/` on Python 2 integers.
    """
    # NOTE(review): 16 is presumably the size in bytes of one array row
    # (2 columns of 8-byte integers) - confirm against the array dtype.
    return root.A.zfile.blksize // 16
# when true, the patched storage.loadBefore prints every call it receives
traceload = False
# when true, the patched storage.loadBefore sleeps 0.2 ms before loading
delayload = False
@func
def root(func):
    """Call `func(root)` with the root of a freshly opened benchmark database.

    A FileStorage is opened at `storage_path`, its `loadBefore` and `load`
    methods are wrapped for tracing / latency simulation, and a database
    and a connection are opened on top of it. Connection, database and
    storage are closed via `defer`.

    Nothing is returned, so a name decorated with `@root` is executed
    immediately and ends up bound to None.
    """
    storage = ZODB.FileStorage.FileStorage(storage_path)
    stor_load = storage.load
    stor_loadBefore = storage.loadBefore

    def loadBefore(oid, tid):
        if traceload:
            print('loadBefore %r %r' % (oid, tid))
        # simulate loading latency as actually seen on NEO.
        # there I was seeing latencies up to _1_ millisecond, but even with
        # "modest" 0.2 ms it really shows in the figures.
        #
        # (activated only during read benchmark to avoid wasting time
        # while preparing data)
        if delayload:
            sleep(0.2 * ms)
        return stor_loadBefore(oid, tid)

    def load(oid):
        print('load %r' % oid)
        1/0     # should not call load at all
        return stor_load(oid)

    storage.loadBefore = loadBefore
    storage.load = load
    db = ZODB.DB(storage)
    connection = db.open()
    root = connection.root
    defer(connection.close)
    defer(db.close)
    defer(storage.close)
    func(root)
@root
def setup(root):
    """Create the benchmark array as `root.A` (one row, two int columns) and commit."""
    root.A = ZBigArray(shape=[1, 2], dtype=int)
    transaction.commit()
# Apply the requested kind of changes, then report how big the storage became.
if change_type not in ("setitem", "append"):
    raise NotImplementedError(change_type)
if change_type == "setitem":
    root(fillup)
    root(change_setitem)
else:
    root(change_append)

print("\tZODB storage size: %s MB" % (os.path.getsize(storage_path) / float(10**6),))
@root
def access(root):
    """Measure and print the average time of one random access to `root.A`.

    Storage load latency simulation is switched on (`delayload`) before
    measuring. The random generator is re-seeded so that the accessed
    positions are the same on every run.
    """
    global traceload, delayload
    a = root.A[:]   # create BigArray -> ndarray view only once
    delayload = True

    # the timed callable does nothing but the access itself, so that no
    # bookkeeping overhead ends up in the measurement
    def _():
        accessrand(a)

    random.seed(10)
    # niter should be small to avoid getting into situation when most blocks becomes loaded into cache
    # and we start to measure time of hot access without any ZODB loading
    niter = 10
    taccess = timeit.timeit(_, number=niter) / niter
    print("\tAccess time: %.3f ms / blk (initially cold; might get warmer during benchmark)" % (taccess/ms))
#!/usr/bin/env bash
# Copyright (C) 2023 Nexedi SA and Contributors.
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# Test to compare disk-space and access-speed of the different ZBlk format options:
#
# - ZBlk0
# - ZBlk1
# - h
#
# The heuristic 'h' should behave as good as ZBlk0 in case of wide changes
# and as good as ZBlk1 in case of small changes.
# test CHANGE_PERCENTAGE_SET CHANGE_COUNT ARRSIZE CHANGE_TYPE
#
# Export the four benchmark parameters to the environment and run the
# benchmark script once for each of the formats h, ZBlk0 and ZBlk1.
function test {
    # t ZBLKFMT: run the benchmark script with the given ZBlk format.
    function t {
        zblkfmt=$1
        echo "Run tests with format $zblkfmt:"
        echo
        export WENDELIN_CORE_ZBLK_FMT=$zblkfmt
        python bigfile/tests/_test_zblk_fmt
        echo
        echo
    }

    local rule="---------------------------------------------"

    export change_percentage_set=$1
    export change_count=$2
    export arrsize=$3
    export change_type=$4

    echo "$rule"
    echo "$rule"
    echo "Set change_percentage_set to $change_percentage_set"
    echo "Set change_count to $change_count"
    echo "Set arrsize to $arrsize"
    echo "Set change_type to $change_type"
    echo

    local fmt
    for fmt in h ZBlk0 ZBlk1; do
        t "$fmt"
    done

    echo
    echo "$rule"
    echo "$rule"
    echo
}
echo "Run append tests"
# arguments: change_percentage_set=0.15 change_count=500 arrsize=500000 change_type=append
test 0.15 500 500000 "append"
# TODO(add 'small changes after initial fillup' optimization, see
# 'bigfile/file_zodb/ZBigFile_zblk_fmt_heuristic' for more details)
# echo "Run setitem tests"
#
# echo "Use only a very small change size, so that heuristic always uses ZBlk1"
# test 0.2 500 1000000 "setitem"
#
# echo "Use only a very big change size, so that heuristic always uses ZBlk0"
# test 1 500 1000000 "setitem"
#
# echo "Mix between change size so that heuristic switches between ZBlk0 and ZBlk1"
# test 0.2,1 500 1000000 "setitem"
# Wendelin.core.bigfile | Tests for ZODB BigFile backend # Wendelin.core.bigfile | Tests for ZODB BigFile backend
# Copyright (C) 2014-2021 Nexedi SA and Contributors. # Copyright (C) 2014-2023 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -690,3 +690,35 @@ def test_bigfile_zblk1_zdata_reuse(): ...@@ -690,3 +690,35 @@ def test_bigfile_zblk1_zdata_reuse():
assert len(zdata_v1) == len(zdata_v2) assert len(zdata_v1) == len(zdata_v2)
for i in range(len(zdata_v1)): for i in range(len(zdata_v1)):
assert zdata_v1[i] is zdata_v2[i] assert zdata_v1[i] is zdata_v2[i]
# Minimal test to ensure normal operations work as expected
# with zblk fmt 'h'
@func
def test_bigfile_zblk_fmt_heuristic():
    """Check that plain writes and reads work with ZBlk format 'h'."""
    root = dbopen()
    defer(lambda: dbclose(root))

    # switch ZBlk_fmt_write to 'h' for the duration of this test only
    saved_fmt = file_zodb.ZBlk_fmt_write
    file_zodb.ZBlk_fmt_write = 'h'

    def restore_fmt():
        file_zodb.ZBlk_fmt_write = saved_fmt
    defer(restore_fmt)

    root['zfile8'] = zfile = ZBigFile(blksize)
    transaction.commit()

    fileh = zfile.fileh_open()
    vma = fileh.mmap(0, blen)
    blk0 = Blk(vma, 0)

    # fill the whole block and store it
    blk0[:] = 1
    transaction.commit()
    assert (blk0 == 1).all()

    # change a single element of the already stored block
    blk0[0] = 2
    transaction.commit()
    assert blk0[0] == 2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment