stat_zodb.py 5.6 KB
Newer Older
1 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
3
import math, os, random, sys, time
4
from cStringIO import StringIO
5 6
from persistent.TimeStamp import TimeStamp
from ZODB.utils import p64, newTid
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
from ZODB.BaseStorage import TransactionRecord
from ZODB.FileStorage import FileStorage

# Stats of a 43.5 GB production Data.fs
#                          µ               σ
# size of object           6.04237779991   1.55811487853
# # objects / transaction  1.04108991045   0.906703192546
# size of transaction      7.98615420517   1.6624220402
#
# % of new object / transaction: 0.810080409164
# # of transactions: 1541194
# compression ratio: 28.5 % (gzip -6)
def PROD1(random=random):
    """Return a DummyZODB set up with the production statistics above.

    The five numeric parameters are, in order: mu and sigma of the object
    size, mu and sigma of the number of objects per transaction, and the
    ratio of new objects per transaction.  `random` is handed over to
    DummyZODB as its source of randomness.
    """
    obj_size = 6.04237779991, 1.55811487853
    obj_count = 1.04108991045, 0.906703192546
    new_ratio = 0.810080409164
    return DummyZODB(obj_size[0], obj_size[1],
                     obj_count[0], obj_count[1],
                     new_ratio, random)

def DummyData(random=random):
    """Return an in-memory file filled with pseudo-random filler data.

    Each of the 100000 characters is drawn from a narrow gaussian
    (sigma 0.8) truncated to an integer and wrapped into the 0..255 range.
    `random` only needs to provide a `gauss` method.
    """
    # The narrow distribution makes the data gzip at about 28.5 %.
    # The sample must stay bigger than the dictionary of the compressor.
    gauss = random.gauss
    sample = [chr(int(gauss(0, .8)) % 256) for _ in xrange(100000)]
    return StringIO(''.join(sample))


class DummyZODB(object):
    """
    Generator of synthetic ZODB transactions.

    Object size and count of generated transaction follows a log normal
    distribution, where *_mu and *_sigma are their parameters.

    new_ratio is the probability that an object written by a transaction
    gets a brand new oid instead of reusing an already allocated one.

    random is the source of randomness: any object providing
    lognormvariate(), random() and randrange() (by default, the 'random'
    module itself).
    """

    def __init__(self, obj_size_mu, obj_size_sigma,
                       obj_count_mu, obj_count_sigma,
                       new_ratio, random=random):
        self.obj_size_mu = obj_size_mu
        self.obj_size_sigma = obj_size_sigma
        self.obj_count_mu = obj_count_mu
        self.obj_count_sigma = obj_count_sigma
        self.random = random
        self.new_ratio = new_ratio
        # Next oid to allocate: oids 0 .. next_oid-1 already exist.
        self.next_oid = 0
        # Not used anywhere in this class; kept as a public attribute.
        self.err_count = 0

    def __call__(self):
        """
        Generate the content of one transaction.

        Yields (oid, size) pairs, where oid is packed with p64() and size
        is at least 1. A given oid is yielded at most once per call.
        """
        variate = self.random.lognormvariate
        oid_set = set()
        # 'or 1': a transaction always contains at least one object.
        for i in xrange(int(round(variate(self.obj_count_mu,
                                          self.obj_count_sigma))) or 1):
            # A new oid is forced when every existing oid is already part
            # of this transaction, otherwise the random pick below could
            # never find a free one.
            if len(oid_set) >= self.next_oid or \
               self.random.random() < self.new_ratio:
                oid = self.next_oid
                self.next_oid = oid + 1
            else:
                while True:
                    oid = self.random.randrange(self.next_oid)
                    if oid not in oid_set:
                        break
            oid_set.add(oid)
            # 'or 1': an object is never empty.
            yield p64(oid), int(round(variate(self.obj_size_mu,
                                              self.obj_size_sigma))) or 1

    def as_storage(self, transaction_count, dummy_data_file=None):
        """
        Return a storage-like object producing generated transactions.

        The returned object provides iterator(), which yields
        transaction_count transaction records, and getSize(), which
        returns the total size of the data yielded so far.

        dummy_data_file is a readable and seekable file object used as an
        endless (rewound at EOF) source of object data. It defaults to
        DummyData(self.random). An empty file raises ValueError as soon as
        data is needed from it.
        """
        if dummy_data_file is None:
            dummy_data_file = DummyData(self.random)

        class dummy_change(object):
            data_txn = None
            version = ''

            def __init__(self, tid, oid, size):
                self.tid = tid
                self.oid = oid
                chunk_list = []
                rewound = False
                while size:
                    chunk = dummy_data_file.read(size)
                    if chunk:
                        rewound = False
                        size -= len(chunk)
                        chunk_list.append(chunk)
                        if not size:
                            break
                    elif rewound:
                        # Nothing to read right after a rewind: the file
                        # is empty and looping again would never end.
                        raise ValueError("dummy data file is empty")
                    # Short read means EOF: start over from the beginning.
                    dummy_data_file.seek(0)
                    rewound = True
                self.data = ''.join(chunk_list)

        class dummy_transaction(TransactionRecord):

            def __init__(transaction, *args):
                TransactionRecord.__init__(transaction, *args)
                transaction_size = 0
                transaction.record_list = []
                add_record = transaction.record_list.append
                # 'self' is the enclosing DummyZODB: calling it generates
                # the (oid, size) pairs of this transaction.
                for oid, size in self():
                    transaction_size += size
                    add_record(dummy_change(transaction.tid, oid, size))
                transaction.size = transaction_size

            def __iter__(transaction):
                return iter(transaction.record_list)

        class dummy_storage(object):
            size = 0

            def iterator(storage, *args):
                # Arguments given by the caller are accepted but ignored.
                # NOTE(review): the constants below are presumably
                # (status, user, description, extension) - confirm against
                # TransactionRecord.
                record_args = ' ', '', '', {}
                tid = None
                for _ in xrange(transaction_count):
                    tid = newTid(tid)
                    t = dummy_transaction(tid, *record_args)
                    storage.size += t.size
                    yield t

            def getSize(self):
                return self.size

        return dummy_storage()

def lognorm_stat(X):
    """Estimate the parameters of a log-normal distribution.

    X is any iterable of strictly positive numbers. Returns (mu, sigma):
    the mean and the (population) standard deviation of the natural
    logarithm of the samples.

    Raises ValueError if X is empty, or (from math.log) if it contains a
    value that is not strictly positive.
    """
    # Build a real list: it is measured with len() and iterated twice, so
    # a lazy iterable (generator, or map() where it is lazy) would not do.
    Y = [math.log(x) for x in X]
    n = len(Y)
    if not n:
        raise ValueError("lognorm_stat() requires at least one sample")
    mu = sum(Y) / n
    s2 = sum(d * d for d in (y - mu for y in Y)) / n
    return mu, math.sqrt(s2)

def stat(*storages):
    """Compute log-normal statistics over the transactions of storages.

    Each storage must provide iterator(), yielding transactions that are
    themselves iterable over records having 'oid' and 'data' attributes.
    Records without data are ignored.

    Returns a 5-tuple:
      - (mu, sigma) of the size of objects,
      - (mu, sigma) of the number of objects per transaction,
      - (mu, sigma) of the size of transactions,
      - number of distinct oids divided by number of object records,
      - number of transactions.

    Transactions without any data record are counted in the last item
    but left out of the log-normal samples, since the logarithm of 0 is
    undefined.

    Raises ValueError if no record with data is found at all.
    """
    obj_size_list = []
    obj_count_list = []
    tr_size_list = []
    oid_set = set()
    tr_count = 0
    for storage in storages:
        for transaction in storage.iterator():
            tr_count += 1
            obj_count = tr_size = 0
            for r in transaction:
                if r.data:
                    obj_count += 1
                    oid_set.add(r.oid)
                    size = len(r.data)
                    tr_size += size
                    obj_size_list.append(size)
            # obj_count == 0 implies tr_size == 0: feeding either to
            # lognorm_stat would fail with a math domain error.
            if obj_count:
                obj_count_list.append(obj_count)
                tr_size_list.append(tr_size)
    if not obj_size_list:
        raise ValueError("no object data found in the given storages")
    new_ratio = float(len(oid_set)) / len(obj_size_list)
    return (lognorm_stat(obj_size_list),
            lognorm_stat(obj_count_list),
            lognorm_stat(tr_size_list),
            new_ratio, tr_count)

def main():
    """Print statistics about the FileStorage files named in sys.argv.

    Every command-line argument is the path of a Data.fs, opened
    read-only. Returns None on success, or 1 after printing a usage
    message on stderr when no path is given.
    """
    path_list = sys.argv[1:]
    if not path_list:
        sys.stderr.write("usage: %s Data.fs [Data.fs ...]\n" % sys.argv[0])
        return 1
    storage_list = []
    try:
        for path in path_list:
            storage_list.append(FileStorage(path, read_only=True))
        s = stat(*storage_list)
    finally:
        # Close whatever was opened, even if a later file failed to open
        # or the computation raised.
        for storage in storage_list:
            storage.close()
    print(u"                         %-15s σ\n"
           "size of object           %-15s %s\n"
           "# objects / transaction  %-15s %s\n"
           "size of transaction      %-15s %s\n"
           "\n%% of new object / transaction: %s"
           "\n# of transactions: %s"
           % ((u"µ",) + s[0] + s[1] + s[2] + s[3:]))


# Script entry point: whatever main() returns becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())