Commit 3444d43f authored by Leif Walsh, committed by Yoni Fogel

refs #5781 #5670 clean up stress test script, email about more failures


git-svn-id: file:///svn/toku/tokudb@51326 c7de825b-a66e-492c-adef-691d508d4ae1
parent 2a118bd9
...@@ -33,6 +33,7 @@ from signal import signal, SIGHUP, SIGINT, SIGPIPE, SIGALRM, SIGTERM ...@@ -33,6 +33,7 @@ from signal import signal, SIGHUP, SIGINT, SIGPIPE, SIGALRM, SIGTERM
from smtplib import SMTP from smtplib import SMTP
from socket import gethostname from socket import gethostname
from subprocess import call, Popen, PIPE, STDOUT from subprocess import call, Popen, PIPE, STDOUT
from traceback import format_exc
from tempfile import mkdtemp, mkstemp from tempfile import mkdtemp, mkstemp
from threading import Event, Thread, Timer from threading import Event, Thread, Timer
...@@ -110,10 +111,9 @@ class Killed(Exception): ...@@ -110,10 +111,9 @@ class Killed(Exception):
pass pass
class TestRunnerBase(object): class TestRunnerBase(object):
def __init__(self, scheduler, builddir, installdir, rev, jemalloc, execf, tsize, csize, default_test_time, savedir): def __init__(self, scheduler, builddir, rev, execf, tsize, csize, default_test_time, savedir):
self.scheduler = scheduler self.scheduler = scheduler
self.builddir = builddir self.builddir = builddir
self.installdir = installdir
self.rev = rev self.rev = rev
self.execf = execf self.execf = execf
self.tsize = tsize self.tsize = tsize
...@@ -123,18 +123,6 @@ class TestRunnerBase(object): ...@@ -123,18 +123,6 @@ class TestRunnerBase(object):
self.savedir = savedir self.savedir = savedir
self.env = os.environ self.env = os.environ
libpath = os.path.join(self.installdir, 'lib')
if 'LD_LIBRARY_PATH' in self.env:
self.env['LD_LIBRARY_PATH'] = '%s:%s' % (libpath, self.env['LD_LIBRARY_PATH'])
else:
self.env['LD_LIBRARY_PATH'] = libpath
if jemalloc is not None and len(jemalloc) > 0:
preload = os.path.normpath(jemalloc)
if 'LD_PRELOAD' in self.env:
self.env['LD_PRELOAD'] = '%s:%s' % (preload, self.env['LD_PRELOAD'])
else:
self.env['LD_PRELOAD'] = preload
self.nruns = 0 self.nruns = 0
self.num_ptquery = 1 self.num_ptquery = 1
...@@ -226,10 +214,13 @@ class TestRunnerBase(object): ...@@ -226,10 +214,13 @@ class TestRunnerBase(object):
pass pass
except TestFailure: except TestFailure:
self.times[1] = time.time() self.times[1] = time.time()
savedtarfile = self.save() savepfx = '%(execf)s-%(rev)s-%(tsize)d-%(csize)d-%(num_ptquery)d-%(num_update)d-%(phase)s-' % self
savedir = mkdtemp(dir=self.savedir, prefix=savepfx)
tarfile = '%s.tar' % savedir
self.scheduler.email_failure(self, tarfile)
self.save(savedir, tarfile)
self.scheduler.report_failure(self) self.scheduler.report_failure(self)
warning('Saved environment to %s', savedtarfile) warning('Saved environment to %s', tarfile)
self.scheduler.email_failure(self, savedtarfile)
else: else:
self.scheduler.report_success(self) self.scheduler.report_success(self)
finally: finally:
...@@ -239,9 +230,7 @@ class TestRunnerBase(object): ...@@ -239,9 +230,7 @@ class TestRunnerBase(object):
self.times = [0, 0] self.times = [0, 0]
self.nruns += 1 self.nruns += 1
def save(self): def save(self, savedir, tarfile):
savepfx = '%(execf)s-%(rev)s-%(tsize)d-%(csize)d-%(num_ptquery)d-%(num_update)d-%(phase)s-' % self
savedir = mkdtemp(dir=self.savedir, prefix=savepfx)
def targetfor(path): def targetfor(path):
return os.path.join(savedir, os.path.basename(path)) return os.path.join(savedir, os.path.basename(path))
...@@ -261,15 +250,12 @@ class TestRunnerBase(object): ...@@ -261,15 +250,12 @@ class TestRunnerBase(object):
os.makedirs(targetdir) os.makedirs(targetdir)
copy(fulllibpath, targetpath) copy(fulllibpath, targetpath)
tarfile = '%s.tar' % savedir
r = call(['tar', 'cf', os.path.basename(tarfile), os.path.basename(savedir)], cwd=os.path.dirname(savedir)) r = call(['tar', 'cf', os.path.basename(tarfile), os.path.basename(savedir)], cwd=os.path.dirname(savedir))
if r != 0: if r != 0:
error('tarring up %s failed.' % savedir) error('tarring up %s failed.' % savedir)
sys.exit(r) sys.exit(r)
os.chmod(tarfile, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH) os.chmod(tarfile, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
return tarfile
def waitfor(self, proc): def waitfor(self, proc):
while proc.poll() is None: while proc.poll() is None:
self.scheduler.stopping.wait(1) self.scheduler.stopping.wait(1)
...@@ -407,7 +393,7 @@ class Worker(Thread): ...@@ -407,7 +393,7 @@ class Worker(Thread):
except Exception, e: except Exception, e:
exception('Fatal error in worker thread.') exception('Fatal error in worker thread.')
info('Killing all workers.') info('Killing all workers.')
self.scheduler.error = e self.scheduler.error = format_exc()
self.scheduler.stop() self.scheduler.stop()
if test_runner.is_large: if test_runner.is_large:
self.scheduler.nlarge -= 1 self.scheduler.nlarge -= 1
...@@ -456,6 +442,9 @@ class Scheduler(Queue): ...@@ -456,6 +442,9 @@ class Scheduler(Queue):
else: else:
debug('Scheduler stopped by someone else. Joining threads.') debug('Scheduler stopped by someone else. Joining threads.')
self.join() self.join()
if self.error:
send_mail(['leif@tokutek.com'], 'Stress tests scheduler stopped by something, on %s' % gethostname(), self.error)
sys.exit(77)
def join(self): def join(self):
if self.timer is not None: if self.timer is not None:
...@@ -489,12 +478,14 @@ class Scheduler(Queue): ...@@ -489,12 +478,14 @@ class Scheduler(Queue):
h = gethostname() h = gethostname()
if isinstance(runner, UpgradeTestRunnerMixin): if isinstance(runner, UpgradeTestRunnerMixin):
upgradestr = '''The test was upgrading from %s. upgradestr = '''
''' % runner.oldversionstr The test was upgrading from %s.''' % runner.oldversionstr
else: else:
upgradestr = '' upgradestr = ''
m = MIMEText('''A stress test failed on %(hostname)s running %(branch)s at svn revision %(rev)s after %(test_duration)d seconds. send_mail(['tokueng@tokutek.com'],
%(upgradestr)sIts environment is saved to %(tarfile)s on that machine. 'Stress test failure on %(hostname)s running %(branch)s.' % { 'hostname': h, 'branch': self.branch },
('''A stress test failed on %(hostname)s running %(branch)s at svn revision %(rev)s after %(test_duration)d seconds.%(upgradestr)s
Its environment is saved to %(tarfile)s on that machine.
The test configuration was: The test configuration was:
...@@ -503,6 +494,12 @@ num_elements: %(tsize)d ...@@ -503,6 +494,12 @@ num_elements: %(tsize)d
cachetable_size: %(csize)d cachetable_size: %(csize)d
num_ptquery_threads: %(num_ptquery)d num_ptquery_threads: %(num_ptquery)d
num_update_threads: %(num_update)d num_update_threads: %(num_update)d
Commands run:
%(commands)s
Test output:
%(output)s
''' % { ''' % {
'hostname': h, 'hostname': h,
'rev': runner.rev, 'rev': runner.rev,
...@@ -515,49 +512,31 @@ num_update_threads: %(num_update)d ...@@ -515,49 +512,31 @@ num_update_threads: %(num_update)d
'num_ptquery': runner.num_ptquery, 'num_ptquery': runner.num_ptquery,
'num_update': runner.num_update, 'num_update': runner.num_update,
'branch': self.branch, 'branch': self.branch,
}) }))
def send_mail(toaddrs, subject, body):
m = MIMEText(body)
fromaddr = 'tim@tokutek.com' fromaddr = 'tim@tokutek.com'
toaddrs = ['tokueng@tokutek.com']
m['From'] = fromaddr m['From'] = fromaddr
m['To'] = ', '.join(toaddrs) m['To'] = ', '.join(toaddrs)
m['Subject'] = 'Stress test failure on %(hostname)s running %(branch)s.' % { 'hostname': h, 'branch': self.branch } m['Subject'] = subject
s = SMTP('192.168.1.114') s = SMTP('192.168.1.114')
s.sendmail(fromaddr, toaddrs, str(m)) s.sendmail(fromaddr, toaddrs, str(m))
s.quit() s.quit()
def compiler_works(cc): def rebuild(tokudb, builddir, cc, cxx, tests):
try:
devnull = open(os.devnull, 'w')
r = call([cc, '-v'], stdout=devnull, stderr=STDOUT)
devnull.close()
return r == 0
except OSError:
exception('Error running %s.', cc)
return False
def rebuild(tokudb, builddir, installdir, cc, tests):
info('Updating from svn.') info('Updating from svn.')
devnull = open(os.devnull, 'w') devnull = open(os.devnull, 'w')
call(['svn', 'up'], stdout=devnull, stderr=STDOUT, cwd=tokudb) call(['svn', 'up'], stdout=devnull, stderr=STDOUT, cwd=tokudb)
devnull.close() devnull.close()
if not compiler_works(cc):
error('Cannot find working compiler named "%s". Try sourcing the icc env script or providing another compiler with --cc.', cc)
sys.exit(2)
if cc == 'icc':
iccstr = 'ON'
else:
iccstr = 'OFF'
info('Building tokudb.') info('Building tokudb.')
if not os.path.exists(builddir): if not os.path.exists(builddir):
os.mkdir(builddir) os.mkdir(builddir)
newenv = os.environ newenv = os.environ
newenv['CC'] = 'gcc47' newenv['CC'] = cc
newenv['CXX'] = 'g++47' newenv['CXX'] = cxx
r = call(['cmake', r = call(['cmake',
'-DCMAKE_BUILD_TYPE=Debug', '-DCMAKE_BUILD_TYPE=Debug',
'-DINTEL_CC=%s' % iccstr,
'-DCMAKE_INSTALL_PREFIX=%s' % installdir,
'-DUSE_BDB=OFF', '-DUSE_BDB=OFF',
'-DUSE_GTAGS=OFF', '-DUSE_GTAGS=OFF',
'-DUSE_CTAGS=OFF', '-DUSE_CTAGS=OFF',
...@@ -567,10 +546,12 @@ def rebuild(tokudb, builddir, installdir, cc, tests): ...@@ -567,10 +546,12 @@ def rebuild(tokudb, builddir, installdir, cc, tests):
env=newenv, env=newenv,
cwd=builddir) cwd=builddir)
if r != 0: if r != 0:
send_mail(['leif@tokutek.com'], 'Stress tests on %s failed to build.' % gethostname(), '')
error('Building the tests failed.') error('Building the tests failed.')
sys.exit(r) sys.exit(r)
r = call(['make', '-j8', 'install'], cwd=builddir) r = call(['make', '-j8'], cwd=builddir)
if r != 0: if r != 0:
send_mail(['leif@tokutek.com'], 'Stress tests on %s failed to build.' % gethostname(), '')
error('Building the tests failed.') error('Building the tests failed.')
sys.exit(r) sys.exit(r)
...@@ -584,9 +565,8 @@ def revfor(tokudb): ...@@ -584,9 +565,8 @@ def revfor(tokudb):
def main(opts): def main(opts):
builddir = os.path.join(opts.tokudb, 'build') builddir = os.path.join(opts.tokudb, 'build')
installdir = os.path.join(opts.tokudb, 'install')
if opts.build: if opts.build:
rebuild(opts.tokudb, builddir, installdir, opts.cc, opts.testnames + opts.recover_testnames) rebuild(opts.tokudb, builddir, opts.cc, opts.cxx, opts.testnames + opts.recover_testnames)
rev = revfor(opts.tokudb) rev = revfor(opts.tokudb)
if not os.path.exists(opts.savedir): if not os.path.exists(opts.savedir):
...@@ -608,9 +588,7 @@ def main(opts): ...@@ -608,9 +588,7 @@ def main(opts):
kwargs = { kwargs = {
'scheduler': scheduler, 'scheduler': scheduler,
'builddir': builddir, 'builddir': builddir,
'installdir': installdir,
'rev': rev, 'rev': rev,
'jemalloc': opts.jemalloc,
'tsize': tsize, 'tsize': tsize,
'csize': csize, 'csize': csize,
'default_test_time': opts.test_time, 'default_test_time': opts.test_time,
...@@ -676,7 +654,7 @@ def main(opts): ...@@ -676,7 +654,7 @@ def main(opts):
if scheduler.error is not None: if scheduler.error is not None:
error('Scheduler reported an error.') error('Scheduler reported an error.')
raise scheduler.error raise scheduler.error
rebuild(opts.tokudb, builddir, installdir, opts.cc, opts.testnames + opts.recover_testnames) rebuild(opts.tokudb, builddir, opts.cc, opts.cxx, opts.testnames + opts.recover_testnames)
rev = revfor(opts.tokudb) rev = revfor(opts.tokudb)
for runner in runners: for runner in runners:
runner.rev = rev runner.rev = rev
...@@ -684,6 +662,7 @@ def main(opts): ...@@ -684,6 +662,7 @@ def main(opts):
sys.exit(0) sys.exit(0)
except Exception, e: except Exception, e:
exception('Unhandled exception caught in main.') exception('Unhandled exception caught in main.')
send_mail(['leif@tokutek.com'], 'Stress tests caught unhandled exception in main, on %s' % gethostname(), format_exc())
raise e raise e
if __name__ == '__main__': if __name__ == '__main__':
...@@ -748,10 +727,10 @@ if __name__ == '__main__': ...@@ -748,10 +727,10 @@ if __name__ == '__main__':
help='skip the svn up and build phase before testing [default=False]') help='skip the svn up and build phase before testing [default=False]')
build_group.add_option('--rebuild_period', type='int', dest='rebuild_period', default=60 * 60 * 24, build_group.add_option('--rebuild_period', type='int', dest='rebuild_period', default=60 * 60 * 24,
help='how many seconds between doing an svn up and rebuild, 0 means never rebuild [default=24 hours]') help='how many seconds between doing an svn up and rebuild, 0 means never rebuild [default=24 hours]')
build_group.add_option('--cc', type='string', dest='cc', default='gcc', build_group.add_option('--cc', type='string', dest='cc', default='gcc47',
help='which compiler to use [default=gcc]') help='which compiler to use [default=gcc47]')
build_group.add_option('--jemalloc', type='string', dest='jemalloc', build_group.add_option('--cxx', type='string', dest='cxx', default='g++47',
help='a libjemalloc.so to put in LD_PRELOAD when running tests') help='which compiler to use [default=g++47]')
build_group.add_option('--add_test', action='append', type='string', dest='testnames', default=default_testnames, build_group.add_option('--add_test', action='append', type='string', dest='testnames', default=default_testnames,
help=('add a stress test to run [default=%r]' % default_testnames)) help=('add a stress test to run [default=%r]' % default_testnames))
build_group.add_option('--add_recover_test', action='append', type='string', dest='recover_testnames', default=default_recover_testnames, build_group.add_option('--add_recover_test', action='append', type='string', dest='recover_testnames', default=default_recover_testnames,
...@@ -776,6 +755,9 @@ if __name__ == '__main__': ...@@ -776,6 +755,9 @@ if __name__ == '__main__':
if len(args) > 0: if len(args) > 0:
parser.error('Invalid arguments: %r' % args) parser.error('Invalid arguments: %r' % args)
if len(opts.old_versions) > 0:
opts.run_upgrade = True
if opts.run_upgrade: if opts.run_upgrade:
if not os.path.isdir(opts.old_environments_dir): if not os.path.isdir(opts.old_environments_dir):
parser.error('You specified --run_upgrade but did not specify an --old_environments_dir that exists.') parser.error('You specified --run_upgrade but did not specify an --old_environments_dir that exists.')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment