Commit 8d50e9a3 authored by Kirill Smelkov's avatar Kirill Smelkov

oomtop: First draft

This is the first draft of a utility to process a dmesg stream and report
information about OOM events found there, with useful postprocessing.

For now useful = sort process dump list by RSS so it is easier to track
which processes were using memory.
parents
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""oomtop - extract OOM events from dmesg-like stream and report them handy"""
from __future__ import print_function
import re
import sys
# DmesgReader wraps IO reader's .readline() to read lines from dmesg-like stream
class DmesgReader(object):
    """Read logical messages from a dmesg-like text stream.

    `f` is any object with a `.readline()` method returning text lines
    ('' at EOF).  Every message is expected to start with a
    `[<timestamp>] ` prefix; lines that follow without such a prefix are
    treated as continuation of a multiline message.
    """

    def __init__(self, f):
        self._f = f
        self._buf = []      # [] of lines pushed back by _unreadline (used as a stack)
        self.lineno = 0     # line number of last readline
        self._line = None   # last read line

    # _readline returns next raw line - either a pushed back one or a fresh
    # one from the underlying stream. Returns '' at EOF.
    def _readline(self):
        if self._buf:
            l = self._buf.pop()
        else:
            l = self._f.readline()
        if l:
            self.lineno += 1
        self._line = l
        return l

    # _unreadline pushes a line back so that next _readline returns it again
    def _unreadline(self, l):
        self._buf.append(l)
        self.lineno -= 1

    # report a problem found around currently-read line
    def badline(self, msg):
        """Raise RuntimeError describing problem `msg` at the current position.

        The underlying stream is not required to have a `.name` attribute
        (e.g. io.StringIO has none) - '<input>' is reported in that case.
        """
        name = getattr(self._f, 'name', '<input>')
        raise RuntimeError("%s+%d: invalid line: %s (%r)" % (name, self.lineno, msg, self._line))

    # readline reads one line from dmesg stream and returns (timestamp, text) or (None, None) at EOF
    def readline(self):
        """Read one message and return (timestamp, text), or (None, None) at EOF.

        timestamp is a float, text is the message without the timestamp
        prefix and without trailing whitespace. Continuation lines of a
        multiline message are joined into text with '\\n'.

        Raises RuntimeError (via badline) if the input is malformed.
        """
        # e.g. `[    0.789905] pci 0000:ff:0f.0: [8086:6ff8] type 00 class 0x088000`
        l = self._readline()
        if not l:
            return (None, None)     # EOF

        if l[0] != '[':
            self.badline('no timestamp start')
        ket = l.find(']')
        if ket == -1:
            self.badline('no timestamp end')
        if l[ket+1:ket+2] != ' ':
            self.badline('no SP after timestamp')

        tstr = l[:ket+1]            # `[    0.789905]`
        try:
            t = float(tstr[1:-1])
        except ValueError:
            # report garbage inside [] with file/line context like other problems
            self.badline('invalid timestamp')
        text = l[ket+1+1:]          # skip `]` and the SP after it

        # read rest of the text if it was multiline passed to printk
        contindent = ' ' * (len(tstr) + 1)
        while 1:
            l = self._readline()
            if not l:
                break               # EOF

            if l[0] == '[':
                self._unreadline(l) # next message - keep it for next readline
                break

            if l == '\n':
                # sometimes completely empty line goes in dmesg, e.g.
                # [    0.331453] NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
                #
                # [    0.331583] .... node #0, CPUs: #2
                # let's assume it is continuation.
                l = contindent + '\n'

            # this is continuation - check it has proper indent and append to text if so
            if not l.startswith(contindent):
                self.badline('bad continuation')
            text += l[len(contindent):]

        text = text.rstrip()        # without trailing NL
        return (t, text)
# start of an OOM event, e.g.:
# [1129511.760942] mysqld invoked oom-killer: gfp_mask=0x24201ca, order=0, oom_score_adj=0
# (used with .match() on message text as returned by DmesgReader.readline, i.e. without timestamp)
oom_start_re = re.compile('.* invoked oom-killer:')
# end of an OOM event, e.g.:
# [1129511.762174] Killed process 176988 (mysqld) total-vm:61525744kB, anon-rss:49241736kB, file-rss:0kB
# (used with .match(), so the message text has to begin with `Killed process `)
oom_end_re = re.compile('Killed process ')
# OOMReader wraps DmesgReader to read OOM events one by one
class OOMReader(object):
    """Extract OOM events from messages provided by a DmesgReader.

    r         - DmesgReader-like object providing .readline(), .lineno and .badline()
    max_drift - max allowed difference (in timestamp units) between timestamp
                of event start and timestamp of any other line of the same
                event. It was hard-coded to 0.1 before and this stays the default.
    """

    def __init__(self, r, max_drift=0.1):
        self._r = r
        self._max_drift = max_drift

    # readoom reads one OOM event from input stream and returns OOMEvent instance or None at EOF
    def readoom(self):
        """Read next OOM event and return it as parsed OOMEvent, or None at EOF.

        Raises RuntimeError (via r.badline) if the input ends in the middle
        of an event, or if event lines are too far in time from event start.
        """
        r = self._r

        # search for OOM event header
        while 1:
            t, l = r.readline()
            if t is None:
                return None
            if oom_start_re.match(l):
                break

        ev = OOMEvent()
        ev.timestamp = t
        ev.lineno = r.lineno
        ev.linev = [l]

        # read till OOM event footer
        while 1:
            t, l = r.readline()
            if t is None:
                r.badline('oom: unexpected EOF while searching for event end (started @ line %d)' % ev.lineno)
            # lines that are too far in time from the header are assumed not to belong to this event
            if t - ev.timestamp > self._max_drift:
                r.badline('oom: too much time drift while searching for event end (started @ line %d)' % ev.lineno)
            ev.linev.append(l)
            if oom_end_re.match(l):
                break

        ev._parse()
        return ev
# start of proc list in OOM dump
# [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
#
# columns are padded with variable amount of whitespace, so any run of
# whitespace is accepted in between column names.
oom_procv_start = re.compile(
    r"\[\s*pid\s*\]\s+uid\s+tgid\s+total_vm\s+rss\s+nr_ptes\s+nr_pmds\s+swapents\s+oom_score_adj\s+name")


# OOMEvent represents one OOM event extracted from a dmesg log
class OOMEvent(object):
    # .timestamp    - time of the event
    # .lineno       - first line of the event
    # .linev        - full event text lines
    # .actionv      - text about OOM action taken
    # .procv        - information about all processes at the time of the event ([] of Proc)
    # .detailv      - text about other details

    # parse parses event text to build e.g. procv
    def _parse(self):
        """Split .linev into .detailv, .procv and .actionv.

        .detailv - lines before the process dump header
        .procv   - [] of Proc, one per `[<pid>] ...` line following the header
        .actionv - lines after the process dump

        Raises RuntimeError if the process dump header is not found or a
        process line has too few fields.
        """
        # search for process dump start
        for proc_start, l in enumerate(self.linev):
            if oom_procv_start.match(l):
                break
        else:
            raise RuntimeError("oom: event @ line %d: cannot find procv start" % self.lineno)

        # parse process dump
        procv = []
        proc_end = proc_start + 1   # index of first line after the process dump
        for l in self.linev[proc_start + 1:]:
            # [ 1501]     0  1501    44175      106      19       3        0             0 lvmetad
            # does not start with [ <pid> ] - end of process dump
            if not l.startswith('['):
                break
            ket = l.find(']')
            if ket == -1:
                break

            # 8 numeric fields + name; name is the rest of the line and may contain spaces
            fv = l[ket+1:].split(None, 8)
            if len(fv) < 9:
                raise RuntimeError("oom: event @ line %d: invalid process line (%r)" % (self.lineno, l))

            P = Proc()
            P.pid           = int(l[1:ket])     # everything in between [ and ]; int ignores padding
            P.uid           = int(fv[0])
            P.tgid          = int(fv[1])
            P.total_vm      = int(fv[2])
            P.rss           = int(fv[3])
            P.nr_ptes       = int(fv[4])
            P.nr_pmds       = int(fv[5])
            P.swapents      = int(fv[6])
            P.oom_score_adj = int(fv[7])
            P.name          = fv[8].rstrip()
            procv.append(P)
            proc_end += 1

        self.procv = procv
        self.actionv = self.linev[proc_end:]
        self.detailv = self.linev[:proc_start]


# Proc represent one process in OOM dump
class Proc(object):
    # attributes are assigned by OOMEvent._parse from columns of the process dump:
    # .pid
    # .uid
    # .tgid
    # .total_vm
    # .rss
    # .nr_ptes
    # .nr_pmds
    # .swapents
    # .oom_score_adj
    # .name
    pass
# read/dump dmesg back
def echo():
    """Read dmesg from stdin and print every message as `<timestamp>\\t<text>`."""
    reader = DmesgReader(sys.stdin)
    # readline returns (None, None) at EOF - use it as the stop sentinel
    for t, line in iter(reader.readline, (None, None)):
        print('%.6f\t%s' % (t, line))
# `unicode` and `long` exist only on Python 2 - pick equivalents that work on
# both Python 2 and Python 3 so that format_aligned does not fail with NameError.
try:
    _text_type = unicode
    _number_types = (int, long, float)
except NameError:
    _text_type = str
    _number_types = (int, float)


# format_aligned formats table (e.g. [] of []) to be printed with columns
# aligned and returns formatted text.
def format_aligned(rowv):
    """Format table `rowv` (sequence of rows, each a sequence of fields) as text.

    Every column is as wide as its widest field. Numbers are right-aligned,
    everything else is left-aligned. Rows may have different lengths.

    Returns the rows joined with '\\n'. Every field is prefixed with a single
    space (so lines start with a space) and trailing whitespace of each line
    is stripped. Empty table gives ''.
    """
    # collect max column width
    maxw = []
    for row in rowv:
        if len(row) > len(maxw):
            maxw.extend([0] * (len(row) - len(maxw)))
        for i, f in enumerate(row):
            maxw[i] = max(maxw[i], len(_text_type(f)))

    # prepare aligned output
    linev = []
    for row in rowv:
        line = ''
        for f, w in zip(row, maxw):
            # numbers are right-aligned, everything else - left
            if isinstance(f, _number_types):
                fstr = ('%%%is' % w) % (f,)
            else:
                fstr = ('%%-%is' % w) % (f,)
            line = ' '.join((line, fstr))
        # strip trailing empty field, if any
        linev.append(line.rstrip())

    return '\n'.join(linev)
def main():
    """Read dmesg from stdin and print a report for every OOM event found.

    For each event the report contains: the action taken, the table of
    processes with the largest RSS (at most 20 rows), and the other details.
    """
    top = 20    # how many processes to show at most
    fieldv = ('pid', 'uid', 'tgid', 'total_vm', 'rss', 'nr_ptes', 'nr_pmds', 'swapents', 'oom_score_adj', 'name')

    f = OOMReader(DmesgReader(sys.stdin))
    while 1:
        oom = f.readoom()
        if oom is None:
            break

        print()
        print('%.6f OOM ---- 8< ----' % oom.timestamp)
        print('\n'.join(oom.actionv))

        # process table: header + biggest memory users first
        by_rss = sorted(oom.procv, key=lambda p: p.rss, reverse=True)
        outv = [fieldv]
        for p in by_rss[:top]:
            outv.append(tuple(getattr(p, name) for name in fieldv))
        if len(by_rss) > top:
            outv.append(('...',))   # tell the user the list was cut
        print(format_aligned(outv))

        print()
        print('\n'.join(oom.detailv))
        print('---- 8< ----')


if __name__ == '__main__':
    main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment