oomtop: First draft

This is the first draft of a utility to process a dmesg stream and report information about the OOM events found there, with useful postprocessing. For now, "useful" means sorting the process dump list by RSS, so that it is easier to track which processes were using memory.

oomtop: First draft
This is the first draft of a utility to process a dmesg stream and report information about the OOM events found there, with useful postprocessing. For now, "useful" means sorting the process dump list by RSS, so that it is easier to track which processes were using memory.
8d50e9a3 · Kirill Smelkov · 8d50e9a3
Commit 8d50e9a3 authored Nov 29, 2017 by Kirill Smelkov
Show whitespace changes
Inline Side-by-side

Showing with 311 additions and 0 deletions

oomtop oomtop +311 -0

No files found.
--- a/oomtop
+++ b/oomtop
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2017  Nexedi SA and Contributors.
+#                     Kirill Smelkov <kirr@nexedi.com>
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+"""oomtop - extract OOM events from dmesg-like stream and report them handy"""
+
+from __future__ import print_function
+
+import re
+import sys
+
+# DmesgReader wraps IO reader's .readline() to read lines from dmesg-like stream.
+#
+# One message is a timestamped line, e.g.
+#
+#   [    0.789905] pci 0000:ff:0f.0: [8086:6ff8] type 00 class 0x088000
+#
+# optionally followed by continuation lines which are indented up to the column
+# where the text of the first line starts.
+class DmesgReader(object):
+
+    # f is file-like object providing .readline().
+    # f.name, if present, is used to prefix error messages.
+    def __init__(self, f):
+        self._f      = f
+        self._buf    = []   # [] of lines already read from f
+        self.lineno  = 0    # line number of last readline
+        self._line   = None # last read line
+
+    # _readline returns next raw line ('' at EOF).
+    # lines pushed back via _unreadline are returned first.
+    def _readline(self):
+        if self._buf:
+            l = self._buf.pop(0)
+        else:
+            l = self._f.readline()
+
+        if l:
+            self.lineno += 1
+
+        self._line = l
+        return l
+
+    # _unreadline pushes l back so that next _readline returns it again
+    def _unreadline(self, l):
+        self._buf.insert(0, l)
+        self.lineno -= 1
+
+    # badline reports a problem found around currently-read line.
+    # it always raises RuntimeError.
+    def badline(self, msg):
+        # not every file-like object has .name (e.g. StringIO does not) -
+        # do not let that hide the real problem behind AttributeError.
+        name = getattr(self._f, 'name', '<input>')
+        raise RuntimeError("%s+%d: invalid line: %s (%r)" % (name, self.lineno, msg, self._line))
+
+    # readline reads one line from dmesg stream and returns (timestamp, text) or (None, None) at EOF
+    #
+    # timestamp is float; text is message text with continuation lines joined
+    # by '\n' and without trailing whitespace.
+    #
+    # RuntimeError is raised (via badline) if input is not in dmesg format.
+    def readline(self):
+        # e.g. `[    0.789905] pci 0000:ff:0f.0: [8086:6ff8] type 00 class 0x088000`
+        l = self._readline()
+        if not l:
+            return (None, None) # EOF
+
+        if l[0] != '[':
+            self.badline('no timestamp start')
+        ket = l.find(']')
+        if ket == -1:
+            self.badline('no timestamp end')
+
+        if l[ket+1:ket+2] != ' ':
+            self.badline('no SP after timestamp')
+
+        tstr = l[:ket+1]            # [    0.789905]
+        try:
+            t = float(tstr[1:-1])
+        except ValueError:
+            # e.g. human-readable timestamps - report with file/line context
+            self.badline('invalid timestamp')
+
+        text = l[ket+1+1:]
+
+        # read rest of the text if it was multiline passed to printk
+        contindent = ' '*(len(tstr)+1)
+        while True:
+            l = self._readline()
+            if not l:
+                break   # EOF
+
+            if l[0] == '[':
+                self._unreadline(l) # next line
+                break
+
+            if l == '\n':
+                # sometimes completely empty line goes in dmesg, e.g.
+                # [    0.331453] NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
+                #
+                # [    0.331583] .... node  #0, CPUs:     #2
+                # let's assume it is continuation.
+                l = contindent + '\n'
+
+            # this is continuation - check it has proper indent and append to text if so
+            if not l.startswith(contindent):
+                self.badline('bad continuation')
+
+            text += l[len(contindent):]
+
+        text = text.rstrip()    # without trailing NL
+
+        return (t, text)
+
+
+# Both patterns below are applied with .match(), i.e. anchored at the start of
+# the message text (the text returned by DmesgReader.readline, which has the
+# timestamp already removed).
+
+# start of an OOM:
+# [1129511.760942] mysqld invoked oom-killer: gfp_mask=0x24201ca, order=0, oom_score_adj=0
+oom_start_re = re.compile('.* invoked oom-killer:')
+
+# end of an OOM:
+# [1129511.762174] Killed process 176988 (mysqld) total-vm:61525744kB, anon-rss:49241736kB, file-rss:0kB
+oom_end_re = re.compile('Killed process ')
+
+
+# OOMReader turns a DmesgReader into a reader of whole OOM events
+class OOMReader(object):
+
+    # r is the DmesgReader to take messages from
+    def __init__(self, r):
+        self._r = r
+
+    # readoom returns next OOM event found in the input as parsed OOMEvent,
+    # or None if input ends before another event starts.
+    #
+    # an event spans from "... invoked oom-killer:" message to the following
+    # "Killed process ..." message. it is reported as bad input (via
+    # DmesgReader.badline) if input ends inside an event, or if a message inside
+    # the event is more than 0.1 later than the message that started it.
+    def readoom(self):
+        dmesg = self._r
+
+        # skip messages until the one that opens an event
+        t, text = dmesg.readline()
+        while t is not None and not oom_start_re.match(text):
+            t, text = dmesg.readline()
+        if t is None:
+            return None
+
+        ev = OOMEvent()
+        ev.timestamp = t
+        ev.lineno    = dmesg.lineno
+        ev.linev     = [text]
+
+        # collect messages up to and including the one that closes the event
+        closed = False
+        while not closed:
+            t, text = dmesg.readline()
+            if t is None:
+                dmesg.badline('oom: unexpected EOF while searching for event end (started @ line %d)' % ev.lineno)
+
+            if t - ev.timestamp > 0.1:
+                dmesg.badline('oom: too much time drift while searching for event end (started @ line %d)' % ev.lineno)
+
+            ev.linev.append(text)
+            closed = oom_end_re.match(text) is not None
+
+        ev._parse()
+        return ev
+
+# start of proc list in OOM dump
+# [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
+#
+# The header is matched literally (column names and spacing), anchored at the
+# start of the message text, so a dump with a different set of columns is not
+# recognized as a process list.
+oom_procv_start = re.compile(r"""\[ pid \]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name""")
+
+# OOMEvent represents one OOM event extracted from a dmesg log
+class OOMEvent(object):
+
+    # set by the code that creates the event:
+    #
+    # .timestamp    - time of the event
+    # .lineno       - first line of the event
+    # .linev        - full event text lines
+    #
+    # set by _parse:
+    #
+    # .actionv      - text about OOM action taken (lines after process dump)
+    # .procv        - information about all processes at the time of the event ([] of Proc)
+    # .detailv      - text about other details (lines before process dump header)
+
+    # _parse parses event text (.linev) to build .procv, .actionv and .detailv
+    #
+    # RuntimeError is raised if process dump header is not found, or if a line
+    # of the process dump does not have the expected number of fields.
+    def _parse(self):
+        # search for process dump start
+        for proc_start, l in enumerate(self.linev):
+            if oom_procv_start.match(l):
+                break
+        else:
+            raise RuntimeError("oom: event @ line %d: cannot find procv start" % self.lineno)
+
+        # parse process dump: it lasts while lines look like `[<pid>] ...`.
+        # proc_end is kept as index of first line after the dump, so it is
+        # correct also when the dump is empty or extends up to the last line.
+        procv    = []
+        proc_end = proc_start + 1
+        while proc_end < len(self.linev):
+            P = self._parse_proc(self.linev[proc_end])
+            if P is None:
+                break
+
+            procv.append(P)
+            proc_end += 1
+
+        self.procv = procv
+        self.actionv = self.linev[proc_end:]
+        self.detailv = self.linev[:proc_start]
+
+    # _parse_proc parses one line of process dump and returns Proc,
+    # or None if l does not start with `[...]`, i.e. is not a process dump line.
+    def _parse_proc(self, l):
+        # [ 1501]     0  1501    44175      106      19       3        0             0 lvmetad
+
+        # startswith, not l[0] - text of a message can be empty
+        if not l.startswith('['):
+            return None
+        ket = l.find(']')
+        if ket == -1:
+            return None
+
+        # 8 numeric fields + name; name is everything that remains, so that
+        # process names with spaces inside are kept whole.
+        fv = l[ket+1:].split(None, 8)
+        if len(fv) != 9:
+            raise RuntimeError("oom: event @ line %d: invalid process line: %r" % (self.lineno, l))
+
+        P = Proc()
+        # pid occupies everything in between the brackets (`[ 1501]`, `[176988]`);
+        # int() ignores the padding spaces.
+        P.pid           = int(l[1:ket])
+        P.uid           = int(fv[0])
+        P.tgid          = int(fv[1])
+        P.total_vm      = int(fv[2])
+        P.rss           = int(fv[3])
+        P.nr_ptes       = int(fv[4])
+        P.nr_pmds       = int(fv[5])
+        P.swapents      = int(fv[6])
+        P.oom_score_adj = int(fv[7])
+        P.name          = fv[8]
+        return P
+
+
+# Proc represents one process in OOM dump
+#
+# It is a plain record: attributes are assigned by the code that parses the
+# process dump, and are named after the columns of the dump header
+# ([ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name).
+# All attributes except .name are parsed with int(); .name is text.
+class Proc(object):
+    # .pid
+    # .uid
+    # .tgid
+    # .total_vm
+    # .rss
+    # .nr_ptes
+    # .nr_pmds
+    # .swapents
+    # .oom_score_adj
+    # .name
+    pass
+
+
+
+# echo reads dmesg from stdin and prints every message back
+# as `<timestamp>\t<text>`, with timestamp formatted to 6 decimal places.
+def echo():
+    dmesg = DmesgReader(sys.stdin)
+
+    t, text = dmesg.readline()
+    while t is not None:
+        print('%.6f\t%s' % (t, text))
+        t, text = dmesg.readline()
+
+
+# format_aligned formats table (e.g. [] of []) to be printed with columns
+# aligned and returns formatted text.
+#
+# rowv is sequence of rows; a row is sequence of fields; rows may have
+# different length. Width of a column is the width of its widest field.
+# Numbers are right-aligned, everything else is left-aligned.
+#
+# In the result rows are joined by '\n'; every field, including the first one,
+# is preceded by one space, and trailing whitespace of a row is removed.
+def format_aligned(rowv):
+    # Python2 has separate unicode and long types; they are gone on Python3,
+    # where str and int cover them.
+    try:
+        text    = unicode
+        numbers = (int, long, float)
+    except NameError:
+        text    = str
+        numbers = (int, float)
+
+    # collect max column width
+    maxw = []
+    for row in rowv:
+        if len(row) > len(maxw):
+            maxw.extend( [0] * (len(row) - len(maxw)) )
+
+        for i, f in enumerate(row):
+            maxw[i] = max(maxw[i], len(text(f)))
+
+    # prepare aligned output
+    linev = []
+    for row in rowv:
+        fieldv = []
+        for f, w in zip(row, maxw):
+            # numbers are right-aligned, everything else - left
+            align = '' if isinstance(f, numbers) else '-'
+            # (f,) - so that a field which is itself a tuple is formatted as
+            # one value instead of being taken as the list of % arguments
+            fieldv.append(('%%%s%is' % (align, w)) % (f,))
+
+        line = ''.join(' ' + fstr for fstr in fieldv)
+
+        # strip trailing empty field, if any
+        linev.append(line.rstrip())
+
+    return '\n'.join(linev)
+
+
+
+# main reads dmesg from stdin and, for every OOM event found there, prints:
+#
+#   - the text about the action taken,
+#   - the table of processes with the largest rss (at most ntop of them),
+#   - the rest of event details.
+def main():
+    ntop = 20   # how many processes to show for one event
+
+    f = OOMReader(DmesgReader(sys.stdin))
+    while True:
+        oom = f.readoom()
+        if oom is None:
+            break
+
+        print()
+        print('%.6f OOM ---- 8< ----' % oom.timestamp)
+        print('\n'.join(oom.actionv))
+
+        fieldv = ('pid', 'uid', 'tgid', 'total_vm', 'rss', 'nr_ptes', 'nr_pmds', 'swapents', 'oom_score_adj', 'name')
+        outv = [fieldv]
+        procv = sorted(oom.procv, key = lambda _: _.rss, reverse=True)
+        for p in procv[:ntop]:
+            outv.append( tuple(getattr(p, _) for _ in fieldv) )
+        # show the "there is more" marker only if the list was really cut
+        if len(procv) > ntop:
+            outv.append(('...',))
+        out_text = format_aligned(outv)
+        print(out_text)
+        print()
+        print('\n'.join(oom.detailv))
+        print('---- 8< ----')
+
+
+
+# run main only when the file is executed as a program, not when it is imported
+if __name__ == '__main__':
+    main()