Commit 4da6a16f authored by Kirill Smelkov's avatar Kirill Smelkov

Y wcfs: Switch to custom lsof

Previously, when umount failed, we invoked lsof(8) to show the list of
files that are still open on the filesystem. But lsof(8) turned out to
be unreliable because it stats the filesystem, and if e.g. the wcfs server
process is stopped, lsof only prints

    WARNING:wcfs:# lsof /dev/shm/wcfs/1439df02dfcc41ab9dfb68e7ac4ad615f3b7d46e
    WARNING:wcfs:lsof: status error on /dev/shm/wcfs/1439df02dfcc41ab9dfb68e7ac4ad615f3b7d46e: Transport endpoint is not connected
    ...
    WARNING:wcfs:(lsof failed)

fuser(1) from psmisc works a bit better: it can show the list of files that
are still open on the mounted tree even if the filesystem server has crashed.

However, with some version of fuser I still saw "Transport endpoint is
not connected" once, and in the next patches we will also need to
inspect the "using" processes in more detail, so if we were to use fuser we
would need to parse its output, which might be fragile.

-> Do our own lsof utility instead.

We have all the infrastructure in place to do so in the form of MountDB
and ProcDB, and, as implemented, Mount.lsof() emits Proc objects which can
conveniently be inspected further. For now we do not do such inspection,
but for `wcfs status` and `wcfs stop` we will want to look at the kernel
tracebacks of those processes.
parent a50e2ce9
...@@ -593,29 +593,15 @@ def _mnt_fuse_unmount(mnt, *optv): ...@@ -593,29 +593,15 @@ def _mnt_fuse_unmount(mnt, *optv):
ret, out = _sysproccallout(["fusermount", "-u"] + list(optv) + [mnt.point]) ret, out = _sysproccallout(["fusermount", "-u"] + list(optv) + [mnt.point])
if ret != 0: if ret != 0:
# unmount failed, usually due to "device is busy". # unmount failed, usually due to "device is busy".
# Log which files are still opened and reraise # Log which files are still opened by who and reraise
def _(): def _():
log.warn("# lsof %s" % mnt.point) log.warn("# lsof %s" % mnt.point)
# -w to avoid lots of try:
# lsof: WARNING: can't stat() fuse.wcfs file system /dev/shm/wcfs/X _ = _lsof(mnt)
# Output information may be incomplete. except:
# if there are other uncleaned wcfs mountpoints. log.exception("lsof failed")
# (lsof stats all filesystems on startup) else:
# NOTE lsof +D misbehaves - don't use it log.warn(_)
ret, out = _sysproccallout(["lsof", "-w", mnt.point])
log.warn(out)
if ret:
log.warn("(lsof failed)")
defer(_)
# XXX fuser should work where lsof starts to fail after wcfs going to EIO mode
# ref:ZzO3wtEdQVDw5Wz5@deca.navytux.spb.ru
def _():
log.warn("# fuser -vmM %s" % mnt.point)
ret, out = _sysproccallout(["fuser", "-vmM", mnt.point])
log.warn(out)
if ret:
log.warn("(fuser failed)")
defer(_) defer(_)
out = out.rstrip() # kill trailing \n\n out = out.rstrip() # kill trailing \n\n
...@@ -626,6 +612,19 @@ def _mnt_fuse_unmount(mnt, *optv): ...@@ -626,6 +612,19 @@ def _mnt_fuse_unmount(mnt, *optv):
log.warn(emsg) log.warn(emsg)
raise _FUSEUnmountError("%s\n(more details logged)" % emsg) raise _FUSEUnmountError("%s\n(more details logged)" % emsg)
# lsof returns text description of which processes and which their file
# descriptors use specified mount.
def _lsof(mnt): # -> str
# NOTE lsof(8) fails to work after wcfs goes into EIO mode
# fuser(1) works a bit better, but we still do it ourselves because we
# anyway need to customize output and integrate it with ktraceback
s = ""
for (proc, use) in mnt.lsof():
s += " %s %s\n" % (proc, proc.get("argv", eperm="strerror", gone="strerror"))
for key, path in use.items():
s += "\t%s\t-> %s\n" % (key, path)
return s
# _is_mountpoint returns whether path is a mountpoint # _is_mountpoint returns whether path is a mountpoint
def _is_mountpoint(path): # -> bool def _is_mountpoint(path): # -> bool
# NOTE we don't call mountpoint directly on path, because if FUSE # NOTE we don't call mountpoint directly on path, because if FUSE
......
...@@ -475,6 +475,43 @@ def _pidlist(pdbc, gt): # []pid↑ : pid > gt ...@@ -475,6 +475,43 @@ def _pidlist(pdbc, gt): # []pid↑ : pid > gt
# ---- Mount ----
# lsof reports which processes use the mount and through which resources.
#
# Every emitted item is a (proc, use) pair, where use maps a resource key
# (cwd, exe, root, fd/X, mmap/Y) to the path that resource refers to.
# If pdbc is not given, a ProcDB connection is opened with
# ISOLATION_REPEATABLE_READ.
@func(Mount)
def lsof(mnt, pdbc=None): # -> i() of (proc, {}key -> path) ; key = fd/X, mmap/Y, cwd, ...
    if pdbc is None:
        pdbc = ProcDB.open(isolation_level=ISOLATION_REPEATABLE_READ)
    assert isinstance(pdbc, ProcDBConn)

    # NOTE nothing here may access the filesystem on mnt: if the filesystem
    # server is deadlocked, such access might hang.
    def _uses(proc):
        use = {}

        # cwd/exe/root are matched by path prefix.
        # XXX better somehow to check via devid, but we can't stat the link
        # target because that would touch the filesystem.
        for key in ('cwd', 'exe', 'root'):
            path = proc.get(key)
            if path is None:
                continue
            if path == mnt.point or path.startswith(mnt.point + "/"):
                use[key] = path

        # opened file descriptors are matched by mount id
        use.update(("fd/%d" % ifd.fd, ifd.path)
                   for ifd in proc.fd.values()
                   if ifd.mnt_id == mnt.id)

        # memory mappings are matched by device
        use.update(("mmap/%s" % mmap.addr, mmap.path)
                   for mmap in proc.mmaps.values()
                   if mmap.dev == mnt.dev)

        # False is returned for a process that does not use the mount
        # (presumably query drops such processes - confirm against ProcDBConn.query)
        if not use:
            return False
        return (proc, use)

    return pdbc.query(_uses, eperm="warn")
# ---- Proc/Task/... ---- # ---- Proc/Task/... ----
# get retrieves process property with specified name. # get retrieves process property with specified name.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment