Commit 63c882a0 authored by Eric Paris's avatar Eric Paris

inotify: reimplement inotify using fsnotify

Reimplement inotify_user using fsnotify.  This should be feature for feature
exactly the same as the original inotify_user.  This does not make any changes
to the in kernel inotify feature used by audit.  Those patches (and the eventual
removal of in kernel inotify) will come after the new inotify_user proves to be
working correctly.
Signed-off-by: default avatarEric Paris <eparis@redhat.com>
Acked-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
parent 164bc619
...@@ -2858,6 +2858,8 @@ P: John McCutchan ...@@ -2858,6 +2858,8 @@ P: John McCutchan
M: john@johnmccutchan.com M: john@johnmccutchan.com
P: Robert Love P: Robert Love
M: rlove@rlove.org M: rlove@rlove.org
P: Eric Paris
M: eparis@parisplace.org
L: linux-kernel@vger.kernel.org L: linux-kernel@vger.kernel.org
S: Maintained S: Maintained
F: Documentation/filesystems/inotify.txt F: Documentation/filesystems/inotify.txt
......
config INOTIFY config INOTIFY
bool "Inotify file change notification support" bool "Inotify file change notification support"
default y default n
---help--- ---help---
Say Y here to enable inotify support. Inotify is a file change Say Y here to enable legacy in kernel inotify support. Inotify is a
notification system and a replacement for dnotify. Inotify fixes file change notification system. It is a replacement for dnotify.
numerous shortcomings in dnotify and introduces several new features This option only provides the legacy inotify in kernel API. There
including multiple file events, one-shot support, and unmount are no in tree kernel users of this interface since it is deprecated.
notification. You only need this if you are loading an out of tree kernel module
that uses inotify.
For more information, see <file:Documentation/filesystems/inotify.txt> For more information, see <file:Documentation/filesystems/inotify.txt>
If unsure, say Y. If unsure, say N.
config INOTIFY_USER config INOTIFY_USER
bool "Inotify support for userspace" bool "Inotify support for userspace"
depends on INOTIFY depends on FSNOTIFY
default y default y
---help--- ---help---
Say Y here to enable inotify support for userspace, including the Say Y here to enable inotify support for userspace, including the
associated system calls. Inotify allows monitoring of both files and associated system calls. Inotify allows monitoring of both files and
directories via a single open fd. Events are read from the file directories via a single open fd. Events are read from the file
descriptor, which is also select()- and poll()-able. descriptor, which is also select()- and poll()-able.
Inotify fixes numerous shortcomings in dnotify and introduces several
new features including multiple file events, one-shot support, and
unmount notification.
For more information, see <file:Documentation/filesystems/inotify.txt> For more information, see <file:Documentation/filesystems/inotify.txt>
......
obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
/*
 * Slab cache from which struct inotify_event_private_data objects are
 * allocated (kmem_cache_alloc) and to which they are returned
 * (kmem_cache_free, see inotify_free_event_priv()).
 */
extern struct kmem_cache *event_priv_cachep;
/*
 * Per-group private data that inotify attaches to a queued fsnotify event.
 * It carries the watch descriptor that must be reported to userspace for
 * that event.
 */
struct inotify_event_private_data {
/* generic part; container_of() on this member recovers the inotify struct */
struct fsnotify_event_private_data fsnotify_event_priv_data;
/* watch descriptor copied into the inotify_event handed to userspace */
int wd;
};
/*
 * inotify's wrapper around an fsnotify mark: the generic mark plus the
 * watch descriptor that identifies it in the group's idr.
 */
struct inotify_inode_mark_entry {
/*
 * fsnotify_mark_entry MUST be the first thing: inotify_free_mark() casts a
 * struct fsnotify_mark_entry pointer straight to this type.
 */
struct fsnotify_mark_entry fsn_entry;
/* watch descriptor; filled in by the idr allocation when the watch is added */
int wd;
};
/*
 * Tear down an inotify mark: queue IN_IGNORED for its wd, remove it from the
 * group's idr and drop the reference the idr held.
 */
extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
/* Return an inotify_event_private_data to event_priv_cachep. */
extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
/* Table of the inotify callbacks (handle_event, should_send_event, ...). */
extern const struct fsnotify_ops inotify_fsnotify_ops;
/*
* fs/inotify_user.c - inotify support for userspace
*
* Authors:
* John McCutchan <ttb@tentacle.dhs.org>
* Robert Love <rml@novell.com>
*
* Copyright (C) 2005 John McCutchan
* Copyright 2006 Hewlett-Packard Development Company, L.P.
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
* inotify was largely rewritten to make use of the fsnotify infrastructure
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2, or (at your option) any
* later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/fs.h> /* struct inode */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/path.h> /* struct path */
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
#include "inotify.h"
/*
 * inotify_handle_event - queue @event on @group for later delivery
 * @group: the inotify group (one per inotify fd) that should see the event
 * @event: the fsnotify event; event->to_tell is the inode being watched
 *
 * Looks up this group's mark on the inode to learn the watch descriptor,
 * wraps the wd in an inotify_event_private_data and hands both to
 * fsnotify_add_notify_event().
 *
 * Returns 0 on success (including when the mark has already gone away or
 * when the add reports -EEXIST), -ENOMEM if the private data could not be
 * allocated, or any other error returned by fsnotify_add_notify_event().
 */
static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
{
	struct fsnotify_mark_entry *entry;
	struct inotify_inode_mark_entry *ientry;
	struct inode *to_tell;
	struct inotify_event_private_data *event_priv;
	struct fsnotify_event_private_data *fsn_event_priv;
	int ret;

	to_tell = event->to_tell;

	/* a successful find returns the mark with a reference held */
	spin_lock(&to_tell->i_lock);
	entry = fsnotify_find_mark_entry(group, to_tell);
	spin_unlock(&to_tell->i_lock);
	/* race with watch removal?  We already passed should_send */
	if (unlikely(!entry))
		return 0;
	ientry = container_of(entry, struct inotify_inode_mark_entry,
			      fsn_entry);

	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
	if (unlikely(!event_priv)) {
		/* must still drop the reference taken by the find above */
		ret = -ENOMEM;
		goto out_put_mark;
	}

	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
	fsn_event_priv->group = group;
	event_priv->wd = ientry->wd;

	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
	/* EEXIST is not an error */
	if (ret == -EEXIST)
		ret = 0;

	/*
	 * did event_priv get attached?  If not, nobody else will free it.
	 * NOTE(review): this relies on fsnotify_add_notify_event() leaving
	 * event_list initialised even when it does not attach the private
	 * data - confirm against the fsnotify core.
	 */
	if (list_empty(&fsn_event_priv->event_list))
		inotify_free_event_priv(fsn_event_priv);

out_put_mark:
	/*
	 * If we hold the entry until after the event is on the queue
	 * IN_IGNORED won't be able to pass this event in the queue
	 */
	fsnotify_put_mark(entry);

	return ret;
}
/*
 * inotify_freeing_mark - .freeing_mark callback for inotify
 * @entry: the mark being torn down
 * @group: the group the mark belonged to
 *
 * Thin adapter: the work is done by inotify_destroy_mark_entry(), which is
 * declared in inotify.h.
 */
static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
{
	/* nothing callback-specific to do; forward both arguments unchanged */
	inotify_destroy_mark_entry(entry, group);
}
/*
 * inotify_should_send_event - does @group care about @mask on @inode?
 * @group: the inotify group being asked
 * @inode: the inode the event is about
 * @mask:  the event bits
 *
 * Returns true when the group has a mark on @inode whose mask shares at
 * least one bit with @mask, false otherwise (including when there is no
 * mark at all).
 *
 * NOTE(review): inotify_arg_to_mask() puts FS_EVENT_ON_CHILD into every
 * mark's mask.  If the caller can also set that bit in @mask, it alone would
 * make this test succeed - confirm whether it should be masked off here.
 */
static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
{
	struct fsnotify_mark_entry *mark;
	bool wanted;

	spin_lock(&inode->i_lock);
	mark = fsnotify_find_mark_entry(group, inode);
	spin_unlock(&inode->i_lock);

	/* no mark for this group on the inode: nothing to deliver */
	if (mark == NULL)
		return false;

	wanted = (mark->mask & mask) != 0;

	/* the lookup handed us a reference; give it back */
	fsnotify_put_mark(mark);

	return wanted;
}
/*
 * idr_callback - per-entry callback used by inotify_free_group_priv()
 * @id:   idr id of the entry (unused)
 * @p:    pointer stored in the idr (unused)
 * @data: cookie passed to idr_for_each() (unused)
 *
 * Any entry still present in the idr when the group's private data is being
 * freed is treated as a fatal bug.
 */
static int idr_callback(int id, void *p, void *data)
{
	/* being called at all means an entry was left behind in the idr */
	BUG();

	return 0;
}
/*
 * inotify_free_group_priv - release the inotify-specific parts of @group
 * @group: the group whose inotify_data.idr is to be emptied and destroyed
 */
static void inotify_free_group_priv(struct fsnotify_group *group)
{
	struct idr *wd_idr = &group->inotify_data.idr;

	/* ideally the idr is empty and we won't hit the BUG in the callback */
	idr_for_each(wd_idr, idr_callback, NULL);

	idr_remove_all(wd_idr);
	idr_destroy(wd_idr);
}
/*
 * inotify_free_event_priv - free inotify's per-event private data
 * @fsn_event_priv: the generic part embedded in an
 *                  inotify_event_private_data
 *
 * Recovers the enclosing inotify_event_private_data and returns it to
 * event_priv_cachep.
 */
void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
{
	kmem_cache_free(event_priv_cachep,
			container_of(fsn_event_priv,
				     struct inotify_event_private_data,
				     fsnotify_event_priv_data));
}
/*
 * The inotify callbacks defined above, gathered into a struct fsnotify_ops.
 * Declared extern in inotify.h.
 */
const struct fsnotify_ops inotify_fsnotify_ops = {
	/* event delivery */
	.should_send_event	= inotify_should_send_event,
	.handle_event		= inotify_handle_event,
	/* teardown */
	.freeing_mark		= inotify_freeing_mark,
	.free_event_priv	= inotify_free_event_priv,
	.free_group_priv	= inotify_free_group_priv,
};
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
* Copyright (C) 2005 John McCutchan * Copyright (C) 2005 John McCutchan
* Copyright 2006 Hewlett-Packard Development Company, L.P. * Copyright 2006 Hewlett-Packard Development Company, L.P.
* *
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
* inotify was largely rewritten to make use of the fsnotify infrastructure
*
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the * under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2, or (at your option) any * Free Software Foundation; either version 2, or (at your option) any
...@@ -19,94 +22,48 @@ ...@@ -19,94 +22,48 @@
* General Public License for more details. * General Public License for more details.
*/ */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/mount.h> #include <linux/fs.h> /* struct inode */
#include <linux/namei.h> #include <linux/fsnotify_backend.h>
#include <linux/poll.h> #include <linux/idr.h>
#include <linux/init.h> #include <linux/init.h> /* module_init */
#include <linux/list.h>
#include <linux/inotify.h> #include <linux/inotify.h>
#include <linux/kernel.h> /* roundup() */
#include <linux/magic.h> /* superblock magic number */
#include <linux/mount.h> /* mntget */
#include <linux/namei.h> /* LOOKUP_FOLLOW */
#include <linux/path.h> /* struct path */
#include <linux/sched.h> /* struct user */
#include <linux/slab.h> /* struct kmem_cache */
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/magic.h> #include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
#include <asm/ioctls.h> #include "inotify.h"
static struct kmem_cache *watch_cachep __read_mostly; #include <asm/ioctls.h>
static struct kmem_cache *event_cachep __read_mostly;
static struct vfsmount *inotify_mnt __read_mostly; static struct vfsmount *inotify_mnt __read_mostly;
/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
static struct inotify_event nul_inotify_event;
/* these are configurable via /proc/sys/fs/inotify/ */ /* these are configurable via /proc/sys/fs/inotify/ */
static int inotify_max_user_instances __read_mostly; static int inotify_max_user_instances __read_mostly;
static int inotify_max_user_watches __read_mostly;
static int inotify_max_queued_events __read_mostly; static int inotify_max_queued_events __read_mostly;
int inotify_max_user_watches __read_mostly;
/* static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
* Lock ordering: struct kmem_cache *event_priv_cachep __read_mostly;
* static struct fsnotify_event *inotify_ignored_event;
* inotify_dev->up_mutex (ensures we don't re-add the same watch)
* inode->inotify_mutex (protects inode's watch list)
* inotify_handle->mutex (protects inotify_handle's watch list)
* inotify_dev->ev_mutex (protects device's event queue)
*/
/*
* Lifetimes of the main data structures:
*
* inotify_device: Lifetime is managed by reference count, from
* sys_inotify_init() until release. Additional references can bump the count
* via get_inotify_dev() and drop the count via put_inotify_dev().
*
* inotify_user_watch: Lifetime is from create_watch() to the receipt of an
* IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
* first event, or to inotify_destroy().
*/
/* /*
* struct inotify_device - represents an inotify instance * When inotify registers a new group it increments this and uses that
* * value as an offset to set the fsnotify group "name" and priority.
* This structure is protected by the mutex 'mutex'.
*/ */
struct inotify_device { static atomic_t inotify_grp_num;
wait_queue_head_t wq; /* wait queue for i/o */
struct mutex ev_mutex; /* protects event queue */
struct mutex up_mutex; /* synchronizes watch updates */
struct list_head events; /* list of queued events */
struct user_struct *user; /* user who opened this dev */
struct inotify_handle *ih; /* inotify handle */
struct fasync_struct *fa; /* async notification */
atomic_t count; /* reference count */
unsigned int queue_size; /* size of the queue (bytes) */
unsigned int event_count; /* number of pending events */
unsigned int max_events; /* maximum number of events */
};
/*
* struct inotify_kernel_event - An inotify event, originating from a watch and
* queued for user-space. A list of these is attached to each instance of the
* device. In read(), this list is walked and all events that can fit in the
* buffer are returned.
*
* Protected by dev->ev_mutex of the device in which we are queued.
*/
struct inotify_kernel_event {
struct inotify_event event; /* the user-space event */
struct list_head list; /* entry in inotify_device's list */
char *name; /* filename, if any */
};
/*
* struct inotify_user_watch - our version of an inotify_watch, we add
* a reference to the associated inotify_device.
*/
struct inotify_user_watch {
struct inotify_device *dev; /* associated device */
struct inotify_watch wdata; /* inotify watch data */
};
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
...@@ -149,280 +106,36 @@ ctl_table inotify_table[] = { ...@@ -149,280 +106,36 @@ ctl_table inotify_table[] = {
}; };
#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SYSCTL */
static inline void get_inotify_dev(struct inotify_device *dev) static inline __u32 inotify_arg_to_mask(u32 arg)
{ {
atomic_inc(&dev->count); __u32 mask;
}
static inline void put_inotify_dev(struct inotify_device *dev) /* everything should accept their own ignored and cares about children */
{ mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
if (atomic_dec_and_test(&dev->count)) {
atomic_dec(&dev->user->inotify_devs);
free_uid(dev->user);
kfree(dev);
}
}
/* /* mask off the flags used to open the fd */
* free_inotify_user_watch - cleans up the watch and its references mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
*/
static void free_inotify_user_watch(struct inotify_watch *w)
{
struct inotify_user_watch *watch;
struct inotify_device *dev;
watch = container_of(w, struct inotify_user_watch, wdata);
dev = watch->dev;
atomic_dec(&dev->user->inotify_watches);
put_inotify_dev(dev);
kmem_cache_free(watch_cachep, watch);
}
/*
* kernel_event - create a new kernel event with the given parameters
*
* This function can sleep.
*/
static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
const char *name)
{
struct inotify_kernel_event *kevent;
kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
if (unlikely(!kevent))
return NULL;
/* we hand this out to user-space, so zero it just in case */
memset(&kevent->event, 0, sizeof(struct inotify_event));
kevent->event.wd = wd;
kevent->event.mask = mask;
kevent->event.cookie = cookie;
INIT_LIST_HEAD(&kevent->list);
if (name) {
size_t len, rem, event_size = sizeof(struct inotify_event);
/* return mask;
* We need to pad the filename so as to properly align an
* array of inotify_event structures. Because the structure is
* small and the common case is a small filename, we just round
* up to the next multiple of the structure's sizeof. This is
* simple and safe for all architectures.
*/
len = strlen(name) + 1;
rem = event_size - len;
if (len > event_size) {
rem = event_size - (len % event_size);
if (len % event_size == 0)
rem = 0;
}
kevent->name = kmalloc(len + rem, GFP_NOFS);
if (unlikely(!kevent->name)) {
kmem_cache_free(event_cachep, kevent);
return NULL;
}
memcpy(kevent->name, name, len);
if (rem)
memset(kevent->name + len, 0, rem);
kevent->event.len = len + rem;
} else {
kevent->event.len = 0;
kevent->name = NULL;
}
return kevent;
} }
/* static inline u32 inotify_mask_to_arg(__u32 mask)
* inotify_dev_get_event - return the next event in the given dev's queue
*
* Caller must hold dev->ev_mutex.
*/
static inline struct inotify_kernel_event *
inotify_dev_get_event(struct inotify_device *dev)
{ {
return list_entry(dev->events.next, struct inotify_kernel_event, list); return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
IN_Q_OVERFLOW);
} }
/* /* inotify userspace file descriptor functions */
* inotify_dev_get_last_event - return the last event in the given dev's queue
*
* Caller must hold dev->ev_mutex.
*/
static inline struct inotify_kernel_event *
inotify_dev_get_last_event(struct inotify_device *dev)
{
if (list_empty(&dev->events))
return NULL;
return list_entry(dev->events.prev, struct inotify_kernel_event, list);
}
/*
* inotify_dev_queue_event - event handler registered with core inotify, adds
* a new event to the given device
*
* Can sleep (calls kernel_event()).
*/
static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
u32 cookie, const char *name,
struct inode *ignored)
{
struct inotify_user_watch *watch;
struct inotify_device *dev;
struct inotify_kernel_event *kevent, *last;
watch = container_of(w, struct inotify_user_watch, wdata);
dev = watch->dev;
mutex_lock(&dev->ev_mutex);
/* we can safely put the watch as we don't reference it while
* generating the event
*/
if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
put_inotify_watch(w); /* final put */
/* coalescing: drop this event if it is a dupe of the previous */
last = inotify_dev_get_last_event(dev);
if (last && last->event.mask == mask && last->event.wd == wd &&
last->event.cookie == cookie) {
const char *lastname = last->name;
if (!name && !lastname)
goto out;
if (name && lastname && !strcmp(lastname, name))
goto out;
}
/* the queue overflowed and we already sent the Q_OVERFLOW event */
if (unlikely(dev->event_count > dev->max_events))
goto out;
/* if the queue overflows, we need to notify user space */
if (unlikely(dev->event_count == dev->max_events))
kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
else
kevent = kernel_event(wd, mask, cookie, name);
if (unlikely(!kevent))
goto out;
/* queue the event and wake up anyone waiting */
dev->event_count++;
dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
list_add_tail(&kevent->list, &dev->events);
wake_up_interruptible(&dev->wq);
kill_fasync(&dev->fa, SIGIO, POLL_IN);
out:
mutex_unlock(&dev->ev_mutex);
}
/*
* remove_kevent - cleans up the given kevent
*
* Caller must hold dev->ev_mutex.
*/
static void remove_kevent(struct inotify_device *dev,
struct inotify_kernel_event *kevent)
{
list_del(&kevent->list);
dev->event_count--;
dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
}
/*
* free_kevent - frees the given kevent.
*/
static void free_kevent(struct inotify_kernel_event *kevent)
{
kfree(kevent->name);
kmem_cache_free(event_cachep, kevent);
}
/*
* inotify_dev_event_dequeue - destroy an event on the given device
*
* Caller must hold dev->ev_mutex.
*/
static void inotify_dev_event_dequeue(struct inotify_device *dev)
{
if (!list_empty(&dev->events)) {
struct inotify_kernel_event *kevent;
kevent = inotify_dev_get_event(dev);
remove_kevent(dev, kevent);
free_kevent(kevent);
}
}
/*
* find_inode - resolve a user-given path to a specific inode
*/
static int find_inode(const char __user *dirname, struct path *path,
unsigned flags)
{
int error;
error = user_path_at(AT_FDCWD, dirname, flags, path);
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
error = inode_permission(path->dentry->d_inode, MAY_READ);
if (error)
path_put(path);
return error;
}
/*
* create_watch - creates a watch on the given device.
*
* Callers must hold dev->up_mutex.
*/
static int create_watch(struct inotify_device *dev, struct inode *inode,
u32 mask)
{
struct inotify_user_watch *watch;
int ret;
if (atomic_read(&dev->user->inotify_watches) >=
inotify_max_user_watches)
return -ENOSPC;
watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
if (unlikely(!watch))
return -ENOMEM;
/* save a reference to device and bump the count to make it official */
get_inotify_dev(dev);
watch->dev = dev;
atomic_inc(&dev->user->inotify_watches);
inotify_init_watch(&watch->wdata);
ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
if (ret < 0)
free_inotify_user_watch(&watch->wdata);
return ret;
}
/* Device Interface */
static unsigned int inotify_poll(struct file *file, poll_table *wait) static unsigned int inotify_poll(struct file *file, poll_table *wait)
{ {
struct inotify_device *dev = file->private_data; struct fsnotify_group *group = file->private_data;
int ret = 0; int ret = 0;
poll_wait(file, &dev->wq, wait); poll_wait(file, &group->notification_waitq, wait);
mutex_lock(&dev->ev_mutex); mutex_lock(&group->notification_mutex);
if (!list_empty(&dev->events)) if (!fsnotify_notify_queue_is_empty(group))
ret = POLLIN | POLLRDNORM; ret = POLLIN | POLLRDNORM;
mutex_unlock(&dev->ev_mutex); mutex_unlock(&group->notification_mutex);
return ret; return ret;
} }
...@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) ...@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
* enough to fit in "count". Return an error pointer if * enough to fit in "count". Return an error pointer if
* not large enough. * not large enough.
* *
* Called with the device ev_mutex held. * Called with the group->notification_mutex held.
*/ */
static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count) size_t count)
{ {
size_t event_size = sizeof(struct inotify_event); size_t event_size = sizeof(struct inotify_event);
struct inotify_kernel_event *kevent; struct fsnotify_event *event;
if (list_empty(&dev->events)) if (fsnotify_notify_queue_is_empty(group))
return NULL; return NULL;
kevent = inotify_dev_get_event(dev); event = fsnotify_peek_notify_event(group);
if (kevent->name)
event_size += kevent->event.len; event_size += roundup(event->name_len, event_size);
if (event_size > count) if (event_size > count)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
remove_kevent(dev, kevent); /* held the notification_mutex the whole time, so this is the
return kevent; * same event we peeked above */
fsnotify_remove_notify_event(group);
return event;
} }
/* /*
...@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, ...@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
* We already checked that the event size is smaller than the * We already checked that the event size is smaller than the
* buffer we had in "get_one_event()" above. * buffer we had in "get_one_event()" above.
*/ */
static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fsnotify_event *event,
char __user *buf) char __user *buf)
{ {
struct inotify_event inotify_event;
struct fsnotify_event_private_data *fsn_priv;
struct inotify_event_private_data *priv;
size_t event_size = sizeof(struct inotify_event); size_t event_size = sizeof(struct inotify_event);
size_t name_len;
/* we get the inotify watch descriptor from the event private data */
spin_lock(&event->lock);
fsn_priv = fsnotify_remove_priv_from_event(group, event);
spin_unlock(&event->lock);
if (!fsn_priv)
inotify_event.wd = -1;
else {
priv = container_of(fsn_priv, struct inotify_event_private_data,
fsnotify_event_priv_data);
inotify_event.wd = priv->wd;
inotify_free_event_priv(fsn_priv);
}
/* round up event->name_len so it is a multiple of event_size */
name_len = roundup(event->name_len, event_size);
inotify_event.len = name_len;
if (copy_to_user(buf, &kevent->event, event_size)) inotify_event.mask = inotify_mask_to_arg(event->mask);
inotify_event.cookie = event->sync_cookie;
/* send the main event */
if (copy_to_user(buf, &inotify_event, event_size))
return -EFAULT; return -EFAULT;
if (kevent->name) {
buf += event_size; buf += event_size;
if (copy_to_user(buf, kevent->name, kevent->event.len)) /*
* fsnotify only stores the pathname, so here we have to send the pathname
* and then pad that pathname out to a multiple of sizeof(inotify_event)
* with zeros. I get my zeros from the nul_inotify_event.
*/
if (name_len) {
unsigned int len_to_zero = name_len - event->name_len;
/* copy the path name */
if (copy_to_user(buf, event->file_name, event->name_len))
return -EFAULT; return -EFAULT;
buf += event->name_len;
event_size += kevent->event.len; /* fill userspace with 0's from nul_inotify_event */
if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
return -EFAULT;
buf += len_to_zero;
event_size += name_len;
} }
return event_size; return event_size;
} }
static ssize_t inotify_read(struct file *file, char __user *buf, static ssize_t inotify_read(struct file *file, char __user *buf,
size_t count, loff_t *pos) size_t count, loff_t *pos)
{ {
struct inotify_device *dev; struct fsnotify_group *group;
struct fsnotify_event *kevent;
char __user *start; char __user *start;
int ret; int ret;
DEFINE_WAIT(wait); DEFINE_WAIT(wait);
start = buf; start = buf;
dev = file->private_data; group = file->private_data;
while (1) { while (1) {
struct inotify_kernel_event *kevent; prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
mutex_lock(&dev->ev_mutex); mutex_lock(&group->notification_mutex);
kevent = get_one_event(dev, count); kevent = get_one_event(group, count);
mutex_unlock(&dev->ev_mutex); mutex_unlock(&group->notification_mutex);
if (kevent) { if (kevent) {
ret = PTR_ERR(kevent); ret = PTR_ERR(kevent);
if (IS_ERR(kevent)) if (IS_ERR(kevent))
break; break;
ret = copy_event_to_user(kevent, buf); ret = copy_event_to_user(group, kevent, buf);
free_kevent(kevent); fsnotify_put_event(kevent);
if (ret < 0) if (ret < 0)
break; break;
buf += ret; buf += ret;
...@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, ...@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
schedule(); schedule();
} }
finish_wait(&dev->wq, &wait); finish_wait(&group->notification_waitq, &wait);
if (start != buf && ret != -EFAULT) if (start != buf && ret != -EFAULT)
ret = buf - start; ret = buf - start;
return ret; return ret;
...@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf, ...@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
static int inotify_fasync(int fd, struct file *file, int on) static int inotify_fasync(int fd, struct file *file, int on)
{ {
struct inotify_device *dev = file->private_data; struct fsnotify_group *group = file->private_data;
return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
} }
static int inotify_release(struct inode *ignored, struct file *file) static int inotify_release(struct inode *ignored, struct file *file)
{ {
struct inotify_device *dev = file->private_data; struct fsnotify_group *group = file->private_data;
inotify_destroy(dev->ih);
/* destroy all of the events on this device */ fsnotify_clear_marks_by_group(group);
mutex_lock(&dev->ev_mutex);
while (!list_empty(&dev->events))
inotify_dev_event_dequeue(dev);
mutex_unlock(&dev->ev_mutex);
/* free this device: the put matching the get in inotify_init() */ /* free this group, matching get was inotify_init->fsnotify_obtain_group */
put_inotify_dev(dev); fsnotify_put_group(group);
return 0; return 0;
} }
...@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file) ...@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
static long inotify_ioctl(struct file *file, unsigned int cmd, static long inotify_ioctl(struct file *file, unsigned int cmd,
unsigned long arg) unsigned long arg)
{ {
struct inotify_device *dev; struct fsnotify_group *group;
struct fsnotify_event_holder *holder;
struct fsnotify_event *event;
void __user *p; void __user *p;
int ret = -ENOTTY; int ret = -ENOTTY;
size_t send_len = 0;
dev = file->private_data; group = file->private_data;
p = (void __user *) arg; p = (void __user *) arg;
switch (cmd) { switch (cmd) {
case FIONREAD: case FIONREAD:
ret = put_user(dev->queue_size, (int __user *) p); mutex_lock(&group->notification_mutex);
list_for_each_entry(holder, &group->notification_list, event_list) {
event = holder->event;
send_len += sizeof(struct inotify_event);
send_len += roundup(event->name_len,
sizeof(struct inotify_event));
}
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
break; break;
} }
...@@ -584,15 +344,225 @@ static const struct file_operations inotify_fops = { ...@@ -584,15 +344,225 @@ static const struct file_operations inotify_fops = {
.compat_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl,
}; };
static const struct inotify_operations inotify_user_ops = {
.handle_event = inotify_dev_queue_event,
.destroy_watch = free_inotify_user_watch,
};
/*
 * inotify_find_inode - resolve a user-given path to a specific inode
 * @dirname: userspace pointer to the path name
 * @path:    filled in with the result of the lookup
 * @flags:   lookup flags, passed through to user_path_at()
 *
 * Returns 0 with @path holding the lookup result, or the error from
 * user_path_at() or inode_permission().  On a permission failure @path has
 * already been released with path_put().
 */
static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
{
	int ret = user_path_at(AT_FDCWD, dirname, flags, path);

	if (ret)
		return ret;

	/* you can only watch an inode if you have read permissions on it */
	ret = inode_permission(path->dentry->d_inode, MAY_READ);
	if (!ret)
		return 0;

	path_put(path);
	return ret;
}
/*
 * When, for whatever reason, inotify is done with a mark (or what used to be a
 * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
 * for the given wd.
 *
 * There is a bit of recursion here.  The loop looks like:
 * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
 * inotify_freeing_mark -> inotify_destroy_mark_entry -> restart
 * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
 * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
 * test below will not call back to fsnotify again.  But even if that test wasn't
 * there this would still be safe since fsnotify_destroy_mark_by_entry() is
 * safe from recursion.
 *
 * @entry: the mark to tear down
 * @group: the inotify group the mark belongs (or belonged) to
 *
 * On the second pass (entry->group already NULL) this queues the IN_IGNORED
 * event, removes the wd from the group's idr, gives back the per-user watch
 * count taken in inotify_update_watch() and drops the reference the idr held
 * on the mark.
 */
void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
{
	struct inotify_inode_mark_entry *ientry;
	struct inotify_event_private_data *event_priv;
	struct fsnotify_event_private_data *fsn_event_priv;
	struct fsnotify_group *egroup;
	struct idr *idr;

	spin_lock(&entry->lock);
	egroup = entry->group;

	/* if egroup we aren't really done and something might still send events
	 * for this inode, on the callback we'll send the IN_IGNORED */
	if (egroup) {
		spin_unlock(&entry->lock);
		fsnotify_destroy_mark_by_entry(entry);
		return;
	}
	spin_unlock(&entry->lock);

	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);

	/* failing to allocate only costs us the IN_IGNORED, not the teardown */
	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
	if (unlikely(!event_priv))
		goto skip_send_ignore;

	fsn_event_priv = &event_priv->fsnotify_event_priv_data;

	fsn_event_priv->group = group;
	event_priv->wd = ientry->wd;

	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);

	/*
	 * did the private data get added?  If not, free it ourselves.
	 * NOTE(review): relies on fsnotify_add_notify_event() leaving
	 * event_list initialised when it does not attach - confirm.
	 */
	if (list_empty(&fsn_event_priv->event_list))
		inotify_free_event_priv(fsn_event_priv);

skip_send_ignore:

	/* remove this entry from the idr */
	spin_lock(&group->inotify_data.idr_lock);
	idr = &group->inotify_data.idr;
	idr_remove(idr, ientry->wd);
	spin_unlock(&group->inotify_data.idr_lock);

	/*
	 * The watch is gone, so give back the slot that inotify_update_watch()
	 * charged to the user; otherwise the count only ever grows and the
	 * user eventually hits inotify_max_user_watches with no live watches.
	 * NOTE(review): assumes every mark that reaches this point was counted
	 * - confirm for the error path where the idr allocation failed.
	 */
	atomic_dec(&group->inotify_data.user->inotify_watches);

	/* removed from idr, drop that reference */
	fsnotify_put_mark(entry);
}
/* ding dong the mark is dead */
static void inotify_free_mark(struct fsnotify_mark_entry *entry)
{
struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
kmem_cache_free(inotify_inode_mark_cachep, ientry);
}
/*
 * Create an inotify mark for @inode in @group, or update the mask of the mark
 * that is already there.
 *
 * @arg is the user supplied mask.  With IN_MASK_ADD the new bits are OR-ed
 * into an existing mark's mask, otherwise the mask is replaced.
 *
 * Returns the watch descriptor of the mark on success.  On failure returns
 * -EINVAL when @arg maps to an empty mask, -ENOMEM when an allocation fails,
 * -ENOSPC when the user already has inotify_max_user_watches watches, or the
 * error reported by fsnotify_add_mark() / idr_get_new_above().
 */
static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
{
	struct fsnotify_mark_entry *entry = NULL;
	struct inotify_inode_mark_entry *ientry;
	int ret = 0;
	int add = (arg & IN_MASK_ADD);
	int found = 0;
	__u32 mask;
	__u32 old_mask, new_mask;

	/* don't allow invalid bits: we don't want flags set */
	mask = inotify_arg_to_mask(arg);
	if (unlikely(!mask))
		return -EINVAL;

	/* allocated up front; thrown away again if a mark already exists */
	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
	if (unlikely(!ientry))
		return -ENOMEM;
	/* we set the mask at the end after attaching it */
	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
	ientry->wd = 0;

find_entry:
	spin_lock(&inode->i_lock);
	entry = fsnotify_find_mark_entry(group, inode);
	spin_unlock(&inode->i_lock);
	if (entry) {
		/* existing mark: the fresh allocation is not needed */
		kmem_cache_free(inotify_inode_mark_cachep, ientry);
		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
		/* remember that we hold the lookup's reference on entry */
		found = 1;
	} else {
		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
			ret = -ENOSPC;
			goto out_err;
		}

		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
		/* somebody else attached a mark first; go pick theirs up */
		if (ret == -EEXIST)
			goto find_entry;
		else if (ret)
			goto out_err;

		entry = &ientry->fsn_entry;
retry:
		ret = -ENOMEM;
		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
			goto out_err;

		spin_lock(&group->inotify_data.idr_lock);
		/* if entry is added to the idr we keep the reference obtained
		 * through fsnotify_mark_add.  remember to drop this reference
		 * when entry is removed from idr */
		ret = idr_get_new_above(&group->inotify_data.idr, entry,
					++group->inotify_data.last_wd,
					&ientry->wd);
		spin_unlock(&group->inotify_data.idr_lock);
		if (ret) {
			if (ret == -EAGAIN)
				goto retry;
			goto out_err;
		}
		atomic_inc(&group->inotify_data.user->inotify_watches);
	}

	spin_lock(&entry->lock);
	old_mask = entry->mask;
	if (add) {
		entry->mask |= mask;
		new_mask = entry->mask;
	} else {
		entry->mask = mask;
		new_mask = entry->mask;
	}
	spin_unlock(&entry->lock);

	if (old_mask != new_mask) {
		/* more bits in old than in new? */
		int dropped = (old_mask & ~new_mask);
		/* more bits in this entry than the inode's mask? */
		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
		/* more bits in this entry than the group? */
		int do_group = (new_mask & ~group->mask);

		/* update the inode with this new entry */
		if (dropped || do_inode)
			fsnotify_recalc_inode_mask(inode);

		/* update the group mask with the new mask */
		if (dropped || do_group)
			fsnotify_recalc_group_mask(group);
	}

	/* read the wd before we may give up our reference on the mark */
	ret = ientry->wd;

	/*
	 * Match the get from fsnotify_find_mark_entry().  Without this every
	 * update of an existing watch pins the mark forever and it can never
	 * be freed after the watch is removed.
	 */
	if (found)
		fsnotify_put_mark(entry);

	return ret;

out_err:
	if (entry) {
		/*
		 * The mark made it onto the inode but not into the idr.  This
		 * isn't supposed to happen, just kill the watch.
		 *
		 * NOTE(review): fsnotify_destroy_mark_by_entry() appears to
		 * call back into inotify_destroy_mark_entry(), which itself
		 * puts the mark; confirm the put below is still balanced.
		 */
		fsnotify_destroy_mark_by_entry(entry);
		fsnotify_put_mark(entry);
	} else {
		/*
		 * We failed before the new mark was attached anywhere
		 * (-ENOSPC or fsnotify_add_mark() error), so nothing else
		 * knows about ientry; free it here or it is leaked.
		 */
		kmem_cache_free(inotify_inode_mark_cachep, ientry);
	}
	return ret;
}
/*
 * Obtain a fresh fsnotify group for one inotify instance and set up its
 * inotify private data.
 *
 * @user is stored in the group's inotify data and @max_events becomes the
 * group's max_events.  Returns the group, or the ERR_PTR value handed back by
 * fsnotify_obtain_group().
 */
static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
{
	struct fsnotify_group *grp;
	unsigned int num;

	/* each instance gets its own group number, counting down from INOTIFY_GROUP_NUM */
	num = INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num);

	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
	grp = fsnotify_obtain_group(num, 0, &inotify_fsnotify_ops);
	if (!IS_ERR(grp)) {
		grp->max_events = max_events;

		/* watch descriptor bookkeeping */
		spin_lock_init(&grp->inotify_data.idr_lock);
		idr_init(&grp->inotify_data.idr);
		grp->inotify_data.last_wd = 0;

		grp->inotify_data.user = user;
		grp->inotify_data.fa = NULL;
	}

	return grp;
}
/* inotify syscalls */
SYSCALL_DEFINE1(inotify_init1, int, flags) SYSCALL_DEFINE1(inotify_init1, int, flags)
{ {
struct inotify_device *dev; struct fsnotify_group *group;
struct inotify_handle *ih;
struct user_struct *user; struct user_struct *user;
struct file *filp; struct file *filp;
int fd, ret; int fd, ret;
...@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) ...@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
goto out_free_uid; goto out_free_uid;
} }
dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
if (unlikely(!dev)) { group = inotify_new_group(user, inotify_max_queued_events);
ret = -ENOMEM; if (IS_ERR(group)) {
ret = PTR_ERR(group);
goto out_free_uid; goto out_free_uid;
} }
ih = inotify_init(&inotify_user_ops);
if (IS_ERR(ih)) {
ret = PTR_ERR(ih);
goto out_free_dev;
}
dev->ih = ih;
dev->fa = NULL;
filp->f_op = &inotify_fops; filp->f_op = &inotify_fops;
filp->f_path.mnt = mntget(inotify_mnt); filp->f_path.mnt = mntget(inotify_mnt);
filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_path.dentry = dget(inotify_mnt->mnt_root);
filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
filp->f_mode = FMODE_READ; filp->f_mode = FMODE_READ;
filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
filp->private_data = dev; filp->private_data = group;
INIT_LIST_HEAD(&dev->events);
init_waitqueue_head(&dev->wq);
mutex_init(&dev->ev_mutex);
mutex_init(&dev->up_mutex);
dev->event_count = 0;
dev->queue_size = 0;
dev->max_events = inotify_max_queued_events;
dev->user = user;
atomic_set(&dev->count, 0);
get_inotify_dev(dev);
atomic_inc(&user->inotify_devs); atomic_inc(&user->inotify_devs);
fd_install(fd, filp); fd_install(fd, filp);
return fd; return fd;
out_free_dev:
kfree(dev);
out_free_uid: out_free_uid:
free_uid(user); free_uid(user);
put_filp(filp); put_filp(filp);
...@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init) ...@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init)
SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
u32, mask) u32, mask)
{ {
struct fsnotify_group *group;
struct inode *inode; struct inode *inode;
struct inotify_device *dev;
struct path path; struct path path;
struct file *filp; struct file *filp;
int ret, fput_needed; int ret, fput_needed;
...@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, ...@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (mask & IN_ONLYDIR) if (mask & IN_ONLYDIR)
flags |= LOOKUP_DIRECTORY; flags |= LOOKUP_DIRECTORY;
ret = find_inode(pathname, &path, flags); ret = inotify_find_inode(pathname, &path, flags);
if (unlikely(ret)) if (ret)
goto fput_and_out; goto fput_and_out;
/* inode held in place by reference to path; dev by fget on fd */ /* inode held in place by reference to path; group by fget on fd */
inode = path.dentry->d_inode; inode = path.dentry->d_inode;
dev = filp->private_data; group = filp->private_data;
mutex_lock(&dev->up_mutex); /* create/update an inode mark */
ret = inotify_find_update_watch(dev->ih, inode, mask); ret = inotify_update_watch(group, inode, mask);
if (ret == -ENOENT) if (unlikely(ret))
ret = create_watch(dev, inode, mask); goto path_put_and_out;
mutex_unlock(&dev->up_mutex);
path_put_and_out:
path_put(&path); path_put(&path);
fput_and_out: fput_and_out:
fput_light(filp, fput_needed); fput_light(filp, fput_needed);
...@@ -720,9 +672,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, ...@@ -720,9 +672,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
{ {
struct fsnotify_group *group;
struct fsnotify_mark_entry *entry;
struct file *filp; struct file *filp;
struct inotify_device *dev; int ret = 0, fput_needed;
int ret, fput_needed;
filp = fget_light(fd, &fput_needed); filp = fget_light(fd, &fput_needed);
if (unlikely(!filp)) if (unlikely(!filp))
...@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) ...@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
goto out; goto out;
} }
dev = filp->private_data; group = filp->private_data;
/* we free our watch data when we get IN_IGNORED */ spin_lock(&group->inotify_data.idr_lock);
ret = inotify_rm_wd(dev->ih, wd); entry = idr_find(&group->inotify_data.idr, wd);
if (unlikely(!entry)) {
spin_unlock(&group->inotify_data.idr_lock);
ret = -EINVAL;
goto out;
}
fsnotify_get_mark(entry);
spin_unlock(&group->inotify_data.idr_lock);
inotify_destroy_mark_entry(entry, group);
fsnotify_put_mark(entry);
out: out:
fput_light(filp, fput_needed); fput_light(filp, fput_needed);
...@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void) ...@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void)
if (IS_ERR(inotify_mnt)) if (IS_ERR(inotify_mnt))
panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
if (!inotify_ignored_event)
panic("unable to allocate the inotify ignored event\n");
inotify_max_queued_events = 16384; inotify_max_queued_events = 16384;
inotify_max_user_instances = 128; inotify_max_user_instances = 128;
inotify_max_user_watches = 8192; inotify_max_user_watches = 8192;
watch_cachep = kmem_cache_create("inotify_watch_cache",
sizeof(struct inotify_user_watch),
0, SLAB_PANIC, NULL);
event_cachep = kmem_cache_create("inotify_event_cache",
sizeof(struct inotify_kernel_event),
0, SLAB_PANIC, NULL);
return 0; return 0;
} }
module_init(inotify_user_setup); module_init(inotify_user_setup);
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */ #include <linux/fs.h> /* struct inode */
#include <linux/list.h> #include <linux/list.h>
#include <linux/path.h> /* struct path */ #include <linux/path.h> /* struct path */
...@@ -59,6 +60,7 @@ ...@@ -59,6 +60,7 @@
/* listeners that hard code group numbers near the top */ /* listeners that hard code group numbers near the top */
#define DNOTIFY_GROUP_NUM UINT_MAX #define DNOTIFY_GROUP_NUM UINT_MAX
#define INOTIFY_GROUP_NUM (DNOTIFY_GROUP_NUM-1)
struct fsnotify_group; struct fsnotify_group;
struct fsnotify_event; struct fsnotify_event;
...@@ -141,6 +143,15 @@ struct fsnotify_group { ...@@ -141,6 +143,15 @@ struct fsnotify_group {
/* groups can define private fields here or use the void *private */ /* groups can define private fields here or use the void *private */
union { union {
void *private; void *private;
#ifdef CONFIG_INOTIFY_USER
struct inotify_group_private_data {
spinlock_t idr_lock;
struct idr idr;
u32 last_wd;
struct fasync_struct *fa; /* async notification */
struct user_struct *user;
} inotify_data;
#endif
}; };
}; };
......
...@@ -302,7 +302,8 @@ config AUDITSYSCALL ...@@ -302,7 +302,8 @@ config AUDITSYSCALL
config AUDIT_TREE config AUDIT_TREE
def_bool y def_bool y
depends on AUDITSYSCALL && INOTIFY depends on AUDITSYSCALL
select INOTIFY
menu "RCU Subsystem" menu "RCU Subsystem"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment