Commit 2b75869b authored by NeilBrown, committed by Greg Kroah-Hartman

sysfs/kernfs: allow attributes to request write buffer be pre-allocated.

md/raid allows metadata management to be performed in user-space.
At various times, particularly on device failure, the metadata needs
to be updated before further writes can be permitted.
This means that the user-space program which updates the metadata must
not block on writeout, and so must not allocate memory.

mlockall(MCL_CURRENT|MCL_FUTURE) and pre-allocation can avoid all
memory allocation issues for user-space memory (a sketch follows this
paragraph), but that does not help kernel memory.
Several kernel objects can be pre-allocated, e.g. files can be opened
before any writes to the array are permitted.
However, some kernel allocations happen in places that cannot be
pre-allocated.
In particular, writes to sysfs files (to tell md that it can now
allow writes to the array) allocate a buffer using GFP_KERNEL.
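
For the user-space side, a minimal sketch of that pattern; the program
below is illustrative only and is not part of this patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    int main(void)
    {
            /*
             * Lock all current and future pages so the metadata
             * manager never page-faults, and therefore never needs
             * the kernel to allocate user memory, while the array
             * is waiting for a metadata update.
             */
            if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
                    perror("mlockall");
                    return EXIT_FAILURE;
            }
            /*
             * Open sysfs files and pre-allocate user-space buffers
             * here, before any writes to the array are permitted.
             */
            return 0;
    }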

This patch allows attributes to be marked as "PREALLOC".  In that case
the maximal buffer is allocated when the file is opened, and then used
on each write instead of allocating a new buffer.

As the same buffer is now shared for all writes on the same file
description, the mutex is extended to cover full use of the buffer
including the copy_from_user().

The new __ATTR_PREALLOC() or's a new flag into the 'mode', which is
inspected by sysfs_add_file_mode_ns() to determine if the file should be
marked as requiring prealloc.
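
For illustration, a minimal sketch of an attribute that requests a
pre-allocated write buffer; the "foo" attribute, its value, and its
handlers are hypothetical and not part of this patch:

    #include <linux/kernel.h>
    #include <linux/kobject.h>
    #include <linux/sysfs.h>

    static int foo_value;

    static ssize_t foo_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
    {
            return sprintf(buf, "%d\n", foo_value);
    }

    static ssize_t foo_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
    {
            /* parse buf here; the write path allocates no memory */
            return count;
    }

    /*
     * SYSFS_PREALLOC is or'ed into .mode, so every write to "foo"
     * reuses the buffer allocated when the file was opened.
     */
    static struct kobj_attribute foo_attr =
            __ATTR_PREALLOC(foo, 0644, foo_show, foo_store);

The attribute is then registered as usual, e.g. with
sysfs_create_file(kobj, &foo_attr.attr).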

Despite the comment, we *do* use ->seq_show together with ->prealloc
in this patch.  The next patch fixes that.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 09368960
@@ -106,7 +106,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
 	const struct kernfs_ops *ops;
 
 	/*
-	 * @of->mutex nests outside active ref and is just to ensure that
+	 * @of->mutex nests outside active ref and is primarily to ensure that
 	 * the ops aren't called concurrently for the same open file.
 	 */
 	mutex_lock(&of->mutex);
@@ -194,7 +194,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
 		return -ENOMEM;
 
 	/*
-	 * @of->mutex nests outside active ref and is just to ensure that
+	 * @of->mutex nests outside active ref and is primarily to ensure that
 	 * the ops aren't called concurrently for the same open file.
 	 */
 	mutex_lock(&of->mutex);
@@ -278,19 +278,16 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 		len = min_t(size_t, count, PAGE_SIZE);
 	}
 
-	buf = kmalloc(len + 1, GFP_KERNEL);
+	buf = of->prealloc_buf;
+	if (!buf)
+		buf = kmalloc(len + 1, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
-	if (copy_from_user(buf, user_buf, len)) {
-		len = -EFAULT;
-		goto out_free;
-	}
-	buf[len] = '\0';	/* guarantee string termination */
-
 	/*
-	 * @of->mutex nests outside active ref and is just to ensure that
-	 * the ops aren't called concurrently for the same open file.
+	 * @of->mutex nests outside active ref and is used both to ensure that
+	 * the ops aren't called concurrently for the same open file, and
+	 * to provide exclusive access to ->prealloc_buf (when that exists).
 	 */
 	mutex_lock(&of->mutex);
 	if (!kernfs_get_active(of->kn)) {
@@ -299,18 +296,26 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 		goto out_free;
 	}
 
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_unlock;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
 	ops = kernfs_ops(of->kn);
 	if (ops->write)
 		len = ops->write(of, buf, len, *ppos);
 	else
 		len = -EINVAL;
 
-	kernfs_put_active(of->kn);
-	mutex_unlock(&of->mutex);
-
 	if (len > 0)
 		*ppos += len;
+
+out_unlock:
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
 out_free:
-	kfree(buf);
+	if (buf != of->prealloc_buf)
+		kfree(buf);
 	return len;
 }
@@ -685,6 +690,14 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	 */
 	of->atomic_write_len = ops->atomic_write_len;
 
+	if (ops->prealloc) {
+		int len = of->atomic_write_len ?: PAGE_SIZE;
+		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
+		error = -ENOMEM;
+		if (!of->prealloc_buf)
+			goto err_free;
+	}
+
 	/*
 	 * Always instantiate seq_file even if read access doesn't use
 	 * seq_file or is not requested.  This unifies private data access
@@ -715,6 +728,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 err_close:
 	seq_release(inode, file);
 err_free:
+	kfree(of->prealloc_buf);
 	kfree(of);
 err_out:
 	kernfs_put_active(kn);
@@ -728,6 +742,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
 	kernfs_put_open_node(kn, of);
 	seq_release(inode, filp);
+	kfree(of->prealloc_buf);
 	kfree(of);
 
 	return 0;
...
@@ -184,6 +184,17 @@ static const struct kernfs_ops sysfs_file_kfops_rw = {
 	.write		= sysfs_kf_write,
 };
 
+static const struct kernfs_ops sysfs_prealloc_kfops_wo = {
+	.write		= sysfs_kf_write,
+	.prealloc	= true,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
+	.seq_show	= sysfs_kf_seq_show,
+	.write		= sysfs_kf_write,
+	.prealloc	= true,
+};
+
 static const struct kernfs_ops sysfs_bin_kfops_ro = {
 	.read		= sysfs_kf_bin_read,
 };
@@ -222,13 +233,19 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 			  kobject_name(kobj)))
 			return -EINVAL;
 
-		if (sysfs_ops->show && sysfs_ops->store)
-			ops = &sysfs_file_kfops_rw;
-		else if (sysfs_ops->show)
+		if (sysfs_ops->show && sysfs_ops->store) {
+			if (mode & SYSFS_PREALLOC)
+				ops = &sysfs_prealloc_kfops_rw;
+			else
+				ops = &sysfs_file_kfops_rw;
+		} else if (sysfs_ops->show)
 			ops = &sysfs_file_kfops_ro;
-		else if (sysfs_ops->store)
-			ops = &sysfs_file_kfops_wo;
-		else
+		else if (sysfs_ops->store) {
+			if (mode & SYSFS_PREALLOC)
+				ops = &sysfs_prealloc_kfops_wo;
+			else
+				ops = &sysfs_file_kfops_wo;
+		} else
 			ops = &sysfs_file_kfops_empty;
 
 		size = PAGE_SIZE;
@@ -253,7 +270,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	if (!attr->ignore_lockdep)
 		key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
-	kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
+	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
 				  (void *)attr, ns, true, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
...
@@ -179,6 +179,7 @@ struct kernfs_open_file {
 	struct mutex		mutex;
 	int			event;
 	struct list_head	list;
+	char			*prealloc_buf;
 
 	size_t			atomic_write_len;
 	bool			mmapped;
@@ -214,6 +215,13 @@ struct kernfs_ops {
 	 * larger ones are rejected with -E2BIG.
 	 */
 	size_t atomic_write_len;
+	/*
+	 * "prealloc" causes a buffer to be allocated at open for
+	 * all read/write requests.  As ->seq_show uses seq_read()
+	 * which does its own allocation, it is incompatible with
+	 * ->prealloc.  Provide ->read and ->write with ->prealloc.
+	 */
+	bool prealloc;
 
 	ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
 			 loff_t off);
...
@@ -70,6 +70,8 @@ struct attribute_group {
  * for examples..
  */
 
+#define SYSFS_PREALLOC 010000
+
 #define __ATTR(_name, _mode, _show, _store) {				\
 	.attr = {.name = __stringify(_name),				\
 		 .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },		\
@@ -77,6 +79,13 @@ struct attribute_group {
 	.store	= _store,						\
 }
 
+#define __ATTR_PREALLOC(_name, _mode, _show, _store) {			\
+	.attr = {.name = __stringify(_name),				\
+		 .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\
+	.show	= _show,						\
+	.store	= _store,						\
+}
+
 #define __ATTR_RO(_name) {						\
 	.attr	= { .name = __stringify(_name), .mode = S_IRUGO },	\
 	.show	= _name##_show,						\
...