Commit a06247c6 authored by Suren Baghdasaryan's avatar Suren Baghdasaryan Committed by Peter Zijlstra

psi: Fix uaf issue when psi trigger is destroyed while being polled

With write operation on psi files replacing old trigger with a new one,
the lifetime of its waitqueue is totally arbitrary. Overwriting an
existing trigger causes its waitqueue to be freed and pending poll()
will stumble on trigger->event_wait which was destroyed.
Fix this by disallowing to redefine an existing psi trigger. If a write
operation is used on a file descriptor with an already existing psi
trigger, the operation will fail with EBUSY error.
Also bypass a check for psi_disabled in the psi_trigger_destroy as the
flag can be flipped after the trigger is created, leading to a memory
leak.

Fixes: 0e94682b ("psi: introduce psi monitor")
Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
Suggested-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
Analyzed-by: default avatarEric Biggers <ebiggers@kernel.org>
Signed-off-by: default avatarSuren Baghdasaryan <surenb@google.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: default avatarEric Biggers <ebiggers@google.com>
Acked-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com
parent fb3b0673
...@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger ...@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger
for the same psi metric can be specified. However for each trigger a separate for the same psi metric can be specified. However for each trigger a separate
file descriptor is required to be able to poll it separately from others, file descriptor is required to be able to poll it separately from others,
therefore for each trigger a separate open() syscall should be made even therefore for each trigger a separate open() syscall should be made even
when opening the same psi interface file. when opening the same psi interface file. Write operations to a file descriptor
with an already existing psi trigger will fail with EBUSY.
Monitors activate only when system enters stall state for the monitored Monitors activate only when system enters stall state for the monitored
psi metric and deactivates upon exit from the stall state. While system is psi metric and deactivates upon exit from the stall state. While system is
......
...@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); ...@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
struct psi_trigger *psi_trigger_create(struct psi_group *group, struct psi_trigger *psi_trigger_create(struct psi_group *group,
char *buf, size_t nbytes, enum psi_res res); char *buf, size_t nbytes, enum psi_res res);
void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
poll_table *wait); poll_table *wait);
......
...@@ -141,9 +141,6 @@ struct psi_trigger { ...@@ -141,9 +141,6 @@ struct psi_trigger {
* events to one per window * events to one per window
*/ */
u64 last_event_time; u64 last_event_time;
/* Refcounting to prevent premature destruction */
struct kref refcount;
}; };
struct psi_group { struct psi_group {
......
...@@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, ...@@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp); cgroup_get(cgrp);
cgroup_kn_unlock(of->kn); cgroup_kn_unlock(of->kn);
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
return -EBUSY;
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res); new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) { if (IS_ERR(new)) {
...@@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, ...@@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new); return PTR_ERR(new);
} }
psi_trigger_replace(&ctx->psi.trigger, new); smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp); cgroup_put(cgrp);
return nbytes; return nbytes;
...@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) ...@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{ {
struct cgroup_file_ctx *ctx = of->priv; struct cgroup_file_ctx *ctx = of->priv;
psi_trigger_replace(&ctx->psi.trigger, NULL); psi_trigger_destroy(ctx->psi.trigger);
} }
bool cgroup_psi_enabled(void) bool cgroup_psi_enabled(void)
......
...@@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, ...@@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0; t->event = 0;
t->last_event_time = 0; t->last_event_time = 0;
init_waitqueue_head(&t->event_wait); init_waitqueue_head(&t->event_wait);
kref_init(&t->refcount);
mutex_lock(&group->trigger_lock); mutex_lock(&group->trigger_lock);
...@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, ...@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return t; return t;
} }
static void psi_trigger_destroy(struct kref *ref) void psi_trigger_destroy(struct psi_trigger *t)
{ {
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); struct psi_group *group;
struct psi_group *group = t->group;
struct task_struct *task_to_destroy = NULL; struct task_struct *task_to_destroy = NULL;
if (static_branch_likely(&psi_disabled)) /*
* We do not check psi_disabled since it might have been disabled after
* the trigger got created.
*/
if (!t)
return; return;
group = t->group;
/* /*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted * Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process. * from under a polling process.
...@@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref) ...@@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref)
mutex_unlock(&group->trigger_lock); mutex_unlock(&group->trigger_lock);
/* /*
* Wait for both *trigger_ptr from psi_trigger_replace and * Wait for psi_schedule_poll_work RCU to complete its read-side
* poll_task RCUs to complete their read-side critical sections * critical section before destroying the trigger and optionally the
* before destroying the trigger and optionally the poll_task * poll_task.
*/ */
synchronize_rcu(); synchronize_rcu();
/* /*
...@@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref) ...@@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref)
kfree(t); kfree(t);
} }
void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
{
struct psi_trigger *old = *trigger_ptr;
if (static_branch_likely(&psi_disabled))
return;
rcu_assign_pointer(*trigger_ptr, new);
if (old)
kref_put(&old->refcount, psi_trigger_destroy);
}
__poll_t psi_trigger_poll(void **trigger_ptr, __poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait) struct file *file, poll_table *wait)
{ {
...@@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, ...@@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled)) if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
rcu_read_lock(); t = smp_load_acquire(trigger_ptr);
if (!t)
t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
if (!t) {
rcu_read_unlock();
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
}
kref_get(&t->refcount);
rcu_read_unlock();
poll_wait(file, &t->event_wait, wait); poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1) if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI; ret |= EPOLLPRI;
kref_put(&t->refcount, psi_trigger_destroy);
return ret; return ret;
} }
...@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, ...@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0'; buf[buf_size - 1] = '\0';
new = psi_trigger_create(&psi_system, buf, nbytes, res);
if (IS_ERR(new))
return PTR_ERR(new);
seq = file->private_data; seq = file->private_data;
/* Take seq->lock to protect seq->private from concurrent writes */ /* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock); mutex_lock(&seq->lock);
psi_trigger_replace(&seq->private, new);
/* Allow only one trigger per file descriptor */
if (seq->private) {
mutex_unlock(&seq->lock);
return -EBUSY;
}
new = psi_trigger_create(&psi_system, buf, nbytes, res);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
}
smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock); mutex_unlock(&seq->lock);
return nbytes; return nbytes;
...@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) ...@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{ {
struct seq_file *seq = file->private_data; struct seq_file *seq = file->private_data;
psi_trigger_replace(&seq->private, NULL); psi_trigger_destroy(seq->private);
return single_release(inode, file); return single_release(inode, file);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment