Commit 4962a856 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'io_uring-5.10-2020-10-20' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "A mix of fixes and a few stragglers. In detail:

   - Revert the bogus __read_mostly that we discussed for the initial
     pull request.

   - Fix a merge window regression with fixed file registration error
     path handling.

   - Fix io-wq numa node affinities.

   - Series abstracting out an io_identity struct, making it both easier
     to see what the personality items are, and also easier to to adopt
     more. Use this to cover audit logging.

   - Fix for read-ahead disabled block condition in async buffered
     reads, and using single page read-ahead to unify what
     generic_file_buffer_read() path is used.

   - Series for REQ_F_COMP_LOCKED fix and removal of it (Pavel)

   - Poll fix (Pavel)"

* tag 'io_uring-5.10-2020-10-20' of git://git.kernel.dk/linux-block: (21 commits)
  io_uring: use blk_queue_nowait() to check if NOWAIT supported
  mm: use limited read-ahead to satisfy read
  mm: mark async iocb read as NOWAIT once some data has been copied
  io_uring: fix double poll mask init
  io-wq: inherit audit loginuid and sessionid
  io_uring: use percpu counters to track inflight requests
  io_uring: assign new io_identity for task if members have changed
  io_uring: store io_identity in io_uring_task
  io_uring: COW io_identity on mismatch
  io_uring: move io identity items into separate struct
  io_uring: rely solely on work flags to determine personality.
  io_uring: pass required context in as flags
  io-wq: assign NUMA node locality if appropriate
  io_uring: fix error path cleanup in io_sqe_files_register()
  Revert "io_uring: mark io_uring_fops/io_op_defs as __read_mostly"
  io_uring: fix REQ_F_COMP_LOCKED by killing it
  io_uring: dig out COMP_LOCK from deep call chain
  io_uring: don't put a poll req under spinlock
  io_uring: don't unnecessarily clear F_LINK_TIMEOUT
  io_uring: don't set COMP_LOCKED if won't put
  ...
parents 38525c69 9ba0d0c8
......@@ -18,6 +18,7 @@
#include <linux/fs_struct.h>
#include <linux/task_work.h>
#include <linux/blk-cgroup.h>
#include <linux/audit.h>
#include "io-wq.h"
......@@ -429,14 +430,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
mmput(worker->mm);
worker->mm = NULL;
}
if (!work->mm)
return;
if (mmget_not_zero(work->mm)) {
kthread_use_mm(work->mm);
worker->mm = work->mm;
/* hang on to this mm */
work->mm = NULL;
if (mmget_not_zero(work->identity->mm)) {
kthread_use_mm(work->identity->mm);
worker->mm = work->identity->mm;
return;
}
......@@ -448,9 +445,11 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker,
struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
if (work->blkcg_css != worker->blkcg_css) {
kthread_associate_blkcg(work->blkcg_css);
worker->blkcg_css = work->blkcg_css;
if (!(work->flags & IO_WQ_WORK_BLKCG))
return;
if (work->identity->blkcg_css != worker->blkcg_css) {
kthread_associate_blkcg(work->identity->blkcg_css);
worker->blkcg_css = work->identity->blkcg_css;
}
#endif
}
......@@ -458,9 +457,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker,
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
const struct cred *old_creds = override_creds(work->creds);
const struct cred *old_creds = override_creds(work->identity->creds);
worker->cur_creds = work->creds;
worker->cur_creds = work->identity->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
......@@ -470,20 +469,26 @@ static void io_wq_switch_creds(struct io_worker *worker,
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work->files && current->files != work->files) {
if ((work->flags & IO_WQ_WORK_FILES) &&
current->files != work->identity->files) {
task_lock(current);
current->files = work->files;
current->nsproxy = work->nsproxy;
current->files = work->identity->files;
current->nsproxy = work->identity->nsproxy;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
current->fs = work->identity->fs;
if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
if ((work->flags & IO_WQ_WORK_CREDS) &&
worker->cur_creds != work->identity->creds)
io_wq_switch_creds(worker, work);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
io_wq_switch_blkcg(worker, work);
#ifdef CONFIG_AUDIT
current->loginuid = work->identity->loginuid;
current->sessionid = work->identity->sessionid;
#endif
}
static void io_assign_current_work(struct io_worker *worker,
......@@ -496,6 +501,11 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
#ifdef CONFIG_AUDIT
current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
current->sessionid = AUDIT_SID_UNSET;
#endif
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
......@@ -676,6 +686,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
kfree(worker);
return false;
}
kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
......
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/io_uring.h>
struct io_wq;
enum {
......@@ -10,6 +12,12 @@ enum {
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
IO_WQ_WORK_FS = 64,
IO_WQ_WORK_MM = 128,
IO_WQ_WORK_CREDS = 256,
IO_WQ_WORK_BLKCG = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
......@@ -85,15 +93,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
struct files_struct *files;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *creds;
struct nsproxy *nsproxy;
struct fs_struct *fs;
unsigned long fsize;
struct io_identity *identity;
unsigned flags;
};
......
This diff is collapsed.
......@@ -1268,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
kuid_t kloginuid;
int rv;
/* Don't let kthreads write their own loginuid */
if (current->flags & PF_KTHREAD)
return -EPERM;
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
......
......@@ -4,18 +4,33 @@
#include <linux/sched.h>
#include <linux/xarray.h>
#include <linux/percpu-refcount.h>
struct io_identity {
struct files_struct *files;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *creds;
struct nsproxy *nsproxy;
struct fs_struct *fs;
unsigned long fsize;
#ifdef CONFIG_AUDIT
kuid_t loginuid;
unsigned int sessionid;
#endif
refcount_t count;
};
struct io_uring_task {
/* submission side */
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
atomic_long_t req_issue;
/* completion side */
bool in_idle ____cacheline_aligned_in_smp;
atomic_long_t req_complete;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
bool in_idle;
};
#if defined(CONFIG_IO_URING)
......
......@@ -2199,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if (written && (iocb->ki_flags & IOCB_WAITQ))
iocb->ki_flags |= IOCB_NOWAIT;
for (;;) {
struct page *page;
pgoff_t end_index;
......
......@@ -552,15 +552,23 @@ static void ondemand_readahead(struct readahead_control *ractl,
void page_cache_sync_ra(struct readahead_control *ractl,
struct file_ra_state *ra, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
return;
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
if (blk_cgroup_congested())
return;
/*
* Even if read-ahead is disabled, issue this request as read-ahead
* as we'll need it to satisfy the requested range. The forced
* read-ahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
if (!ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
do_forced_ra = true;
}
/* be dumb */
if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) {
if (do_forced_ra) {
force_page_cache_ra(ractl, ra, req_count);
return;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment