Commit d418d070 authored by Linus Torvalds

Merge tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

 - NVMe pull request from Keith that addresses deadlocks, double resets,
   memory leaks, and other regressions.

 - Fixup elv_support_iosched() for bio based devices (Damien)

 - Fixup for the ahci PCS quirk (Dan)

 - Socket O_NONBLOCK handling fix for io_uring (me)

 - Timeout sequence io_uring fixes (yangerkun)

 - MD warning fix for parameter default_layout (Song)

 - blkcg activation fixes (Tejun)

 - blk-rq-qos node deletion fix (Tejun)

* tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block:
  nvme-pci: Set the prp2 correctly when using more than 4k page
  io_uring: fix logic error in io_timeout
  io_uring: fix up O_NONBLOCK handling for sockets
  md/raid0: fix warning message for parameter default_layout
  libata/ahci: Fix PCS quirk application
  blk-rq-qos: fix first node deletion of rq_qos_del()
  blkcg: Fix multiple bugs in blkcg_activate_policy()
  io_uring: consider the overflow of sequence for timeout req
  nvme-tcp: fix possible leakage during error flow
  nvmet-loop: fix possible leakage during error flow
  block: Fix elv_support_iosched()
  nvme-tcp: Initialize sk->sk_ll_usec only with NET_RX_BUSY_POLL
  nvme: Wait for reset state when required
  nvme: Prevent resets during paused controller state
  nvme: Restart request timers in resetting state
  nvme: Remove ADMIN_ONLY state
  nvme-pci: Free tagset if no IO queues
  nvme: retain split access workaround for capability reads
  nvme: fix possible deadlock when nvme_update_formats fails
parents dfdcff32 b55f0097
@@ -1362,7 +1362,7 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	struct blkg_policy_data *pd_prealloc = NULL;
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg, *pinned_blkg = NULL;
 	int ret;
 
 	if (blkcg_policy_enabled(q, pol))
@@ -1370,49 +1370,82 @@ int blkcg_activate_policy(struct request_queue *q,
 	if (queue_is_mq(q))
 		blk_mq_freeze_queue(q);
-pd_prealloc:
-	if (!pd_prealloc) {
-		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root);
-		if (!pd_prealloc) {
-			ret = -ENOMEM;
-			goto out_bypass_end;
-		}
-	}
-
+retry:
 	spin_lock_irq(&q->queue_lock);
 
-	/* blkg_list is pushed at the head, reverse walk to init parents first */
+	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
 		struct blkg_policy_data *pd;
 
 		if (blkg->pd[pol->plid])
 			continue;
 
-		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root);
-		if (!pd)
-			swap(pd, pd_prealloc);
+		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
+		if (blkg == pinned_blkg) {
+			pd = pd_prealloc;
+			pd_prealloc = NULL;
+		} else {
+			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
+					      blkg->blkcg);
+		}
+
 		if (!pd) {
+			/*
+			 * GFP_NOWAIT failed. Free the existing one and
+			 * prealloc for @blkg w/ GFP_KERNEL.
+			 */
+			if (pinned_blkg)
+				blkg_put(pinned_blkg);
+			blkg_get(blkg);
+			pinned_blkg = blkg;
+
 			spin_unlock_irq(&q->queue_lock);
-			goto pd_prealloc;
+
+			if (pd_prealloc)
+				pol->pd_free_fn(pd_prealloc);
+			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
+						       blkg->blkcg);
+			if (pd_prealloc)
+				goto retry;
+			else
+				goto enomem;
 		}
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
-		if (pol->pd_init_fn)
-			pol->pd_init_fn(pd);
 	}
 
+	/* all allocated, init in the same order */
+	if (pol->pd_init_fn)
+		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+			pol->pd_init_fn(blkg->pd[pol->plid]);
+
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
 
 	spin_unlock_irq(&q->queue_lock);
-out_bypass_end:
+out:
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q);
+
+	if (pinned_blkg)
+		blkg_put(pinned_blkg);
 	if (pd_prealloc)
 		pol->pd_free_fn(pd_prealloc);
 	return ret;
+
+enomem:
+	/* alloc failed, nothing's initialized yet, free everything */
+	spin_lock_irq(&q->queue_lock);
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		if (blkg->pd[pol->plid]) {
+			pol->pd_free_fn(blkg->pd[pol->plid]);
+			blkg->pd[pol->plid] = NULL;
+		}
+	}
+	spin_unlock_irq(&q->queue_lock);
+	ret = -ENOMEM;
+	goto out;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
......
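For readers outside the kernel tree, the allocation pattern the blkcg_activate_policy() change above adopts (try a cheap non-blocking allocation while holding the lock; on failure drop the lock, preallocate with a blocking call, and restart the walk) can be sketched in plain userspace C. The mutex, the alloc_nowait()/alloc_blocking() helpers, and the fixed slot array below are stand-ins, and the blkg_get()/blkg_put() pinning is omitted because the slot set here cannot change while the lock is dropped; this is an illustration of the retry structure, not the kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins: a lock that must not be held across a blocking allocation,
 * and a non-blocking allocator that is allowed to fail. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nowait_budget = 2;		/* first two fast allocations succeed */

static int *alloc_nowait(void)
{
	if (nowait_budget-- > 0)
		return malloc(sizeof(int));
	return NULL;			/* simulate a GFP_NOWAIT failure */
}

static int *alloc_blocking(void)
{
	return malloc(sizeof(int));	/* the kernel analogue may sleep */
}

#define NSLOTS 5

int main(void)
{
	int *slot[NSLOTS] = { NULL };
	int *prealloc = NULL;
	int i;

retry:
	pthread_mutex_lock(&lock);
	for (i = 0; i < NSLOTS; i++) {
		int *p;

		if (slot[i])		/* already populated on a prior pass */
			continue;

		/* Consume the preallocated object first, then try the fast path. */
		if (prealloc) {
			p = prealloc;
			prealloc = NULL;
		} else {
			p = alloc_nowait();
		}

		if (!p) {
			/* Fast path failed: drop the lock, allocate with a
			 * blocking call, and restart the walk. */
			pthread_mutex_unlock(&lock);
			prealloc = alloc_blocking();
			if (!prealloc)
				return 1;	/* toy demo: give up (earlier slots leak) */
			goto retry;
		}

		*p = i;
		slot[i] = p;
	}
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NSLOTS; i++) {
		printf("slot[%d] = %d\n", i, *slot[i]);
		free(slot[i]);
	}
	return 0;
}

The point of the restart is that nothing half-initialized is published while the lock is dropped, which is exactly what lets the kernel version defer pd_init_fn() until every allocation has succeeded.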
@@ -108,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
 {
-	struct rq_qos *cur, *prev = NULL;
-
-	for (cur = q->rq_qos; cur; cur = cur->next) {
-		if (cur == rqos) {
-			if (prev)
-				prev->next = rqos->next;
-			else
-				q->rq_qos = cur;
+	struct rq_qos **cur;
+
+	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+		if (*cur == rqos) {
+			*cur = rqos->next;
 			break;
 		}
-		prev = cur;
 	}
 
 	blk_mq_debugfs_unregister_rqos(rqos);
......
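The rq_qos_del() fix above drops the prev/cur walk, whose else branch assigned q->rq_qos = cur (a no-op that left the first node linked), in favour of a pointer-to-pointer walk that handles the list head and interior nodes with the same code. A standalone userspace sketch of that idiom, using hypothetical node/slist_remove names rather than the block-layer types:

#include <assert.h>
#include <stdio.h>

struct node {
	int val;
	struct node *next;
};

/* Unlink @target from the list headed at *head.  Because we always hold a
 * pointer to the link that points at the current node, removing the first
 * node needs no special case. */
static void slist_remove(struct node **head, struct node *target)
{
	struct node **cur;

	for (cur = head; *cur; cur = &(*cur)->next) {
		if (*cur == target) {
			*cur = target->next;
			break;
		}
	}
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	slist_remove(&head, &a);	/* deleting the first node now works */
	assert(head == &b);
	slist_remove(&head, &c);
	assert(head == &b && b.next == NULL);
	printf("remaining: %d\n", head->val);
	return 0;
}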
@@ -616,7 +616,8 @@ int elevator_switch_mq(struct request_queue *q,
 
 static inline bool elv_support_iosched(struct request_queue *q)
 {
-	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
+	if (!q->mq_ops ||
+	    (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
 		return false;
 	return true;
 }
......
@@ -1600,7 +1600,9 @@ static void ahci_intel_pcs_quirk(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
 	 */
 	if (!id || id->vendor != PCI_VENDOR_ID_INTEL)
 		return;
-	if (((enum board_ids) id->driver_data) < board_ahci_pcs7)
+
+	/* Skip applying the quirk on Denverton and beyond */
+	if (((enum board_ids) id->driver_data) >= board_ahci_pcs7)
 		return;
 
 	/*
......
@@ -154,7 +154,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		} else {
 			pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
 			       mdname(mddev));
-			pr_err("md/raid0: please set raid.default_layout to 1 or 2\n");
+			pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
 			err = -ENOTSUPP;
 			goto abort;
 		}
......
@@ -116,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl)
 	/*
 	 * Only new queue scan work when admin and IO queues are both alive
 	 */
-	if (ctrl->state == NVME_CTRL_LIVE)
+	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
 		queue_work(nvme_wq, &ctrl->scan_work);
 }
 
+/*
+ * Use this function to proceed with scheduling reset_work for a controller
+ * that had previously been set to the resetting state. This is intended for
+ * code paths that can't be interrupted by other reset attempts. A hot removal
+ * may prevent this from succeeding.
+ */
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->state != NVME_CTRL_RESETTING)
+		return -EBUSY;
+	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
+		return -EBUSY;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
+
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -137,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 	ret = nvme_reset_ctrl(ctrl);
 	if (!ret) {
 		flush_work(&ctrl->reset_work);
-		if (ctrl->state != NVME_CTRL_LIVE &&
-		    ctrl->state != NVME_CTRL_ADMIN_ONLY)
+		if (ctrl->state != NVME_CTRL_LIVE)
 			ret = -ENETRESET;
 	}
 
@@ -315,15 +330,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	old_state = ctrl->state;
 
 	switch (new_state) {
-	case NVME_CTRL_ADMIN_ONLY:
-		switch (old_state) {
-		case NVME_CTRL_CONNECTING:
-			changed = true;
-			/* FALLTHRU */
-		default:
-			break;
-		}
-		break;
 	case NVME_CTRL_LIVE:
 		switch (old_state) {
 		case NVME_CTRL_NEW:
@@ -339,7 +345,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		switch (old_state) {
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_LIVE:
-		case NVME_CTRL_ADMIN_ONLY:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -359,7 +364,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	case NVME_CTRL_DELETING:
 		switch (old_state) {
 		case NVME_CTRL_LIVE:
-		case NVME_CTRL_ADMIN_ONLY:
 		case NVME_CTRL_RESETTING:
 		case NVME_CTRL_CONNECTING:
 			changed = true;
@@ -381,8 +385,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		break;
 	}
 
-	if (changed)
+	if (changed) {
 		ctrl->state = new_state;
+		wake_up_all(&ctrl->state_wq);
+	}
 
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 	if (changed && ctrl->state == NVME_CTRL_LIVE)
@@ -391,6 +397,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+	switch (ctrl->state) {
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_LIVE:
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	case NVME_CTRL_DELETING:
+	case NVME_CTRL_DEAD:
+		return true;
+	default:
+		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+		return true;
+	}
+}
+
+/*
+ * Waits for the controller state to be resetting, or returns false if it is
+ * not possible to ever transition to that state.
+ */
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
+{
+	wait_event(ctrl->state_wq,
+		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
+		   nvme_state_terminal(ctrl));
+	return ctrl->state == NVME_CTRL_RESETTING;
+}
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
+
 static void nvme_free_ns_head(struct kref *ref)
 {
 	struct nvme_ns_head *head =
@@ -1306,8 +1345,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
 		if (ns->disk && nvme_revalidate_disk(ns->disk))
 			nvme_set_queue_dying(ns);
 	up_read(&ctrl->namespaces_rwsem);
-
-	nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
 }
 
 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@@ -1323,6 +1360,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 		nvme_unfreeze(ctrl);
 		nvme_mpath_unfreeze(ctrl->subsys);
 		mutex_unlock(&ctrl->subsys->lock);
+		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
 		mutex_unlock(&ctrl->scan_lock);
 	}
 	if (effects & NVME_CMD_EFFECTS_CCC)
@@ -2874,7 +2912,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
 	switch (ctrl->state) {
 	case NVME_CTRL_LIVE:
-	case NVME_CTRL_ADMIN_ONLY:
 		break;
 	default:
 		return -EWOULDBLOCK;
@@ -3168,7 +3205,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
 	static const char *const state_name[] = {
 		[NVME_CTRL_NEW]		= "new",
 		[NVME_CTRL_LIVE]	= "live",
-		[NVME_CTRL_ADMIN_ONLY]	= "only-admin",
 		[NVME_CTRL_RESETTING]	= "resetting",
 		[NVME_CTRL_CONNECTING]	= "connecting",
 		[NVME_CTRL_DELETING]	= "deleting",
@@ -3679,11 +3715,10 @@ static void nvme_scan_work(struct work_struct *work)
 	struct nvme_id_ctrl *id;
 	unsigned nn;
 
-	if (ctrl->state != NVME_CTRL_LIVE)
+	/* No tagset on a live ctrl means IO queues could not created */
+	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
 		return;
 
-	WARN_ON_ONCE(!ctrl->tagset);
-
 	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
 		dev_info(ctrl->device, "rescanning namespaces.\n");
 		nvme_clear_changed_ns_log(ctrl);
@@ -3844,13 +3879,13 @@ static void nvme_fw_act_work(struct work_struct *work)
 		if (time_after(jiffies, fw_act_timeout)) {
 			dev_warn(ctrl->device,
 				"Fw activation timeout, reset controller\n");
-			nvme_reset_ctrl(ctrl);
-			break;
+			nvme_try_sched_reset(ctrl);
+			return;
 		}
 		msleep(100);
 	}
 
-	if (ctrl->state != NVME_CTRL_LIVE)
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
 		return;
 
 	nvme_start_queues(ctrl);
@@ -3870,7 +3905,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 		nvme_queue_scan(ctrl);
 		break;
 	case NVME_AER_NOTICE_FW_ACT_STARTING:
-		queue_work(nvme_wq, &ctrl->fw_act_work);
+		/*
+		 * We are (ab)using the RESETTING state to prevent subsequent
+		 * recovery actions from interfering with the controller's
+		 * firmware activation.
+		 */
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+			queue_work(nvme_wq, &ctrl->fw_act_work);
 		break;
 #ifdef CONFIG_NVME_MULTIPATH
 	case NVME_AER_NOTICE_ANA:
@@ -3993,6 +4034,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
 	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+	init_waitqueue_head(&ctrl->state_wq);
 
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
......
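The core.c changes above add a state_wq waitqueue so that nvme_wait_reset() can sleep until the transition to RESETTING either succeeds or becomes impossible. The same "wait until a guarded state transition wins, or a terminal state is reached" pattern can be sketched in userspace with a pthread condition variable. The state set and transition rules below are simplified stand-ins, not the NVMe controller state machine.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum ctrl_state { ST_NEW, ST_LIVE, ST_RESETTING, ST_DELETING };

static struct {
	pthread_mutex_t lock;
	pthread_cond_t  state_wq;	/* analogue of ctrl->state_wq */
	enum ctrl_state state;
} ctrl = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, ST_NEW };

/* Analogue of nvme_change_ctrl_state(): validate the transition and wake
 * any waiters whenever the state actually changes.  Caller holds ctrl.lock. */
static bool change_state_locked(enum ctrl_state new_state)
{
	bool changed = false;

	if (new_state == ST_RESETTING && ctrl.state == ST_LIVE)
		changed = true;
	else if (new_state == ST_LIVE &&
		 (ctrl.state == ST_NEW || ctrl.state == ST_RESETTING))
		changed = true;
	else if (new_state == ST_DELETING)
		changed = true;

	if (changed) {
		ctrl.state = new_state;
		pthread_cond_broadcast(&ctrl.state_wq);
	}
	return changed;
}

static bool state_terminal_locked(void)
{
	return ctrl.state == ST_DELETING;	/* can never go live again */
}

/* Analogue of nvme_wait_reset(): block until we either win the transition
 * to RESETTING or learn that it can never happen. */
static bool wait_reset(void)
{
	pthread_mutex_lock(&ctrl.lock);
	while (!change_state_locked(ST_RESETTING) && !state_terminal_locked())
		pthread_cond_wait(&ctrl.state_wq, &ctrl.lock);
	bool ok = (ctrl.state == ST_RESETTING);
	pthread_mutex_unlock(&ctrl.lock);
	return ok;
}

static void *make_live(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&ctrl.lock);
	change_state_locked(ST_LIVE);	/* wakes the waiter in wait_reset() */
	pthread_mutex_unlock(&ctrl.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, make_live, NULL);
	printf("wait_reset() -> %s\n", wait_reset() ? "resetting" : "terminal");
	pthread_join(t, NULL);
	return 0;
}

Broadcasting on every successful transition is what makes the terminal check safe: a waiter re-evaluates both conditions each time the state moves, so it can never sleep forever once the controller is torn down.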
@@ -182,8 +182,7 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
 static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		bool queue_live)
 {
-	if (likely(ctrl->state == NVME_CTRL_LIVE ||
-		   ctrl->state == NVME_CTRL_ADMIN_ONLY))
+	if (likely(ctrl->state == NVME_CTRL_LIVE))
 		return true;
 	return __nvmf_check_ready(ctrl, rq, queue_live);
 }
......
@@ -15,6 +15,7 @@
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
 #include <linux/rcupdate.h>
+#include <linux/wait.h>
 #include <trace/events/block.h>
@@ -161,7 +162,6 @@ static inline u16 nvme_req_qid(struct request *req)
 enum nvme_ctrl_state {
 	NVME_CTRL_NEW,
 	NVME_CTRL_LIVE,
-	NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
 	NVME_CTRL_RESETTING,
 	NVME_CTRL_CONNECTING,
 	NVME_CTRL_DELETING,
@@ -199,6 +199,7 @@ struct nvme_ctrl {
 	struct cdev cdev;
 	struct work_struct reset_work;
 	struct work_struct delete_work;
+	wait_queue_head_t state_wq;
 
 	struct nvme_subsystem *subsys;
 	struct list_head subsys_entry;
@@ -449,6 +450,7 @@ void nvme_complete_rq(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state);
+bool nvme_wait_reset(struct nvme_ctrl *ctrl);
 int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
 int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
@@ -499,6 +501,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
......
@@ -773,7 +773,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
 		struct bio_vec *bv)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
+	unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
+	unsigned int first_prp_len = dev->ctrl.page_size - offset;
 
 	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
 	if (dma_mapping_error(dev->dev, iod->first_dma))
@@ -2263,10 +2264,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 	return true;
 }
 
-/*
- * return error value only when tagset allocation failed
- */
-static int nvme_dev_add(struct nvme_dev *dev)
+static void nvme_dev_add(struct nvme_dev *dev)
 {
 	int ret;
 
@@ -2296,7 +2294,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		if (ret) {
 			dev_warn(dev->ctrl.device,
 				"IO queues tagset allocation failed %d\n", ret);
-			return ret;
+			return;
 		}
 		dev->ctrl.tagset = &dev->tagset;
 	} else {
@@ -2307,7 +2305,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	}
 
 	nvme_dbbuf_set(dev);
-	return 0;
 }
 
 static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2467,6 +2464,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	mutex_unlock(&dev->shutdown_lock);
 }
 
+static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
+{
+	if (!nvme_wait_reset(&dev->ctrl))
+		return -EBUSY;
+	nvme_dev_disable(dev, shutdown);
+	return 0;
+}
+
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
 {
 	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
@@ -2490,14 +2495,20 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 	dma_pool_destroy(dev->prp_small_pool);
 }
 
+static void nvme_free_tagset(struct nvme_dev *dev)
+{
+	if (dev->tagset.tags)
+		blk_mq_free_tag_set(&dev->tagset);
+	dev->ctrl.tagset = NULL;
+}
+
 static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 {
 	struct nvme_dev *dev = to_nvme_dev(ctrl);
 
 	nvme_dbbuf_dma_free(dev);
 	put_device(dev->dev);
-	if (dev->tagset.tags)
-		blk_mq_free_tag_set(&dev->tagset);
+	nvme_free_tagset(dev);
 	if (dev->ctrl.admin_q)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
@@ -2508,6 +2519,11 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 
 static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
 {
+	/*
+	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+	 * may be holding this pci_dev's device lock.
+	 */
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_get_ctrl(&dev->ctrl);
 	nvme_dev_disable(dev, false);
 	nvme_kill_queues(&dev->ctrl);
@@ -2521,7 +2537,6 @@ static void nvme_reset_work(struct work_struct *work)
 		container_of(work, struct nvme_dev, ctrl.reset_work);
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result;
-	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
 
 	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
 		result = -ENODEV;
@@ -2615,13 +2630,11 @@ static void nvme_reset_work(struct work_struct *work)
 		dev_warn(dev->ctrl.device, "IO queues not created\n");
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
-		new_state = NVME_CTRL_ADMIN_ONLY;
+		nvme_free_tagset(dev);
 	} else {
 		nvme_start_queues(&dev->ctrl);
 		nvme_wait_freeze(&dev->ctrl);
-		/* hit this only when allocate tagset fails */
-		if (nvme_dev_add(dev))
-			new_state = NVME_CTRL_ADMIN_ONLY;
+		nvme_dev_add(dev);
 		nvme_unfreeze(&dev->ctrl);
 	}
 
@@ -2629,9 +2642,9 @@ static void nvme_reset_work(struct work_struct *work)
 	 * If only admin queue live, keep it to do further investigation or
 	 * recovery.
 	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
 		dev_warn(dev->ctrl.device,
-			"failed to mark controller state %d\n", new_state);
+			"failed to mark controller live state\n");
 		result = -ENODEV;
 		goto out;
 	}
@@ -2672,7 +2685,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 
 static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 {
-	*val = readq(to_nvme_dev(ctrl)->bar + off);
+	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
 	return 0;
 }
 
@@ -2836,19 +2849,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 static void nvme_reset_prepare(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
-	nvme_dev_disable(dev, false);
+
+	/*
+	 * We don't need to check the return value from waiting for the reset
+	 * state as pci_dev device lock is held, making it impossible to race
+	 * with ->remove().
+	 */
+	nvme_disable_prepare_reset(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 }
 
 static void nvme_reset_done(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
-	nvme_reset_ctrl_sync(&dev->ctrl);
+
+	if (!nvme_try_sched_reset(&dev->ctrl))
+		flush_work(&dev->ctrl.reset_work);
 }
 
 static void nvme_shutdown(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
-	nvme_dev_disable(dev, true);
+	nvme_disable_prepare_reset(dev, true);
 }
 
 /*
@@ -2901,7 +2923,7 @@ static int nvme_resume(struct device *dev)
 
 	if (ndev->last_ps == U32_MAX ||
 	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
-		nvme_reset_ctrl(ctrl);
+		return nvme_try_sched_reset(&ndev->ctrl);
 	return 0;
 }
 
@@ -2929,17 +2951,14 @@ static int nvme_suspend(struct device *dev)
 	 */
 	if (pm_suspend_via_firmware() || !ctrl->npss ||
 	    !pcie_aspm_enabled(pdev) ||
-	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) {
-		nvme_dev_disable(ndev, true);
-		return 0;
-	}
+	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
+		return nvme_disable_prepare_reset(ndev, true);
 
 	nvme_start_freeze(ctrl);
 	nvme_wait_freeze(ctrl);
 	nvme_sync_queues(ctrl);
 
-	if (ctrl->state != NVME_CTRL_LIVE &&
-	    ctrl->state != NVME_CTRL_ADMIN_ONLY)
+	if (ctrl->state != NVME_CTRL_LIVE)
 		goto unfreeze;
 
 	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
@@ -2965,9 +2984,8 @@ static int nvme_suspend(struct device *dev)
 		 * Clearing npss forces a controller reset on resume. The
 		 * correct value will be resdicovered then.
 		 */
-		nvme_dev_disable(ndev, true);
+		ret = nvme_disable_prepare_reset(ndev, true);
 		ctrl->npss = 0;
-		ret = 0;
 	}
 unfreeze:
 	nvme_unfreeze(ctrl);
@@ -2977,9 +2995,7 @@ static int nvme_suspend(struct device *dev)
 static int nvme_simple_suspend(struct device *dev)
 {
 	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
-
-	nvme_dev_disable(ndev, true);
-	return 0;
+	return nvme_disable_prepare_reset(ndev, true);
 }
 
 static int nvme_simple_resume(struct device *dev)
@@ -2987,8 +3003,7 @@ static int nvme_simple_resume(struct device *dev)
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-	nvme_reset_ctrl(&ndev->ctrl);
-	return 0;
+	return nvme_try_sched_reset(&ndev->ctrl);
 }
 
 static const struct dev_pm_ops nvme_dev_pm_ops = {
......
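The nvme-pci prp2 fix above masks the bvec offset with the controller page size before computing the length covered by the first PRP, so an offset larger than one controller page (possible with multi-page bvecs or system pages bigger than 4k) no longer makes the unsigned subtraction wrap. A tiny standalone demonstration of that arithmetic, with a hypothetical first_prp_len() helper rather than the driver code:

#include <stdio.h>

/* Length covered by the first PRP entry: from the buffer start to the end
 * of the controller page that contains it.  ctrl_page_size must be a
 * power of two (4096 is NVMe's minimum page size). */
static unsigned int first_prp_len(unsigned int bv_offset,
				  unsigned int ctrl_page_size)
{
	unsigned int offset = bv_offset & (ctrl_page_size - 1);

	return ctrl_page_size - offset;
}

int main(void)
{
	/* An offset below one controller page behaves as before... */
	printf("%u\n", first_prp_len(512, 4096));		/* 3584 */
	/* ...and an offset beyond it no longer underflows: without the mask,
	 * 4096 - 4608 would wrap to a huge unsigned length. */
	printf("%u\n", first_prp_len(4096 + 512, 4096));	/* 3584 */
	return 0;
}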
@@ -1701,6 +1701,14 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 	dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
 		 rq->tag, nvme_rdma_queue_idx(queue));
 
+	/*
+	 * Restart the timer if a controller reset is already scheduled. Any
+	 * timed out commands would be handled before entering the connecting
+	 * state.
+	 */
+	if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+		return BLK_EH_RESET_TIMER;
+
 	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
 		/*
 		 * Teardown immediately if controller times out while starting
......
@@ -1386,7 +1386,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
 	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
 	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
 	queue->sock->sk->sk_ll_usec = 1;
+#endif
 	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
 
 	return 0;
@@ -2044,6 +2046,14 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
 	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
 	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 
+	/*
+	 * Restart the timer if a controller reset is already scheduled. Any
+	 * timed out commands would be handled before entering the connecting
+	 * state.
+	 */
+	if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+		return BLK_EH_RESET_TIMER;
+
 	dev_warn(ctrl->ctrl.device,
 		"queue %d: timeout request %#x type %d\n",
 		nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
@@ -2126,6 +2136,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 
 	ret = nvme_tcp_map_data(queue, rq);
 	if (unlikely(ret)) {
+		nvme_cleanup_cmd(rq);
 		dev_err(queue->ctrl->ctrl.device,
 			"Failed to map data (%d)\n", ret);
 		return ret;
......
@@ -157,8 +157,10 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		iod->sg_table.sgl = iod->first_sgl;
 		if (sg_alloc_table_chained(&iod->sg_table,
 				blk_rq_nr_phys_segments(req),
-				iod->sg_table.sgl, SG_CHUNK_SIZE))
+				iod->sg_table.sgl, SG_CHUNK_SIZE)) {
+			nvme_cleanup_cmd(req);
 			return BLK_STS_RESOURCE;
+		}
 
 		iod->req.sg = iod->sg_table.sgl;
 		iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
......
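Both the nvme-tcp and nvmet-loop hunks above fix the same shape of bug: an early error return that skipped undoing work done earlier in the submission path, hence the added nvme_cleanup_cmd() calls. A generic userspace sketch of the rule (every failure exit releases everything acquired before it), with malloc() standing in for the command and SG-table setup and a hypothetical submit_request() name:

#include <stdio.h>
#include <stdlib.h>

static int submit_request(size_t payload_len, int simulate_map_failure)
{
	char *cmd, *sg;

	cmd = malloc(64);			/* analogue of command setup */
	if (!cmd)
		return -1;

	sg = malloc(payload_len);		/* analogue of SG table allocation */
	if (!sg) {
		free(cmd);			/* the added cleanup: undo the earlier step */
		return -1;
	}

	if (simulate_map_failure) {		/* analogue of the data-mapping failure */
		free(sg);
		free(cmd);			/* again, undo everything set up so far */
		return -1;
	}

	printf("submitted %zu bytes\n", payload_len);
	/* On success a real driver hands these off to the in-flight request and
	 * releases them at completion time; freed here only to keep the demo leak-free. */
	free(sg);
	free(cmd);
	return 0;
}

int main(void)
{
	submit_request(4096, 0);
	submit_request(4096, 1);	/* failure path releases everything */
	return 0;
}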
@@ -322,6 +322,8 @@ struct io_kiocb {
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
 #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
+#define REQ_F_ISREG		2048	/* regular file */
+#define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -914,26 +916,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 	return ret;
 }
 
-static void kiocb_end_write(struct kiocb *kiocb)
+static void kiocb_end_write(struct io_kiocb *req)
 {
-	if (kiocb->ki_flags & IOCB_WRITE) {
-		struct inode *inode = file_inode(kiocb->ki_filp);
+	/*
+	 * Tell lockdep we inherited freeze protection from submission
+	 * thread.
+	 */
+	if (req->flags & REQ_F_ISREG) {
+		struct inode *inode = file_inode(req->file);
 
-		/*
-		 * Tell lockdep we inherited freeze protection from submission
-		 * thread.
-		 */
-		if (S_ISREG(inode->i_mode))
-			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
-		file_end_write(kiocb->ki_filp);
+		__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 	}
+	file_end_write(req->file);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
-	kiocb_end_write(kiocb);
+	if (kiocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
 
 	if ((req->flags & REQ_F_LINK) && res != req->result)
 		req->flags |= REQ_F_FAIL_LINK;
@@ -945,7 +947,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
-	kiocb_end_write(kiocb);
+	if (kiocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
 
 	if ((req->flags & REQ_F_LINK) && res != req->result)
 		req->flags |= REQ_F_FAIL_LINK;
@@ -1059,8 +1062,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	if (!req->file)
 		return -EBADF;
 
-	if (force_nonblock && !io_file_supports_async(req->file))
-		force_nonblock = false;
+	if (S_ISREG(file_inode(req->file)->i_mode))
+		req->flags |= REQ_F_ISREG;
+
+	/*
+	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+	 * we know to async punt it even if it was opened O_NONBLOCK
+	 */
+	if (force_nonblock && !io_file_supports_async(req->file)) {
+		req->flags |= REQ_F_MUST_PUNT;
+		return -EAGAIN;
+	}
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -1081,7 +1093,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		return ret;
 
 	/* don't allow async punt if RWF_NOWAIT was requested */
-	if (kiocb->ki_flags & IOCB_NOWAIT)
+	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+	    (req->file->f_flags & O_NONBLOCK))
 		req->flags |= REQ_F_NOWAIT;
 
 	if (force_nonblock)
@@ -1382,7 +1395,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		 * need async punt anyway, so it's more efficient to do it
 		 * here.
 		 */
-		if (force_nonblock && ret2 > 0 && ret2 < read_size)
+		if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
+		    (req->flags & REQ_F_ISREG) &&
+		    ret2 > 0 && ret2 < read_size)
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
@@ -1447,7 +1462,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	 * released so that it doesn't complain about the held lock when
 	 * we return to userspace.
 	 */
-	if (S_ISREG(file_inode(file)->i_mode)) {
+	if (req->flags & REQ_F_ISREG) {
 		__sb_start_write(file_inode(file)->i_sb,
 					SB_FREEZE_WRITE, true);
 		__sb_writers_release(file_inode(file)->i_sb,
@@ -1884,7 +1899,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	unsigned count, req_dist, tail_index;
+	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct list_head *entry;
 	struct timespec64 ts;
@@ -1907,21 +1922,36 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		count = 1;
 
 	req->sequence = ctx->cached_sq_head + count - 1;
+	/* reuse it to store the count */
+	req->submit.sequence = count;
 	req->flags |= REQ_F_TIMEOUT;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
 	 * the one we need first.
 	 */
-	tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
-	req_dist = req->sequence - tail_index;
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-		unsigned dist;
+		unsigned nxt_sq_head;
+		long long tmp, tmp_nxt;
 
-		dist = nxt->sequence - tail_index;
-		if (req_dist >= dist)
+		/*
+		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * long to store it.
+		 */
+		tmp = (long long)ctx->cached_sq_head + count - 1;
+		nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
+		tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+
+		/*
+		 * cached_sq_head may overflow, and it will never overflow twice
+		 * once there is some timeout req still be valid.
+		 */
+		if (ctx->cached_sq_head < nxt_sq_head)
+			tmp += UINT_MAX;
+
+		if (tmp >= tmp_nxt)
 			break;
 	}
 	list_add(&req->list, entry);
@@ -2267,7 +2297,13 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	int ret;
 
 	ret = __io_submit_sqe(ctx, req, s, force_nonblock);
-	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+
+	/*
+	 * We async punt it if the file wasn't marked NOWAIT, or if the file
+	 * doesn't support non-blocking read/write attempts
+	 */
+	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
+	    (req->flags & REQ_F_MUST_PUNT))) {
		struct io_uring_sqe *sqe_copy;

		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
......
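The io_timeout() change above copes with u32 sequence wraparound by widening to long long and adding UINT_MAX once cached_sq_head has wrapped relative to the entry being compared. A more compact, commonly used alternative (not what io_uring does here, just the underlying idea) is to compare forward distances from a common reference in unsigned arithmetic, which is wraparound-safe by construction. A standalone sketch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Compare two absolute u32 sequence targets that may have wrapped.
 * Both are interpreted relative to @head: the subtraction is done in
 * unsigned 32-bit arithmetic, so a wrapped target still yields the
 * correct forward distance from head.
 */
static int seq_after(uint32_t head, uint32_t a, uint32_t b)
{
	return (uint32_t)(a - head) >= (uint32_t)(b - head);
}

int main(void)
{
	uint32_t head = UINT32_MAX - 2;	/* sequence counter about to wrap */
	uint32_t t1 = head + 5;		/* wraps around to 2 */
	uint32_t t2 = head + 1;		/* UINT32_MAX - 1 */

	/* A naive comparison orders the wrapped target first... */
	printf("naive: t1 >= t2 ? %d\n", t1 >= t2);			/* 0: wrong */
	/* ...while the distance-from-head comparison does not. */
	printf("wrap-safe: t1 after t2 ? %d\n", seq_after(head, t1, t2));	/* 1 */
	printf("t1=%" PRIu32 " t2=%" PRIu32 "\n", t1, t2);
	return 0;
}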