Commit 4443f8e6 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-20190412' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "Set of fixes that should go into this round. This pull is larger than
  I'd like at this time, but there's really no specific reason for that.
  Some are fixes for issues that went into this merge window, others are
  not. Anyway, this contains:

   - Hardware queue limiting for virtio-blk/scsi (Dongli)

   - Multi-page bvec fixes for lightnvm pblk

   - Multi-bio dio error fix (Jason)

   - Remove the cache hint from the io_uring tool side, since we didn't
     move forward with that (me)

   - Make io_uring SETUP_SQPOLL root restricted (me)

   - Fix leak of page in error handling for pc requests (Jérôme)

   - Fix BFQ regression introduced in this merge window (Paolo)

   - Fix break logic for bio segment iteration (Ming)

   - Fix NVMe cancel request error handling (Ming)

   - NVMe pull request with two fixes (Christoph):
       - fix the initial CSN for nvme-fc (James)
       - handle log page offsets properly in the target (Keith)"

* tag 'for-linus-20190412' of git://git.kernel.dk/linux-block:
  block: fix the return errno for direct IO
  nvmet: fix discover log page when offsets are used
  nvme-fc: correct csn initialization and increments on error
  block: do not leak memory in bio_copy_user_iov()
  lightnvm: pblk: fix crash in pblk_end_partial_read due to multipage bvecs
  nvme: cancel request synchronously
  blk-mq: introduce blk_mq_complete_request_sync()
  scsi: virtio_scsi: limit number of hw queues by nr_cpu_ids
  virtio-blk: limit number of hw queues by nr_cpu_ids
  block, bfq: fix use after free in bfq_bfqq_expire
  io_uring: restrict IORING_SETUP_SQPOLL to root
  tools/io_uring: remove IOCQE_FLAG_CACHEHIT
  block: don't use for-inside-for in bio_for_each_segment_all
parents b60bc066 a89afe58
......@@ -2822,7 +2822,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
* If this bfqq is shared between multiple processes, check
......@@ -2855,9 +2855,11 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
/*
* All in-service entities must have been properly deactivated
* or requeued before executing the next function, which
* resets all in-service entites as no more in service.
* resets all in-service entities as no more in service. This
* may cause bfqq to be freed. If this happens, the next
* function returns true.
*/
__bfq_bfqd_reset_in_service(bfqd);
return __bfq_bfqd_reset_in_service(bfqd);
}
/**
......@@ -3262,7 +3264,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bool slow;
unsigned long delta = 0;
struct bfq_entity *entity = &bfqq->entity;
int ref;
/*
* Check whether the process is slow (see bfq_bfqq_is_slow).
......@@ -3347,10 +3348,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* reason.
*/
__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
ref = bfqq->ref;
__bfq_bfqq_expire(bfqd, bfqq);
if (ref == 1) /* bfqq is gone, no more actions on it */
if (__bfq_bfqq_expire(bfqd, bfqq))
/* bfqq is gone, no more actions on it */
return;
bfqq->injected_service = 0;
......
......@@ -995,7 +995,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity,
bool ins_into_idle_tree);
bool next_queue_may_preempt(struct bfq_data *bfqd);
struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool ins_into_idle_tree, bool expiration);
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
......
......@@ -1605,7 +1605,8 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
return bfqq;
}
void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
/* returns true if the in-service queue gets freed */
bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
......@@ -1629,8 +1630,20 @@ void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
* service tree either, then release the service reference to
* the queue it represents (taken with bfq_get_entity).
*/
if (!in_serv_entity->on_st)
if (!in_serv_entity->on_st) {
/*
* If no process is referencing in_serv_bfqq any
* longer, then the service reference may be the only
* reference to the queue. If this is the case, then
* bfqq gets freed here.
*/
int ref = in_serv_bfqq->ref;
bfq_put_queue(in_serv_bfqq);
if (ref == 1)
return true;
}
return false;
}
void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
......
......@@ -1298,8 +1298,11 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
}
}
if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
if (!map_data)
__free_page(page);
break;
}
len -= bytes;
offset = 0;
......
......@@ -654,6 +654,13 @@ bool blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
void blk_mq_complete_request_sync(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
int blk_mq_request_started(struct request *rq)
{
return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
......
......@@ -513,6 +513,8 @@ static int init_vq(struct virtio_blk *vblk)
if (err)
num_vqs = 1;
num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);
vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
if (!vblk->vqs)
return -ENOMEM;
......
......@@ -231,14 +231,14 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
struct pblk_sec_meta *meta;
struct bio *new_bio = rqd->bio;
struct bio *bio = pr_ctx->orig_bio;
struct bio_vec src_bv, dst_bv;
void *meta_list = rqd->meta_list;
int bio_init_idx = pr_ctx->bio_init_idx;
unsigned long *read_bitmap = pr_ctx->bitmap;
struct bvec_iter orig_iter = BVEC_ITER_ALL_INIT;
struct bvec_iter new_iter = BVEC_ITER_ALL_INIT;
int nr_secs = pr_ctx->orig_nr_secs;
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
void *src_p, *dst_p;
int hole, i;
int bit, i;
if (unlikely(nr_holes == 1)) {
struct ppa_addr ppa;
......@@ -257,18 +257,19 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
/* Fill the holes in the original bio */
i = 0;
hole = find_first_zero_bit(read_bitmap, nr_secs);
do {
for (bit = 0; bit < nr_secs; bit++) {
if (!test_bit(bit, read_bitmap)) {
struct bio_vec dst_bv, src_bv;
struct pblk_line *line;
line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
kref_put(&line->ref, pblk_line_put);
meta = pblk_get_meta(pblk, meta_list, hole);
meta = pblk_get_meta(pblk, meta_list, bit);
meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
src_bv = new_bio->bi_io_vec[i++];
dst_bv = bio->bi_io_vec[bio_init_idx + hole];
dst_bv = bio_iter_iovec(bio, orig_iter);
src_bv = bio_iter_iovec(new_bio, new_iter);
src_p = kmap_atomic(src_bv.bv_page);
dst_p = kmap_atomic(dst_bv.bv_page);
......@@ -280,10 +281,15 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
kunmap_atomic(src_p);
kunmap_atomic(dst_p);
flush_dcache_page(dst_bv.bv_page);
mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
} while (hole < nr_secs);
bio_advance_iter(new_bio, &new_iter,
PBLK_EXPOSED_PAGE_SIZE);
i++;
}
bio_advance_iter(bio, &orig_iter, PBLK_EXPOSED_PAGE_SIZE);
}
bio_put(new_bio);
kfree(pr_ctx);
......
......@@ -288,7 +288,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
"Cancelling I/O %d", req->tag);
nvme_req(req)->status = NVME_SC_ABORT_REQ;
blk_mq_complete_request(req);
blk_mq_complete_request_sync(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
......
......@@ -1845,7 +1845,7 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx)
memset(queue, 0, sizeof(*queue));
queue->ctrl = ctrl;
queue->qnum = idx;
atomic_set(&queue->csn, 1);
atomic_set(&queue->csn, 0);
queue->dev = ctrl->dev;
if (idx > 0)
......@@ -1887,7 +1887,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue)
*/
queue->connection_id = 0;
atomic_set(&queue->csn, 1);
atomic_set(&queue->csn, 0);
}
static void
......@@ -2183,7 +2183,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
{
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
u32 csn;
int ret, opstate;
/*
......@@ -2198,8 +2197,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
csn = atomic_inc_return(&queue->csn);
cmdiu->csn = cpu_to_be32(csn);
cmdiu->data_len = cpu_to_be32(data_len);
switch (io_dir) {
case NVMEFC_FCP_WRITE:
......@@ -2257,11 +2254,24 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
&ctrl->rport->remoteport,
queue->lldd_handle, &op->fcp_req);
if (ret) {
/*
* If the lld fails to send the command is there an issue with
* the csn value? If the command that fails is the Connect,
* no - as the connection won't be live. If it is a command
* post-connect, it's possible a gap in csn may be created.
* Does this matter? As Linux initiators don't send fused
* commands, no. The gap would exist, but as there's nothing
* that depends on csn order to be delivered on the target
* side, it shouldn't hurt. It would be difficult for a
* target to even detect the csn gap as it has no idea when the
* cmd with the csn was supposed to arrive.
*/
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
......
......@@ -24,6 +24,11 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
return len;
}
u64 nvmet_get_log_page_offset(struct nvme_command *cmd)
{
return le64_to_cpu(cmd->get_log_page.lpo);
}
static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
{
nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
......
......@@ -131,54 +131,76 @@ static void nvmet_set_disc_traddr(struct nvmet_req *req, struct nvmet_port *port
memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
}
static size_t discovery_log_entries(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_subsys_link *p;
struct nvmet_port *r;
size_t entries = 0;
list_for_each_entry(p, &req->port->subsystems, entry) {
if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
continue;
entries++;
}
list_for_each_entry(r, &req->port->referrals, entry)
entries++;
return entries;
}
static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
{
const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry);
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmf_disc_rsp_page_hdr *hdr;
u64 offset = nvmet_get_log_page_offset(req->cmd);
size_t data_len = nvmet_get_log_page_len(req->cmd);
size_t alloc_len = max(data_len, sizeof(*hdr));
int residual_len = data_len - sizeof(*hdr);
size_t alloc_len;
struct nvmet_subsys_link *p;
struct nvmet_port *r;
u32 numrec = 0;
u16 status = 0;
void *buffer;
/* Spec requires dword aligned offsets */
if (offset & 0x3) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto out;
}
/*
* Make sure we're passing at least a buffer of response header size.
* If host provided data len is less than the header size, only the
* number of bytes requested by host will be sent to host.
*/
hdr = kzalloc(alloc_len, GFP_KERNEL);
if (!hdr) {
down_read(&nvmet_config_sem);
alloc_len = sizeof(*hdr) + entry_size * discovery_log_entries(req);
buffer = kzalloc(alloc_len, GFP_KERNEL);
if (!buffer) {
up_read(&nvmet_config_sem);
status = NVME_SC_INTERNAL;
goto out;
}
down_read(&nvmet_config_sem);
hdr = buffer;
list_for_each_entry(p, &req->port->subsystems, entry) {
char traddr[NVMF_TRADDR_SIZE];
if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
continue;
if (residual_len >= entry_size) {
char traddr[NVMF_TRADDR_SIZE];
nvmet_set_disc_traddr(req, req->port, traddr);
nvmet_format_discovery_entry(hdr, req->port,
p->subsys->subsysnqn, traddr,
NVME_NQN_NVME, numrec);
residual_len -= entry_size;
}
numrec++;
}
list_for_each_entry(r, &req->port->referrals, entry) {
if (residual_len >= entry_size) {
nvmet_format_discovery_entry(hdr, r,
NVME_DISC_SUBSYS_NAME,
r->disc_addr.traddr,
NVME_NQN_DISC, numrec);
residual_len -= entry_size;
}
numrec++;
}
......@@ -190,8 +212,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
up_read(&nvmet_config_sem);
status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
kfree(hdr);
status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len);
kfree(buffer);
out:
nvmet_req_complete(req, status);
}
......
......@@ -428,6 +428,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
u32 nvmet_get_log_page_len(struct nvme_command *cmd);
u64 nvmet_get_log_page_offset(struct nvme_command *cmd);
extern struct list_head *nvmet_ports;
void nvmet_port_disc_changed(struct nvmet_port *port,
......
......@@ -793,6 +793,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
/* We need to know how many queues before we allocate. */
num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
num_queues = min_t(unsigned int, nr_cpu_ids, num_queues);
num_targets = virtscsi_config_get(vdev, max_target) + 1;
......
......@@ -307,10 +307,10 @@ static void blkdev_bio_end_io(struct bio *bio)
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
} else {
if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
......
......@@ -2245,6 +2245,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
if (ctx->flags & IORING_SETUP_SQPOLL) {
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto err;
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu;
......
......@@ -120,19 +120,23 @@ static inline bool bio_full(struct bio *bio)
return bio->bi_vcnt >= bio->bi_max_vecs;
}
#define mp_bvec_for_each_segment(bv, bvl, i, iter_all) \
for (bv = bvec_init_iter_all(&iter_all); \
(iter_all.done < (bvl)->bv_len) && \
(mp_bvec_next_segment((bvl), &iter_all), 1); \
iter_all.done += bv->bv_len, i += 1)
static inline bool bio_next_segment(const struct bio *bio,
struct bvec_iter_all *iter)
{
if (iter->idx >= bio->bi_vcnt)
return false;
bvec_advance(&bio->bi_io_vec[iter->idx], iter);
return true;
}
/*
* drivers should _never_ use the all version - the bio may have been split
* before it got to the driver and the driver won't own all of it
*/
#define bio_for_each_segment_all(bvl, bio, i, iter_all) \
for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++) \
mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all)
#define bio_for_each_segment_all(bvl, bio, i, iter) \
for (i = 0, bvl = bvec_init_iter_all(&iter); \
bio_next_segment((bio), &iter); i++)
static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
unsigned bytes)
......
......@@ -302,6 +302,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
bool blk_mq_complete_request(struct request *rq);
void blk_mq_complete_request_sync(struct request *rq);
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio);
bool blk_mq_queue_stopped(struct request_queue *q);
......
......@@ -145,18 +145,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
iter_all->bv.bv_page = NULL;
iter_all->done = 0;
iter_all->idx = 0;
return &iter_all->bv;
}
static inline void mp_bvec_next_segment(const struct bio_vec *bvec,
static inline void bvec_advance(const struct bio_vec *bvec,
struct bvec_iter_all *iter_all)
{
struct bio_vec *bv = &iter_all->bv;
if (bv->bv_page) {
if (iter_all->done) {
bv->bv_page = nth_page(bv->bv_page, 1);
bv->bv_offset = 0;
} else {
......@@ -165,6 +165,12 @@ static inline void mp_bvec_next_segment(const struct bio_vec *bvec,
}
bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset,
bvec->bv_len - iter_all->done);
iter_all->done += bv->bv_len;
if (iter_all->done == bvec->bv_len) {
iter_all->idx++;
iter_all->done = 0;
}
}
/*
......
......@@ -967,8 +967,13 @@ struct nvme_get_log_page_command {
__le16 numdl;
__le16 numdu;
__u16 rsvd11;
union {
struct {
__le32 lpol;
__le32 lpou;
};
__le64 lpo;
};
__u32 rsvd14[2];
};
......
......@@ -32,10 +32,6 @@
#include "liburing.h"
#include "barrier.h"
#ifndef IOCQE_FLAG_CACHEHIT
#define IOCQE_FLAG_CACHEHIT (1U << 0)
#endif
#define min(a, b) ((a < b) ? (a) : (b))
struct io_sq_ring {
......@@ -85,7 +81,6 @@ struct submitter {
unsigned long reaps;
unsigned long done;
unsigned long calls;
unsigned long cachehit, cachemiss;
volatile int finish;
__s32 *fds;
......@@ -270,10 +265,6 @@ static int reap_events(struct submitter *s)
return -1;
}
}
if (cqe->flags & IOCQE_FLAG_CACHEHIT)
s->cachehit++;
else
s->cachemiss++;
reaped++;
head++;
} while (1);
......@@ -489,7 +480,7 @@ static void file_depths(char *buf)
int main(int argc, char *argv[])
{
struct submitter *s = &submitters[0];
unsigned long done, calls, reap, cache_hit, cache_miss;
unsigned long done, calls, reap;
int err, i, flags, fd;
char *fdepths;
void *ret;
......@@ -569,44 +560,29 @@ int main(int argc, char *argv[])
pthread_create(&s->thread, NULL, submitter_fn, s);
fdepths = malloc(8 * s->nr_files);
cache_hit = cache_miss = reap = calls = done = 0;
reap = calls = done = 0;
do {
unsigned long this_done = 0;
unsigned long this_reap = 0;
unsigned long this_call = 0;
unsigned long this_cache_hit = 0;
unsigned long this_cache_miss = 0;
unsigned long rpc = 0, ipc = 0;
double hit = 0.0;
sleep(1);
this_done += s->done;
this_call += s->calls;
this_reap += s->reaps;
this_cache_hit += s->cachehit;
this_cache_miss += s->cachemiss;
if (this_cache_hit && this_cache_miss) {
unsigned long hits, total;
hits = this_cache_hit - cache_hit;
total = hits + this_cache_miss - cache_miss;
hit = (double) hits / (double) total;
hit *= 100.0;
}
if (this_call - calls) {
rpc = (this_done - done) / (this_call - calls);
ipc = (this_reap - reap) / (this_call - calls);
} else
rpc = ipc = -1;
file_depths(fdepths);
printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s), Cachehit=%0.2f%%\n",
printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
this_done - done, rpc, ipc, s->inflight,
fdepths, hit);
fdepths);
done = this_done;
calls = this_call;
reap = this_reap;
cache_hit = s->cachehit;
cache_miss = s->cachemiss;
} while (!finish);
pthread_join(s->thread, &ret);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment