Commit 8cf1a3fc authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block

Pull core block IO bits from Jens Axboe:
 "The most complicated part if this is the request allocation rework by
  Tejun, which has been queued up for a long time and has been in
  for-next ditto as well.

  There are a few commits from yesterday and today, mostly trivial and
  obvious fixes.  So I'm pretty confident that it is sound.  It's also
  smaller than usual."

* 'for-3.6/core' of git://git.kernel.dk/linux-block:
  block: remove dead func declaration
  block: add partition resize function to blkpg ioctl
  block: uninitialized ioc->nr_tasks triggers WARN_ON
  block: do not artificially constrain max_sectors for stacking drivers
  blkcg: implement per-blkg request allocation
  block: prepare for multiple request_lists
  block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv
  blkcg: inline bio_blkcg() and friends
  block: allocate io_context upfront
  block: refactor get_request[_wait]()
  block: drop custom queue draining used by scsi_transport_{iscsi|fc}
  mempool: add @gfp_mask to mempool_create_node()
  blkcg: make root blkcg allocation use %GFP_KERNEL
  blkcg: __blkg_lookup_create() doesn't need radix preload
parents fcff06c4 80799fbb
......@@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice
this amount, since it applies only to reads or writes (not the accumulated
sum).
To avoid priority inversion through request starvation, a request
queue maintains a separate request pool per each cgroup when
CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
per-block-cgroup request pool. IOW, if there are N block cgroups,
each request queue may have upto N request pools, each independently
regulated by nr_requests.
read_ahead_kb (RW)
------------------
Maximum number of kilobytes to read-ahead for filesystems on this block
......
......@@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
struct blkcg, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
static struct blkcg *task_blkcg(struct task_struct *tsk)
{
return container_of(task_subsys_state(tsk, blkio_subsys_id),
struct blkcg, css);
}
struct blkcg *bio_blkcg(struct bio *bio)
{
if (bio && bio->bi_css)
return container_of(bio->bi_css, struct blkcg, css);
return task_blkcg(current);
}
EXPORT_SYMBOL_GPL(bio_blkcg);
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
......@@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
kfree(pd);
}
blk_exit_rl(&blkg->rl);
kfree(blkg);
}
......@@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg)
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
* @q: request_queue the new blkg is associated with
* @gfp_mask: allocation mask to use
*
* Allocate a new blkg assocating @blkcg and @q.
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
int i;
/* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
if (!blkg)
return NULL;
......@@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
blkg->blkcg = blkcg;
blkg->refcnt = 1;
/* root blkg uses @q->root_rl, init rl only for !root blkgs */
if (blkcg != &blkcg_root) {
if (blk_init_rl(&blkg->rl, q, gfp_mask))
goto err_free;
blkg->rl.blkg = blkg;
}
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
......@@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
continue;
/* alloc per-policy data and attach it to blkg */
pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
if (!pd) {
blkg_free(blkg);
return NULL;
}
pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
if (!pd)
goto err_free;
blkg->pd[i] = pd;
pd->blkg = blkg;
......@@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
}
return blkg;
err_free:
blkg_free(blkg);
return NULL;
}
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
......@@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blkg_lookup);
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
* necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
*/
static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
struct request_queue *q,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
int ret;
......@@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
blkg = __blkg_lookup(blkcg, q);
if (blkg) {
rcu_assign_pointer(blkcg->blkg_hint, blkg);
return blkg;
goto out_free;
}
/* blkg holds a reference to blkcg */
if (!css_tryget(&blkcg->css))
return ERR_PTR(-EINVAL);
if (!css_tryget(&blkcg->css)) {
blkg = ERR_PTR(-EINVAL);
goto out_free;
}
/* allocate */
ret = -ENOMEM;
blkg = blkg_alloc(blkcg, q);
if (unlikely(!blkg))
goto err_put;
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
if (unlikely(!new_blkg)) {
blkg = ERR_PTR(-ENOMEM);
goto out_put;
}
}
blkg = new_blkg;
/* insert */
ret = radix_tree_preload(GFP_ATOMIC);
if (ret)
goto err_free;
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) {
......@@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
}
spin_unlock(&blkcg->lock);
radix_tree_preload_end();
if (!ret)
return blkg;
err_free:
blkg_free(blkg);
err_put:
blkg = ERR_PTR(ret);
out_put:
css_put(&blkcg->css);
return ERR_PTR(ret);
out_free:
blkg_free(new_blkg);
return blkg;
}
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
......@@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
*/
if (unlikely(blk_queue_bypass(q)))
return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
return __blkg_lookup_create(blkcg, q);
return __blkg_lookup_create(blkcg, q, NULL);
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
......@@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
}
EXPORT_SYMBOL_GPL(__blkg_release);
/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
* because the root blkg uses @q->root_rl instead of its own rl.
*/
struct request_list *__blk_queue_next_rl(struct request_list *rl,
struct request_queue *q)
{
struct list_head *ent;
struct blkcg_gq *blkg;
/*
* Determine the current blkg list_head. The first entry is
* root_rl which is off @q->blkg_list and mapped to the head.
*/
if (rl == &q->root_rl) {
ent = &q->blkg_list;
} else {
blkg = container_of(rl, struct blkcg_gq, rl);
ent = &blkg->q_node;
}
/* walk to the next list_head, skip root blkcg */
ent = ent->next;
if (ent == &q->root_blkg->q_node)
ent = ent->next;
if (ent == &q->blkg_list)
return NULL;
blkg = container_of(ent, struct blkcg_gq, q_node);
return &blkg->rl;
}
static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
u64 val)
{
......@@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q,
struct blkcg_gq *blkg;
struct blkg_policy_data *pd, *n;
int cnt = 0, ret;
bool preloaded;
if (blkcg_policy_enabled(q, pol))
return 0;
/* preallocations for root blkg */
blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
if (!blkg)
return -ENOMEM;
preloaded = !radix_tree_preload(GFP_KERNEL);
blk_queue_bypass_start(q);
/* make sure the root blkg exists and count the existing blkgs */
spin_lock_irq(q->queue_lock);
rcu_read_lock();
blkg = __blkg_lookup_create(&blkcg_root, q);
blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
rcu_read_unlock();
if (preloaded)
radix_tree_preload_end();
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
goto out_unlock;
}
q->root_blkg = blkg;
q->root_rl.blkg = blkg;
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
......
......@@ -17,6 +17,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX
......@@ -93,6 +94,8 @@ struct blkcg_gq {
struct list_head q_node;
struct hlist_node blkcg_node;
struct blkcg *blkcg;
/* request allocation list for this blkcg-q pair */
struct request_list rl;
/* reference count */
int refcnt;
......@@ -120,8 +123,6 @@ struct blkcg_policy {
extern struct blkcg blkcg_root;
struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
struct blkcg *bio_blkcg(struct bio *bio);
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q);
......@@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
struct blkcg, css);
}
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
return container_of(task_subsys_state(tsk, blkio_subsys_id),
struct blkcg, css);
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
{
if (bio && bio->bi_css)
return container_of(bio->bi_css, struct blkcg, css);
return task_blkcg(current);
}
/**
* blkg_to_pdata - get policy private data
* @blkg: blkg of interest
......@@ -233,6 +253,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
__blkg_release(blkg);
}
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
* @bio: bio which will be attached to the allocated request (may be %NULL)
*
* The caller wants to allocate a request from @q to use for @bio. Find
* the request_list to use and obtain a reference on it. Should be called
* under queue_lock. This function is guaranteed to return non-%NULL
* request_list.
*/
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio)
{
struct blkcg *blkcg;
struct blkcg_gq *blkg;
rcu_read_lock();
blkcg = bio_blkcg(bio);
/* bypass blkg lookup and use @q->root_rl directly for root */
if (blkcg == &blkcg_root)
goto root_rl;
/*
* Try to use blkg->rl. blkg lookup may fail under memory pressure
* or if either the blkcg or queue is going away. Fall back to
* root_rl in such cases.
*/
blkg = blkg_lookup_create(blkcg, q);
if (unlikely(IS_ERR(blkg)))
goto root_rl;
blkg_get(blkg);
rcu_read_unlock();
return &blkg->rl;
root_rl:
rcu_read_unlock();
return &q->root_rl;
}
/**
* blk_put_rl - put request_list
* @rl: request_list to put
*
* Put the reference acquired by blk_get_rl(). Should be called under
* queue_lock.
*/
static inline void blk_put_rl(struct request_list *rl)
{
/* root_rl may not have blkg set */
if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
blkg_put(rl->blkg);
}
/**
* blk_rq_set_rl - associate a request with a request_list
* @rq: request of interest
* @rl: target request_list
*
* Associate @rq with @rl so that accounting and freeing can know the
* request_list @rq came from.
*/
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
{
rq->rl = rl;
}
/**
* blk_rq_rl - return the request_list a request came from
* @rq: request of interest
*
* Return the request_list @rq is allocated from.
*/
static inline struct request_list *blk_rq_rl(struct request *rq)
{
return rq->rl;
}
struct request_list *__blk_queue_next_rl(struct request_list *rl,
struct request_queue *q);
/**
* blk_queue_for_each_rl - iterate through all request_lists of a request_queue
*
* Should be used under queue_lock.
*/
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
/**
* blkg_stat_add - add a value to a blkg_stat
* @stat: target blkg_stat
......@@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
#else /* CONFIG_BLK_CGROUP */
struct cgroup;
struct blkcg;
struct blkg_policy_data {
};
......@@ -361,8 +471,6 @@ struct blkcg_gq {
struct blkcg_policy {
};
static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
......@@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
static inline void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { }
static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
......@@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio) { return &q->root_rl; }
static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
#endif /* CONFIG_BLK_CGROUP */
#endif /* _BLK_CGROUP_H */
This diff is collapsed.
......@@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
......
......@@ -143,8 +143,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_sectors = BLK_DEF_MAX_SECTORS;
lim->max_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
......
......@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
struct request_list *rl = &q->rq;
struct request_list *rl;
unsigned long nr;
int ret;
......@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
/* congestion isn't cgroup aware and follows root blkcg for now */
rl = &q->root_rl;
if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
blk_set_queue_congested(q, BLK_RW_SYNC);
else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
......@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
blk_clear_queue_congested(q, BLK_RW_ASYNC);
blk_queue_for_each_rl(rl, q) {
if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_SYNC);
blk_set_rl_full(rl, BLK_RW_SYNC);
} else {
blk_clear_queue_full(q, BLK_RW_SYNC);
blk_clear_rl_full(rl, BLK_RW_SYNC);
wake_up(&rl->wait[BLK_RW_SYNC]);
}
if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_ASYNC);
blk_set_rl_full(rl, BLK_RW_ASYNC);
} else {
blk_clear_queue_full(q, BLK_RW_ASYNC);
blk_clear_rl_full(rl, BLK_RW_ASYNC);
wake_up(&rl->wait[BLK_RW_ASYNC]);
}
}
spin_unlock_irq(q->queue_lock);
return ret;
}
......@@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
struct request_list *rl = &q->rq;
blk_sync_queue(q);
......@@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
elevator_exit(q->elevator);
}
if (rl->rq_pool)
mempool_destroy(rl->rq_pool);
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
__blk_queue_free_tags(q);
......
......@@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
goto out;
}
/* bio_associate_current() needs ioc, try creating */
create_io_context(GFP_ATOMIC, q->node);
/*
* A throtl_grp pointer retrieved under rcu can be used to access
* basic fields like stats and io rates. If a group has no rules,
......
......@@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q)
kobject_get(&q->kobj);
}
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_list *rl);
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
......@@ -33,7 +36,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
void blk_rq_timed_out_timer(unsigned long data);
void blk_delete_timer(struct request *);
void blk_add_timer(struct request *);
void __generic_unplug_device(struct request_queue *);
/*
* Internal atomic flags for request handling
......
......@@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
return 0;
}
EXPORT_SYMBOL_GPL(bsg_setup_queue);
/**
* bsg_remove_queue - Deletes the bsg dev from the q
* @q: the request_queue that is to be torn down.
*
* Notes:
* Before unregistering the queue empty any requests that are blocked
*/
void bsg_remove_queue(struct request_queue *q)
{
struct request *req; /* block request */
int counts; /* totals for request_list count and starved */
if (!q)
return;
/* Stop taking in new requests */
spin_lock_irq(q->queue_lock);
blk_stop_queue(q);
/* drain all requests in the queue */
while (1) {
/* need the lock to fetch a request
* this may fetch the same reqeust as the previous pass
*/
req = blk_fetch_request(q);
/* save requests in use and starved */
counts = q->rq.count[0] + q->rq.count[1] +
q->rq.starved[0] + q->rq.starved[1];
spin_unlock_irq(q->queue_lock);
/* any requests still outstanding? */
if (counts == 0)
break;
/* This may be the same req as the previous iteration,
* always send the blk_end_request_all after a prefetch.
* It is not okay to not end the request because the
* prefetch started the request.
*/
if (req) {
/* return -ENXIO to indicate that this queue is
* going away
*/
req->errors = -ENXIO;
blk_end_request_all(req, -ENXIO);
}
msleep(200); /* allow bsg to possibly finish */
spin_lock_irq(q->queue_lock);
}
bsg_unregister_queue(q);
}
EXPORT_SYMBOL_GPL(bsg_remove_queue);
......@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
part = rcu_dereference(ptbl->part[piter->idx]);
if (!part)
continue;
if (!part->nr_sects &&
if (!part_nr_sects_read(part) &&
!(piter->flags & DISK_PITER_INCL_EMPTY) &&
!(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
piter->idx == 0))
......@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
static inline int sector_in_part(struct hd_struct *part, sector_t sector)
{
return part->start_sect <= sector &&
sector < part->start_sect + part->nr_sects;
sector < part->start_sect + part_nr_sects_read(part);
}
/**
......@@ -769,8 +769,8 @@ void __init printk_all_partitions(void)
printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
bdevt_str(part_devt(part), devt_buf),
(unsigned long long)part->nr_sects >> 1,
disk_name(disk, part->partno, name_buf),
(unsigned long long)part_nr_sects_read(part) >> 1
, disk_name(disk, part->partno, name_buf),
uuid_buf);
if (is_part0) {
if (disk->driverfs_dev != NULL &&
......@@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v)
while ((part = disk_part_iter_next(&piter)))
seq_printf(seqf, "%4d %7d %10llu %s\n",
MAJOR(part_devt(part)), MINOR(part_devt(part)),
(unsigned long long)part->nr_sects >> 1,
(unsigned long long)part_nr_sects_read(part) >> 1,
disk_name(sgp, part->partno, buf));
disk_part_iter_exit(&piter);
......@@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
}
disk->part_tbl->part[0] = &disk->part0;
/*
* set_capacity() and get_capacity() currently don't use
* seqcounter to read/update the part0->nr_sects. Still init
* the counter as we can read the sectors in IO submission
* patch using seqence counters.
*
* TODO: Ideally set_capacity() and get_capacity() should be
* converted to make use of bd_mutex and sequence counters.
*/
seqcount_init(&disk->part0.nr_sects_seq);
hd_ref_init(&disk->part0);
disk->minors = minors;
......
......@@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
{
struct block_device *bdevp;
struct gendisk *disk;
struct hd_struct *part;
struct hd_struct *part, *lpart;
struct blkpg_ioctl_arg a;
struct blkpg_partition p;
struct disk_part_iter piter;
......@@ -91,6 +91,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return 0;
case BLKPG_RESIZE_PARTITION:
start = p.start >> 9;
/* new length of partition in bytes */
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0)
return -EINVAL;
}
part = disk_get_part(disk, partno);
if (!part)
return -ENXIO;
bdevp = bdget(part_devt(part));
if (!bdevp) {
disk_put_part(part);
return -ENOMEM;
}
mutex_lock(&bdevp->bd_mutex);
mutex_lock_nested(&bdev->bd_mutex, 1);
if (start != part->start_sect) {
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EINVAL;
}
/* overlap? */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY);
while ((lpart = disk_part_iter_next(&piter))) {
if (lpart->partno != partno &&
!(start + length <= lpart->start_sect ||
start >= lpart->start_sect + lpart->nr_sects)
) {
disk_part_iter_exit(&piter);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EBUSY;
}
}
disk_part_iter_exit(&piter);
part_nr_sects_write(part, (sector_t)length);
i_size_write(bdevp->bd_inode, p.length);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return 0;
default:
return -EINVAL;
......
......@@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
}
static ssize_t part_ro_show(struct device *dev,
......@@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
err = -ENOMEM;
goto out_free;
}
seqcount_init(&p->nr_sects_seq);
pdev = part_to_dev(p);
p->start_sect = start;
......
......@@ -4146,45 +4146,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
static void
fc_bsg_remove(struct request_queue *q)
{
struct request *req; /* block request */
int counts; /* totals for request_list count and starved */
if (q) {
/* Stop taking in new requests */
spin_lock_irq(q->queue_lock);
blk_stop_queue(q);
/* drain all requests in the queue */
while (1) {
/* need the lock to fetch a request
* this may fetch the same reqeust as the previous pass
*/
req = blk_fetch_request(q);
/* save requests in use and starved */
counts = q->rq.count[0] + q->rq.count[1] +
q->rq.starved[0] + q->rq.starved[1];
spin_unlock_irq(q->queue_lock);
/* any requests still outstanding? */
if (counts == 0)
break;
/* This may be the same req as the previous iteration,
* always send the blk_end_request_all after a prefetch.
* It is not okay to not end the request because the
* prefetch started the request.
*/
if (req) {
/* return -ENXIO to indicate that this queue is
* going away
*/
req->errors = -ENXIO;
blk_end_request_all(req, -ENXIO);
}
msleep(200); /* allow bsg to possibly finish */
spin_lock_irq(q->queue_lock);
}
bsg_unregister_queue(q);
blk_cleanup_queue(q);
}
......
......@@ -575,7 +575,7 @@ static int iscsi_remove_host(struct transport_container *tc,
struct iscsi_cls_host *ihost = shost->shost_data;
if (ihost->bsg_q) {
bsg_remove_queue(ihost->bsg_q);
bsg_unregister_queue(ihost->bsg_q);
blk_cleanup_queue(ihost->bsg_q);
}
return 0;
......
......@@ -46,16 +46,23 @@ struct blkcg_gq;
struct request;
typedef void (rq_end_io_fn)(struct request *, int);
#define BLK_RL_SYNCFULL (1U << 0)
#define BLK_RL_ASYNCFULL (1U << 1)
struct request_list {
struct request_queue *q; /* the queue this rl belongs to */
#ifdef CONFIG_BLK_CGROUP
struct blkcg_gq *blkg; /* blkg this request pool belongs to */
#endif
/*
* count[], starved[], and wait[] are indexed by
* BLK_RW_SYNC/BLK_RW_ASYNC
*/
int count[2];
int starved[2];
int elvpriv;
mempool_t *rq_pool;
wait_queue_head_t wait[2];
unsigned int flags;
};
/*
......@@ -138,6 +145,7 @@ struct request {
struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
unsigned long long start_time_ns;
unsigned long long io_start_time_ns; /* when passed to hardware */
#endif
......@@ -282,11 +290,16 @@ struct request_queue {
struct list_head queue_head;
struct request *last_merge;
struct elevator_queue *elevator;
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
/*
* the queue request freelist, one for reads and one for writes
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
* blkgs from their own blkg->rl. Which one to use should be
* determined using bio_request_list().
*/
struct request_list rq;
struct request_list root_rl;
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
......@@ -561,27 +574,25 @@ static inline bool rq_is_sync(struct request *rq)
return rw_is_sync(rq->cmd_flags);
}
static inline int blk_queue_full(struct request_queue *q, int sync)
static inline bool blk_rl_full(struct request_list *rl, bool sync)
{
if (sync)
return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
return rl->flags & flag;
}
static inline void blk_set_queue_full(struct request_queue *q, int sync)
static inline void blk_set_rl_full(struct request_list *rl, bool sync)
{
if (sync)
queue_flag_set(QUEUE_FLAG_SYNCFULL, q);
else
queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
rl->flags |= flag;
}
static inline void blk_clear_queue_full(struct request_queue *q, int sync)
static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
{
if (sync)
queue_flag_clear(QUEUE_FLAG_SYNCFULL, q);
else
queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
rl->flags &= ~flag;
}
......
......@@ -40,6 +40,7 @@ struct blkpg_ioctl_arg {
/* The subfunctions (for the op field) */
#define BLKPG_ADD_PARTITION 1
#define BLKPG_DEL_PARTITION 2
#define BLKPG_RESIZE_PARTITION 3
/* Sizes of name fields. Unused at present. */
#define BLKPG_DEVNAMELTH 64
......
......@@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result,
int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
bsg_job_fn *job_fn, int dd_job_size);
void bsg_request_fn(struct request_queue *q);
void bsg_remove_queue(struct request_queue *q);
void bsg_goose_queue(struct request_queue *q);
#endif
......@@ -97,7 +97,13 @@ struct partition_meta_info {
struct hd_struct {
sector_t start_sect;
/*
* nr_sects is protected by sequence counter. One might extend a
* partition while IO is happening to it and update of nr_sects
* can be non-atomic on 32bit machines with 64bit sector_t.
*/
sector_t nr_sects;
seqcount_t nr_sects_seq;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev;
......@@ -647,6 +653,57 @@ static inline void hd_struct_put(struct hd_struct *part)
__delete_partition(part);
}
/*
* Any access of part->nr_sects which is not protected by partition
* bd_mutex or gendisk bdev bd_mutex, should be done using this
* accessor function.
*
* Code written along the lines of i_size_read() and i_size_write().
* CONFIG_PREEMPT case optimizes the case of UP kernel with preemption
* on.
*/
static inline sector_t part_nr_sects_read(struct hd_struct *part)
{
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
sector_t nr_sects;
unsigned seq;
do {
seq = read_seqcount_begin(&part->nr_sects_seq);
nr_sects = part->nr_sects;
} while (read_seqcount_retry(&part->nr_sects_seq, seq));
return nr_sects;
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
sector_t nr_sects;
preempt_disable();
nr_sects = part->nr_sects;
preempt_enable();
return nr_sects;
#else
return part->nr_sects;
#endif
}
/*
* Should be called with mutex lock held (typically bd_mutex) of partition
* to provide mutual exlusion among writers otherwise seqcount might be
* left in wrong state leaving the readers spinning infinitely.
*/
static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
write_seqcount_begin(&part->nr_sects_seq);
part->nr_sects = size;
write_seqcount_end(&part->nr_sects_seq);
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
preempt_disable();
part->nr_sects = size;
preempt_enable();
#else
part->nr_sects = size;
#endif
}
#else /* CONFIG_BLOCK */
static inline void printk_all_partitions(void) { }
......
......@@ -26,7 +26,8 @@ typedef struct mempool_s {
extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data, int nid);
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int nid);
extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
extern void mempool_destroy(mempool_t *pool);
......
......@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data, int node_id)
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
if (!pool)
return NULL;
pool->elements = kmalloc_node(min_nr * sizeof(void *),
GFP_KERNEL, node_id);
gfp_mask, node_id);
if (!pool->elements) {
kfree(pool);
return NULL;
......@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
while (pool->curr_nr < pool->min_nr) {
void *element;
element = pool->alloc(GFP_KERNEL, pool->pool_data);
element = pool->alloc(gfp_mask, pool->pool_data);
if (unlikely(!element)) {
mempool_destroy(pool);
return NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment