Commit 7d080fa8 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

 - MD fixes via Song:
     - md-cluster fixes (Heming Zhao)
     - raid1 fix (Mateusz Jończyk)

 - s390/dasd module description (Jeff)

 - Series cleaning up and hardening the blk-mq debugfs flag handling
   (John, Christoph)

 - blk-cgroup cleanup (Xiu)

 - Error polled IO attempts if backend doesn't support it (hexue)

 - Fix for an sbitmap hang (Yang)

* tag 'for-6.11/block-20240722' of git://git.kernel.dk/linux: (23 commits)
  blk-cgroup: move congestion_count to struct blkcg
  sbitmap: fix io hung due to race on sbitmap_word::cleared
  block: avoid polling configuration errors
  block: Catch possible entries missing from rqf_name[]
  block: Simplify definition of RQF_NAME()
  block: Use enum to define RQF_x bit indexes
  block: Catch possible entries missing from cmd_flag_name[]
  block: Catch possible entries missing from alloc_policy_name[]
  block: Catch possible entries missing from hctx_flag_name[]
  block: Catch possible entries missing from hctx_state_name[]
  block: Catch possible entries missing from blk_queue_flag_name[]
  block: Make QUEUE_FLAG_x as an enum
  block: Relocate BLK_MQ_MAX_DEPTH
  block: Relocate BLK_MQ_CPU_WORK_BATCH
  block: remove QUEUE_FLAG_STOPPED
  block: Add missing entry to hctx_flag_name[]
  block: Add zone write plugging entry to rqf_name[]
  block: Add missing entries from cmd_flag_name[]
  s390/dasd: fix error checks in dasd_copy_pair_store()
  s390/dasd: add missing MODULE_DESCRIPTION() macros
  ...
parents 02569948 89ed6c9a
......@@ -2182,12 +2182,13 @@ void blk_cgroup_bio_start(struct bio *bio)
bool blk_cgroup_congested(void)
{
struct cgroup_subsys_state *css;
struct blkcg *blkcg;
bool ret = false;
rcu_read_lock();
for (css = blkcg_css(); css; css = css->parent) {
if (atomic_read(&css->cgroup->congestion_count)) {
for (blkcg = css_to_blkcg(blkcg_css()); blkcg;
blkcg = blkcg_parent(blkcg)) {
if (atomic_read(&blkcg->congestion_count)) {
ret = true;
break;
}
......
......@@ -95,6 +95,8 @@ struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
refcount_t online_pin;
/* If there is block congestion on this cgroup. */
atomic_t congestion_count;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
......@@ -374,7 +376,7 @@ static inline void blkcg_use_delay(struct blkcg_gq *blkg)
if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
return;
if (atomic_add_return(1, &blkg->use_delay) == 1)
atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
atomic_inc(&blkg->blkcg->congestion_count);
}
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
......@@ -399,7 +401,7 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
if (old == 0)
return 0;
if (old == 1)
atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
atomic_dec(&blkg->blkcg->congestion_count);
return 1;
}
......@@ -418,7 +420,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
/* We only want 1 person setting the congestion count for this blkg. */
if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
atomic_inc(&blkg->blkcg->congestion_count);
atomic64_set(&blkg->delay_nsec, delay);
}
......@@ -435,7 +437,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
/* We only want 1 person clearing the congestion count for this blkg. */
if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
atomic_dec(&blkg->blkcg->congestion_count);
}
/**
......
......@@ -791,8 +791,11 @@ void submit_bio_noacct(struct bio *bio)
}
}
if (!(q->limits.features & BLK_FEAT_POLL))
if (!(q->limits.features & BLK_FEAT_POLL) &&
(bio->bi_opf & REQ_POLLED)) {
bio_clear_polled(bio);
goto not_supported;
}
switch (bio_op(bio)) {
case REQ_OP_READ:
......
......@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/build_bug.h>
#include <linux/debugfs.h>
#include "blk.h"
......@@ -79,7 +80,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(STOPPED),
QUEUE_FLAG_NAME(DYING),
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
......@@ -100,6 +100,7 @@ static int queue_state_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX);
blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
ARRAY_SIZE(blk_queue_flag_name));
seq_puts(m, "\n");
......@@ -164,6 +165,7 @@ static int hctx_state_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX);
blk_flags_show(m, hctx->state, hctx_state_name,
ARRAY_SIZE(hctx_state_name));
seq_puts(m, "\n");
......@@ -181,10 +183,11 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING),
HCTX_FLAG_NAME(TAG_HCTX_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
};
#undef HCTX_FLAG_NAME
......@@ -193,6 +196,10 @@ static int hctx_flags_show(void *data, struct seq_file *m)
struct blk_mq_hw_ctx *hctx = data;
const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
BLK_MQ_F_ALLOC_POLICY_START_BIT);
BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
seq_puts(m, "alloc_policy=");
if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
alloc_policy_name[alloc_policy])
......@@ -223,12 +230,17 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(RAHEAD),
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(POLLED),
CMD_FLAG_NAME(ALLOC_CACHE),
CMD_FLAG_NAME(SWAP),
CMD_FLAG_NAME(DRV),
CMD_FLAG_NAME(FS_PRIVATE),
CMD_FLAG_NAME(ATOMIC),
CMD_FLAG_NAME(NOUNMAP),
};
#undef CMD_FLAG_NAME
#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
#define RQF_NAME(name) [__RQF_##name] = #name
static const char *const rqf_name[] = {
RQF_NAME(STARTED),
RQF_NAME(FLUSH_SEQ),
......@@ -243,6 +255,7 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_PLUGGING),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
......@@ -268,6 +281,9 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS);
BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS);
seq_printf(m, "%p {.op=", rq);
if (strcmp(op_str, "UNKNOWN") == 0)
seq_printf(m, "%u", op);
......
......@@ -36,6 +36,8 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
#define BLK_MQ_CPU_WORK_BATCH (8)
typedef unsigned int __bitwise blk_insert_t;
#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
......
......@@ -15,6 +15,7 @@
#define LVB_SIZE 64
#define NEW_DEV_TIMEOUT 5000
#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ)
struct dlm_lock_resource {
dlm_lockspace_t *ls;
......@@ -56,6 +57,7 @@ struct resync_info {
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
#define MD_CLUSTER_WAITING_FOR_SYNC 9
struct md_cluster_info {
struct mddev *mddev; /* the md device which md_cluster_info belongs to */
......@@ -91,6 +93,7 @@ struct md_cluster_info {
sector_t sync_hi;
};
/* For compatibility, add the new msg_type at the end. */
enum msg_type {
METADATA_UPDATED = 0,
RESYNCING,
......@@ -100,6 +103,7 @@ enum msg_type {
BITMAP_NEEDS_SYNC,
CHANGE_CAPACITY,
BITMAP_RESIZE,
RESYNCING_START,
};
struct cluster_msg {
......@@ -130,8 +134,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
0, sync_ast, res, res->bast);
if (ret)
return ret;
wait_event(res->sync_locking, res->sync_locking_done);
ret = wait_event_timeout(res->sync_locking, res->sync_locking_done,
WAIT_DLM_LOCK_TIMEOUT);
res->sync_locking_done = false;
if (!ret) {
pr_err("locking DLM '%s' timeout!\n", res->name);
return -EBUSY;
}
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status;
......@@ -455,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev,
clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
md_wakeup_thread(mddev->thread);
return;
}
......@@ -525,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
res = -1;
}
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
return res;
}
......@@ -593,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case CHANGE_CAPACITY:
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
break;
case RESYNCING_START:
clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
break;
case RESYNCING:
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
......@@ -743,7 +757,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
*/
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
int error;
int error, unlock_error;
int slot = cinfo->slot_number - 1;
cmsg->slot = cpu_to_le32(slot);
......@@ -751,7 +765,7 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
if (error) {
pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
goto failed_message;
return error;
}
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
......@@ -781,14 +795,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
error = dlm_unlock_sync(cinfo->message_lockres);
if (unlikely(error != 0)) {
while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres)))
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
error);
/* in case the message can't be released due to some reason */
goto failed_ack;
}
failed_message:
unlock_error);
return error;
}
......@@ -1343,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
spin_unlock_irq(&cinfo->suspend_lock);
}
static int resync_status_get(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
}
static int resync_start_notify(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg = {0};
cmsg.type = cpu_to_le32(RESYNCING_START);
return sendmsg(cinfo, &cmsg, 0);
}
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
......@@ -1577,6 +1604,8 @@ static const struct md_cluster_operations cluster_ops = {
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
.resync_start_notify = resync_start_notify,
.resync_status_get = resync_status_get,
.resync_info_get = resync_info_get,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
......
......@@ -14,6 +14,8 @@ struct md_cluster_operations {
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*resync_start_notify)(struct mddev *mddev);
int (*resync_status_get)(struct mddev *mddev);
void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
......
......@@ -8978,7 +8978,8 @@ void md_do_sync(struct md_thread *thread)
* This will mean we have to start checking from the beginning again.
*
*/
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
......@@ -9992,7 +9993,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) {
MD_FEATURE_RESHAPE_ACTIVE) &&
!md_cluster_ops->resync_status_get(mddev)) {
/*
* -1 to make raid1_add_disk() set conf->fullsync
* to 1. This could avoid skipping sync when the
* remote node is down during resyncing.
*/
if ((le32_to_cpu(sb->feature_map)
& MD_FEATURE_RECOVERY_OFFSET))
rdev2->saved_raid_disk = -1;
else
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %pg\n",
......
......@@ -680,6 +680,7 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len == r1_bio->sectors) {
*max_sectors = read_len;
update_read_sectors(conf, disk, this_sector, read_len);
return disk;
}
......
......@@ -2248,13 +2248,19 @@ static ssize_t dasd_copy_pair_store(struct device *dev,
/* allocate primary devmap if needed */
prim_devmap = dasd_find_busid(prim_busid);
if (IS_ERR(prim_devmap))
if (IS_ERR(prim_devmap)) {
prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT);
if (IS_ERR(prim_devmap))
return PTR_ERR(prim_devmap);
}
/* allocate secondary devmap if needed */
sec_devmap = dasd_find_busid(sec_busid);
if (IS_ERR(sec_devmap))
if (IS_ERR(sec_devmap)) {
sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT);
if (IS_ERR(sec_devmap))
return PTR_ERR(sec_devmap);
}
/* setting copy relation is only allowed for offline secondary */
if (sec_devmap->device)
......
......@@ -29,6 +29,7 @@
#include "dasd_int.h"
#include "dasd_diag.h"
MODULE_DESCRIPTION("S/390 Support for DIAG access to DASD Disks");
MODULE_LICENSE("GPL");
/* The maximum number of blocks per request (max_blocks) is dependent on the
......
......@@ -44,6 +44,7 @@
/* 64k are 128 x 512 byte sectors */
#define DASD_RAW_SECTORS_PER_TRACK 128
MODULE_DESCRIPTION("S/390 DASD ECKD Disks device driver");
MODULE_LICENSE("GPL");
static struct dasd_discipline dasd_eckd_discipline;
......
......@@ -32,6 +32,7 @@
#define DASD_FBA_CCW_LOCATE 0x43
#define DASD_FBA_CCW_DEFINE_EXTENT 0x63
MODULE_DESCRIPTION("S/390 DASD FBA Disks device driver");
MODULE_LICENSE("GPL");
static struct dasd_discipline dasd_fba_discipline;
......
......@@ -27,38 +27,61 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
* request flags */
typedef __u32 __bitwise req_flags_t;
/* drive already may have started this one */
#define RQF_STARTED ((__force req_flags_t)(1 << 1))
/* request for flush sequence */
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5))
/* don't call prep for this one */
#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
/* use hctx->sched_tags */
#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8))
/* use an I/O scheduler for this request */
#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET ((__force req_flags_t)(1 << 11))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM ((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED ((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS ((__force req_flags_t)(1 << 17))
/* Look at ->special_vec for the actual data payload instead of the
/* Keep rqf_name[] in sync with the definitions below */
enum {
/* drive already may have started this one */
__RQF_STARTED,
/* request for flush sequence */
__RQF_FLUSH_SEQ,
/* merge of different types, fail separately */
__RQF_MIXED_MERGE,
/* don't call prep for this one */
__RQF_DONTPREP,
/* use hctx->sched_tags */
__RQF_SCHED_TAGS,
/* use an I/O scheduler for this request */
__RQF_USE_SCHED,
/* vaguely specified driver internal error. Ignored by block layer */
__RQF_FAILED,
/* don't warn about errors */
__RQF_QUIET,
/* account into disk and partition IO statistics */
__RQF_IO_STAT,
/* runtime pm request */
__RQF_PM,
/* on IO scheduler merge hash */
__RQF_HASHED,
/* track IO completion time */
__RQF_STATS,
/* Look at ->special_vec for the actual data payload instead of the
bio chain. */
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
/* The request completion needs to be signaled to zone write pluging. */
#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
#define RQF_RESV ((__force req_flags_t)(1 << 23))
__RQF_SPECIAL_PAYLOAD,
/* request completion needs to be signaled to zone write plugging. */
__RQF_ZONE_WRITE_PLUGGING,
/* ->timeout has been called, don't expire again */
__RQF_TIMED_OUT,
__RQF_RESV,
__RQF_BITS
};
#define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED))
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
#define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP))
#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
#define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED))
#define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED))
#define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET))
#define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT))
#define RQF_PM ((__force req_flags_t)(1 << __RQF_PM))
#define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED))
#define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS))
#define RQF_SPECIAL_PAYLOAD \
((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
#define RQF_ZONE_WRITE_PLUGGING \
((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT))
#define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
......@@ -278,8 +301,12 @@ enum blk_eh_timer_return {
BLK_EH_RESET_TIMER,
};
#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
/* Keep alloc_policy_name[] in sync with the definitions below */
enum {
BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */
BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */
BLK_TAG_ALLOC_MAX
};
/**
* struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
......@@ -644,6 +671,7 @@ struct blk_mq_ops {
#endif
};
/* Keep hctx_flag_name[] in sync with the definitions below */
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
......@@ -653,27 +681,17 @@ enum {
*/
BLK_MQ_F_STACKING = 1 << 2,
BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_BLOCKING = 1 << 4,
/* Do not allow an I/O scheduler to be configured. */
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_NO_SCHED = 1 << 5,
/*
* Select 'none' during queue registration in case of a single hwq
* or shared hwqs instead of 'mq-deadline'.
*/
BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2,
/* hw queue is inactive after all its CPUs become offline */
BLK_MQ_S_INACTIVE = 3,
BLK_MQ_MAX_DEPTH = 10240,
BLK_MQ_CPU_WORK_BATCH = 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
......@@ -682,8 +700,19 @@ enum {
((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
#define BLK_MQ_MAX_DEPTH (10240)
#define BLK_MQ_NO_HCTX_IDX (-1U)
enum {
/* Keep hctx_state_name[] in sync with the definitions below */
BLK_MQ_S_STOPPED,
BLK_MQ_S_TAG_ACTIVE,
BLK_MQ_S_SCHED_RESTART,
/* hw queue is inactive after all its CPUs become offline */
BLK_MQ_S_INACTIVE,
BLK_MQ_S_MAX
};
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass);
......
......@@ -354,6 +354,7 @@ enum req_op {
REQ_OP_LAST = (__force blk_opf_t)36,
};
/* Keep cmd_flag_name[] in sync with the definitions below */
enum req_flag_bits {
__REQ_FAILFAST_DEV = /* no driver retries of device errors */
REQ_OP_BITS,
......
......@@ -588,27 +588,28 @@ struct request_queue {
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */
#define QUEUE_FLAG_DYING 1 /* queue being torn down */
#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */
#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */
#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */
#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
enum {
QUEUE_FLAG_DYING, /* queue being torn down */
QUEUE_FLAG_NOMERGES, /* disable merge attempts */
QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */
QUEUE_FLAG_FAIL_IO, /* fake timeout */
QUEUE_FLAG_NOXMERGES, /* No extended merges */
QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */
QUEUE_FLAG_INIT_DONE, /* queue is initialized */
QUEUE_FLAG_STATS, /* track IO start and completion times */
QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */
QUEUE_FLAG_QUIESCED, /* queue has been quiesced */
QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */
QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */
QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */
QUEUE_FLAG_MAX
};
#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP)
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
......
......@@ -539,9 +539,6 @@ struct cgroup {
/* used to store eBPF programs */
struct cgroup_bpf bpf;
/* If there is block congestion on this cgroup. */
atomic_t congestion_count;
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;
......
......@@ -36,6 +36,11 @@ struct sbitmap_word {
* @cleared: word holding cleared bits
*/
unsigned long cleared ____cacheline_aligned_in_smp;
/**
* @swap_lock: serializes simultaneous updates of ->word and ->cleared
*/
spinlock_t swap_lock;
} ____cacheline_aligned_in_smp;
/**
......
......@@ -60,13 +60,31 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
/*
* See if we have deferred clears that we can batch move
*/
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
unsigned int depth, unsigned int alloc_hint, bool wrap)
{
unsigned long mask;
unsigned long mask, word_mask;
if (!READ_ONCE(map->cleared))
guard(spinlock_irqsave)(&map->swap_lock);
if (!map->cleared) {
if (depth == 0)
return false;
word_mask = (~0UL) >> (BITS_PER_LONG - depth);
/*
* The current behavior is to always retry after moving
* ->cleared to word, and we change it to retry in case
* of any free bits. To avoid an infinite loop, we need
* to take wrap & alloc_hint into account, otherwise a
* soft lockup may occur.
*/
if (!wrap && alloc_hint)
word_mask &= ~((1UL << alloc_hint) - 1);
return (READ_ONCE(map->word) & word_mask) != word_mask;
}
/*
* First get a stable cleared mask, setting the old mask to 0.
*/
......@@ -85,6 +103,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
bool alloc_hint)
{
unsigned int bits_per_word;
int i;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
......@@ -116,6 +135,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
return -ENOMEM;
}
for (i = 0; i < sb->map_nr; i++)
spin_lock_init(&sb->map[i].swap_lock);
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
......@@ -126,7 +148,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
unsigned int i;
for (i = 0; i < sb->map_nr; i++)
sbitmap_deferred_clear(&sb->map[i]);
sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
sb->depth = depth;
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
......@@ -179,7 +201,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
alloc_hint, wrap);
if (nr != -1)
break;
if (!sbitmap_deferred_clear(map))
if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
break;
} while (1);
......@@ -496,7 +518,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
unsigned int map_depth = __map_depth(sb, index);
unsigned long val;
sbitmap_deferred_clear(map);
sbitmap_deferred_clear(map, 0, 0, 0);
val = READ_ONCE(map->word);
if (val == (1UL << (map_depth - 1)) - 1)
goto next;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment