blk-mq: drain I/O when all CPUs in a hctx are offline
Most blk-mq drivers depend on managed IRQs' auto-affinity to set up queue mapping. Thomas mentioned the following point[1]: "That was the constraint of managed interrupts from the very beginning: The driver/subsystem has to quiesce the interrupt line and the associated queue _before_ it gets shutdown in CPU unplug and not fiddle with it until it's restarted by the core when the CPU is plugged in again." However, the current blk-mq implementation doesn't quiesce the hw queue before the last CPU in the hctx is shut down. Even worse, CPUHP_BLK_MQ_DEAD is a cpuhp state handled after the CPU is down, so there isn't any chance to quiesce the hctx before shutting down the CPU. Add a new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs where the last CPU goes away, and wait for completion of in-flight requests. This guarantees that there is no in-flight I/O before shutting down the managed IRQ. Add a BLK_MQ_F_STACKING flag and set it for dm-rq and loop, so we don't need to wait for completion of in-flight requests from these drivers to avoid a potential deadlock. It is safe to do this for stacking drivers as those do not use interrupts at all and their I/O completions are triggered by the underlying devices' I/O completion. [1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/ [hch: different retry mechanism, merged two patches, minor cleanups] Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
602380d28e
commit
bf0beec060
|
@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
|
|||
HCTX_STATE_NAME(STOPPED),
|
||||
HCTX_STATE_NAME(TAG_ACTIVE),
|
||||
HCTX_STATE_NAME(SCHED_RESTART),
|
||||
HCTX_STATE_NAME(INACTIVE),
|
||||
};
|
||||
#undef HCTX_STATE_NAME
|
||||
|
||||
|
@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
|
|||
HCTX_FLAG_NAME(TAG_SHARED),
|
||||
HCTX_FLAG_NAME(BLOCKING),
|
||||
HCTX_FLAG_NAME(NO_SCHED),
|
||||
HCTX_FLAG_NAME(STACKING),
|
||||
};
|
||||
#undef HCTX_FLAG_NAME
|
||||
|
||||
|
|
|
@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
|||
sbitmap_finish_wait(bt, ws, &wait);
|
||||
|
||||
found_tag:
|
||||
/*
|
||||
* Give up this allocation if the hctx is inactive. The caller will
|
||||
* retry on an active hctx.
|
||||
*/
|
||||
if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
|
||||
blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
|
||||
return BLK_MQ_NO_TAG;
|
||||
}
|
||||
return tag + tag_offset;
|
||||
}
|
||||
|
||||
|
|
112
block/blk-mq.c
112
block/blk-mq.c
|
@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
|
|||
e->type->ops.limit_depth(data->cmd_flags, data);
|
||||
}
|
||||
|
||||
retry:
|
||||
data->ctx = blk_mq_get_ctx(q);
|
||||
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
|
||||
if (!(data->flags & BLK_MQ_REQ_INTERNAL))
|
||||
blk_mq_tag_busy(data->hctx);
|
||||
|
||||
/*
|
||||
* Waiting allocations only fail because of an inactive hctx. In that
|
||||
* case just retry the hctx assignment and tag allocation as CPU hotplug
|
||||
* should have migrated us to an online CPU by now.
|
||||
*/
|
||||
tag = blk_mq_get_tag(data);
|
||||
if (tag == BLK_MQ_NO_TAG)
|
||||
return NULL;
|
||||
if (tag == BLK_MQ_NO_TAG) {
|
||||
if (data->flags & BLK_MQ_REQ_NOWAIT)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Give up the CPU and sleep for a random short time to ensure
|
||||
* that threads using a realtime scheduling class are migrated
|
||||
* off the CPU, and thus off the hctx that is going away.
|
||||
*/
|
||||
msleep(3);
|
||||
goto retry;
|
||||
}
|
||||
return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
|
||||
}
|
||||
|
||||
|
@ -2335,6 +2351,86 @@ fail:
|
|||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
 * State handed to blk_mq_has_request() through its opaque @data argument
 * while a tag set is being walked.
 */
struct rq_iter_data {
	/* Hardware context whose requests are being looked for. */
	struct blk_mq_hw_ctx *hctx;
	/* Set to true once a request owned by @hctx has been seen. */
	bool has_rq;
};
|
||||
|
||||
static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
|
||||
{
|
||||
struct rq_iter_data *iter_data = data;
|
||||
|
||||
if (rq->mq_hctx != iter_data->hctx)
|
||||
return true;
|
||||
iter_data->has_rq = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct blk_mq_tags *tags = hctx->sched_tags ?
|
||||
hctx->sched_tags : hctx->tags;
|
||||
struct rq_iter_data data = {
|
||||
.hctx = hctx,
|
||||
};
|
||||
|
||||
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
|
||||
return data.has_rq;
|
||||
}
|
||||
|
||||
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
|
||||
struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
|
||||
return false;
|
||||
if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
|
||||
struct blk_mq_hw_ctx, cpuhp_online);
|
||||
|
||||
if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
|
||||
!blk_mq_last_cpu_in_hctx(cpu, hctx))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Prevent new request from being allocated on the current hctx.
|
||||
*
|
||||
* The smp_mb__after_atomic() Pairs with the implied barrier in
|
||||
* test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
|
||||
* seen once we return from the tag allocator.
|
||||
*/
|
||||
set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
|
||||
smp_mb__after_atomic();
|
||||
|
||||
/*
|
||||
* Try to grab a reference to the queue and wait for any outstanding
|
||||
* requests. If we could not grab a reference the queue has been
|
||||
* frozen and there are no requests.
|
||||
*/
|
||||
if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
|
||||
while (blk_mq_hctx_has_requests(hctx))
|
||||
msleep(5);
|
||||
percpu_ref_put(&hctx->queue->q_usage_counter);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
|
||||
struct blk_mq_hw_ctx, cpuhp_online);
|
||||
|
||||
if (cpumask_test_cpu(cpu, hctx->cpumask))
|
||||
clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* 'cpu' is going away. splice any existing rq_list entries from this
|
||||
* software queue to the hw queue dispatch list, and ensure that it
|
||||
|
@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
|
|||
enum hctx_type type;
|
||||
|
||||
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
|
||||
if (!cpumask_test_cpu(cpu, hctx->cpumask))
|
||||
return 0;
|
||||
|
||||
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
|
||||
type = hctx->type;
|
||||
|
||||
|
@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
|
|||
|
||||
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (!(hctx->flags & BLK_MQ_F_STACKING))
|
||||
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
|
||||
&hctx->cpuhp_online);
|
||||
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
|
||||
&hctx->cpuhp_dead);
|
||||
}
|
||||
|
@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
|
|||
{
|
||||
hctx->queue_num = hctx_idx;
|
||||
|
||||
if (!(hctx->flags & BLK_MQ_F_STACKING))
|
||||
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
|
||||
&hctx->cpuhp_online);
|
||||
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
|
||||
|
||||
hctx->tags = set->tags[hctx_idx];
|
||||
|
@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void)
|
|||
{
|
||||
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
|
||||
blk_mq_hctx_notify_dead);
|
||||
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
|
||||
blk_mq_hctx_notify_online,
|
||||
blk_mq_hctx_notify_offline);
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(blk_mq_init);
|
||||
|
|
|
@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i)
|
|||
lo->tag_set.queue_depth = 128;
|
||||
lo->tag_set.numa_node = NUMA_NO_NODE;
|
||||
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
|
||||
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
|
||||
lo->tag_set.driver_data = lo;
|
||||
|
||||
err = blk_mq_alloc_tag_set(&lo->tag_set);
|
||||
|
|
|
@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
|
|||
md->tag_set->ops = &dm_mq_ops;
|
||||
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
|
||||
md->tag_set->numa_node = md->numa_node_id;
|
||||
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
|
||||
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
|
||||
md->tag_set->driver_data = md;
|
||||
|
||||
|
|
|
@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
|
|||
*/
|
||||
atomic_t nr_active;
|
||||
|
||||
/** @cpuhp_online: List to store request if CPU is going to die */
|
||||
struct hlist_node cpuhp_online;
|
||||
/** @cpuhp_dead: List to store request if some CPU die. */
|
||||
struct hlist_node cpuhp_dead;
|
||||
/** @kobj: Kernel object for sysfs. */
|
||||
|
@ -391,6 +393,11 @@ struct blk_mq_ops {
|
|||
enum {
|
||||
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
|
||||
BLK_MQ_F_TAG_SHARED = 1 << 1,
|
||||
/*
|
||||
* Set when this device requires underlying blk-mq device for
|
||||
* completing IO:
|
||||
*/
|
||||
BLK_MQ_F_STACKING = 1 << 2,
|
||||
BLK_MQ_F_BLOCKING = 1 << 5,
|
||||
BLK_MQ_F_NO_SCHED = 1 << 6,
|
||||
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
|
||||
|
@ -400,6 +407,9 @@ enum {
|
|||
BLK_MQ_S_TAG_ACTIVE = 1,
|
||||
BLK_MQ_S_SCHED_RESTART = 2,
|
||||
|
||||
/* hw queue is inactive after all its CPUs become offline */
|
||||
BLK_MQ_S_INACTIVE = 3,
|
||||
|
||||
BLK_MQ_MAX_DEPTH = 10240,
|
||||
|
||||
BLK_MQ_CPU_WORK_BATCH = 8,
|
||||
|
|
|
@ -152,6 +152,7 @@ enum cpuhp_state {
|
|||
CPUHP_AP_SMPBOOT_THREADS,
|
||||
CPUHP_AP_X86_VDSO_VMA_ONLINE,
|
||||
CPUHP_AP_IRQ_AFFINITY_ONLINE,
|
||||
CPUHP_AP_BLK_MQ_ONLINE,
|
||||
CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
|
||||
CPUHP_AP_X86_INTEL_EPB_ONLINE,
|
||||
CPUHP_AP_PERF_ONLINE,
|
||||
|
|
Loading…
Reference in New Issue