Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: "This is the main pull request for block IO related changes for the 4.16 kernel. Nothing major in this pull request, but a good amount of improvements and fixes all over the map. This contains: - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and Paolo. - Support for SMR zones for deadline and mq-deadline from Damien and Christoph. - Set of fixes for bcache by way of Michael Lyle, including fixes from himself, Kent, Rui, Tang, and Coly. - Series from Matias for lightnvm with fixes from Hans Holmberg, Javier, and Matias. Mostly centered around pblk, and the removal of rrpc 1.2 in preparation for supporting 2.0. - A couple of NVMe pull requests from Christoph. Nothing major in here, just fixes and cleanups, and support for command tracing from Johannes. - Support for blk-throttle for tracking reads and writes separately. From Joseph Qi. A few cleanups/fixes also for blk-throttle from Weiping. - Series from Mike Snitzer that enables dm to register its queue more logically, something that's always been problematic on dm since it's a stacked device. - Series from Ming cleaning up some of the bio accessor use, in preparation for supporting multipage bvecs. - Various fixes from Ming closing up holes around queue mapping and quiescing. - BSD partition fix from Richard Narron, fixing a problem where we can't mount newer (10/11) FreeBSD partitions. - Series from Tejun reworking blk-mq timeout handling. The previous scheme relied on atomic bits, but it had races where we would think a request had timed out if it got reused at the wrong time. - null_blk now supports faking timeouts, to enable us to better exercise and test that functionality separately. From me. - Kill the separate atomic poll bit in the request struct. After this, we don't use the atomic bits on blk-mq anymore at all. From me. - sgl_alloc/free helpers from Bart. - Heavily contended tag case scalability improvement from me. 
- Various little fixes and cleanups from Arnd, Bart, Corentin, Douglas, Eryu, Goldwyn, and myself" * 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits) block: remove smart1,2.h nvme: add tracepoint for nvme_complete_rq nvme: add tracepoint for nvme_setup_cmd nvme-pci: introduce RECONNECTING state to mark initializing procedure nvme-rdma: remove redundant boolean for inline_data nvme: don't free uuid pointer before printing it nvme-pci: Suspend queues after deleting them bsg: use pr_debug instead of hand crafted macros blk-mq-debugfs: don't allow write on attributes with seq_operations set nvme-pci: Fix queue double allocations block: Set BIO_TRACE_COMPLETION on new bio during split blk-throttle: use queue_is_rq_based block: Remove kblockd_schedule_delayed_work{,_on}() blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly() lib/scatterlist: Fix chaining support in sgl_alloc_order() blk-throttle: track read and write request individually block: add bdev_read_only() checks to common helpers block: fail op_is_write() requests to read-only partitions blk-throttle: export io_serviced_recursive, io_service_bytes_recursive ...
This commit is contained in:
commit
0a4b6e2f80
|
@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
|
|||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
if (!entity) /* root group */
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&bfqd->lock, flags);
|
||||
|
||||
if (!entity) /* root group */
|
||||
goto put_async_queues;
|
||||
|
||||
/*
|
||||
* Empty all service_trees belonging to this group before
|
||||
* deactivating the group itself.
|
||||
|
@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
|
|||
}
|
||||
|
||||
__bfq_deactivate_entity(entity, false);
|
||||
|
||||
put_async_queues:
|
||||
bfq_put_async_queues(bfqd, bfqg);
|
||||
|
||||
spin_unlock_irqrestore(&bfqd->lock, flags);
|
||||
|
|
|
@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
|
|||
/* Default timeout values, in jiffies, approximating CFQ defaults. */
|
||||
const int bfq_timeout = HZ / 8;
|
||||
|
||||
/*
|
||||
* Time limit for merging (see comments in bfq_setup_cooperator). Set
|
||||
* to the slowest value that, in our tests, proved to be effective in
|
||||
* removing false positives, while not causing true positives to miss
|
||||
* queue merging.
|
||||
*
|
||||
* As can be deduced from the low time limit below, queue merging, if
|
||||
* successful, happens at the very beginning of the I/O of the involved
|
||||
* cooperating processes, as a consequence of the arrival of the very
|
||||
* first requests from each cooperator. After that, there is very
|
||||
* little chance to find cooperators.
|
||||
*/
|
||||
static const unsigned long bfq_merge_time_limit = HZ/10;
|
||||
|
||||
static struct kmem_cache *bfq_pool;
|
||||
|
||||
/* Below this threshold (in ns), we consider thinktime immediate. */
|
||||
|
@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;
|
|||
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
|
||||
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
|
||||
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
|
||||
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
|
||||
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
|
||||
|
||||
/* Min number of samples required to perform peak-rate update */
|
||||
#define BFQ_RATE_MIN_SAMPLES 32
|
||||
|
@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;
|
|||
* interactive applications automatically, using the following formula:
|
||||
* duration = (R / r) * T, where r is the peak rate of the device, and
|
||||
* R and T are two reference parameters.
|
||||
* In particular, R is the peak rate of the reference device (see below),
|
||||
* and T is a reference time: given the systems that are likely to be
|
||||
* installed on the reference device according to its speed class, T is
|
||||
* about the maximum time needed, under BFQ and while reading two files in
|
||||
* parallel, to load typical large applications on these systems.
|
||||
* In practice, the slower/faster the device at hand is, the more/less it
|
||||
* takes to load applications with respect to the reference device.
|
||||
* Accordingly, the longer/shorter BFQ grants weight raising to interactive
|
||||
* applications.
|
||||
* In particular, R is the peak rate of the reference device (see
|
||||
* below), and T is a reference time: given the systems that are
|
||||
* likely to be installed on the reference device according to its
|
||||
* speed class, T is about the maximum time needed, under BFQ and
|
||||
* while reading two files in parallel, to load typical large
|
||||
* applications on these systems (see the comments on
|
||||
* max_service_from_wr below, for more details on how T is obtained).
|
||||
* In practice, the slower/faster the device at hand is, the more/less
|
||||
* it takes to load applications with respect to the reference device.
|
||||
* Accordingly, the longer/shorter BFQ grants weight raising to
|
||||
* interactive applications.
|
||||
*
|
||||
* BFQ uses four different reference pairs (R, T), depending on:
|
||||
* . whether the device is rotational or non-rotational;
|
||||
|
@ -240,6 +256,60 @@ static int T_slow[2];
|
|||
static int T_fast[2];
|
||||
static int device_speed_thresh[2];
|
||||
|
||||
/*
|
||||
* BFQ uses the above-detailed, time-based weight-raising mechanism to
|
||||
* privilege interactive tasks. This mechanism is vulnerable to the
|
||||
* following false positives: I/O-bound applications that will go on
|
||||
* doing I/O for much longer than the duration of weight
|
||||
* raising. These applications have basically no benefit from being
|
||||
* weight-raised at the beginning of their I/O. On the opposite end,
|
||||
* while being weight-raised, these applications
|
||||
* a) unjustly steal throughput from applications that may actually need
|
||||
* low latency;
|
||||
* b) make BFQ uselessly perform device idling; device idling results
|
||||
* in loss of device throughput with most flash-based storage, and may
|
||||
* increase latencies when used purposelessly.
|
||||
*
|
||||
* BFQ tries to reduce these problems, by adopting the following
|
||||
* countermeasure. To introduce this countermeasure, we need first to
|
||||
* finish explaining how the duration of weight-raising for
|
||||
* interactive tasks is computed.
|
||||
*
|
||||
* For a bfq_queue deemed as interactive, the duration of weight
|
||||
* raising is dynamically adjusted, as a function of the estimated
|
||||
* peak rate of the device, so as to be equal to the time needed to
|
||||
* execute the 'largest' interactive task we benchmarked so far. By
|
||||
* largest task, we mean the task for which each involved process has
|
||||
* to do more I/O than for any of the other tasks we benchmarked. This
|
||||
* reference interactive task is the start-up of LibreOffice Writer,
|
||||
* and in this task each process/bfq_queue needs to have at most ~110K
|
||||
* sectors transferred.
|
||||
*
|
||||
* This last piece of information enables BFQ to reduce the actual
|
||||
* duration of weight-raising for at least one class of I/O-bound
|
||||
* applications: those doing sequential or quasi-sequential I/O. An
|
||||
* example is file copy. In fact, once started, the main I/O-bound
|
||||
* processes of these applications usually consume the above 110K
|
||||
* sectors in much less time than the processes of an application that
|
||||
* is starting, because these I/O-bound processes will greedily devote
|
||||
* almost all their CPU cycles only to their target,
|
||||
* throughput-friendly I/O operations. This is even more true if BFQ
|
||||
* happens to be underestimating the device peak rate, and thus
|
||||
* overestimating the duration of weight raising. But, according to
|
||||
* our measurements, once transferred 110K sectors, these processes
|
||||
* have no right to be weight-raised any longer.
|
||||
*
|
||||
* Basing on the last consideration, BFQ ends weight-raising for a
|
||||
* bfq_queue if the latter happens to have received an amount of
|
||||
* service at least equal to the following constant. The constant is
|
||||
* set to slightly more than 110K, to have a minimum safety margin.
|
||||
*
|
||||
* This early ending of weight-raising reduces the amount of time
|
||||
* during which interactive false positives cause the two problems
|
||||
* described at the beginning of these comments.
|
||||
*/
|
||||
static const unsigned long max_service_from_wr = 120000;
|
||||
|
||||
#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
|
||||
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
|
||||
|
||||
|
@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* See the comments on bfq_limit_depth for the purpose of
|
||||
* the depths set in the function.
|
||||
*/
|
||||
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
|
||||
{
|
||||
bfqd->sb_shift = bt->sb.shift;
|
||||
|
||||
/*
|
||||
* In-word depths if no bfq_queue is being weight-raised:
|
||||
* leaving 25% of tags only for sync reads.
|
||||
*
|
||||
* In next formulas, right-shift the value
|
||||
* (1U<<bfqd->sb_shift), instead of computing directly
|
||||
* (1U<<(bfqd->sb_shift - something)), to be robust against
|
||||
* any possible value of bfqd->sb_shift, without having to
|
||||
* limit 'something'.
|
||||
*/
|
||||
/* no more than 50% of tags for async I/O */
|
||||
bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
|
||||
/*
|
||||
* no more than 75% of tags for sync writes (25% extra tags
|
||||
* w.r.t. async I/O, to prevent async I/O from starving sync
|
||||
* writes)
|
||||
*/
|
||||
bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
|
||||
|
||||
/*
|
||||
* In-word depths in case some bfq_queue is being weight-
|
||||
* raised: leaving ~63% of tags for sync reads. This is the
|
||||
* highest percentage for which, in our tests, application
|
||||
* start-up times didn't suffer from any regression due to tag
|
||||
* shortage.
|
||||
*/
|
||||
/* no more than ~18% of tags for async I/O */
|
||||
bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
|
||||
/* no more than ~37% of tags for sync writes (~20% extra tags) */
|
||||
bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
|
||||
}
|
||||
|
||||
/*
|
||||
* Async I/O can easily starve sync I/O (both sync reads and sync
|
||||
* writes), by consuming all tags. Similarly, storms of sync writes,
|
||||
* such as those that sync(2) may trigger, can starve sync reads.
|
||||
* Limit depths of async I/O and sync writes so as to counter both
|
||||
* problems.
|
||||
*/
|
||||
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
|
||||
struct bfq_data *bfqd = data->q->elevator->elevator_data;
|
||||
struct sbitmap_queue *bt;
|
||||
|
||||
if (op_is_sync(op) && !op_is_write(op))
|
||||
return;
|
||||
|
||||
if (data->flags & BLK_MQ_REQ_RESERVED) {
|
||||
if (unlikely(!tags->nr_reserved_tags)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return;
|
||||
}
|
||||
bt = &tags->breserved_tags;
|
||||
} else
|
||||
bt = &tags->bitmap_tags;
|
||||
|
||||
if (unlikely(bfqd->sb_shift != bt->sb.shift))
|
||||
bfq_update_depths(bfqd, bt);
|
||||
|
||||
data->shallow_depth =
|
||||
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
|
||||
|
||||
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
|
||||
__func__, bfqd->wr_busy_queues, op_is_sync(op),
|
||||
data->shallow_depth);
|
||||
}
|
||||
|
||||
static struct bfq_queue *
|
||||
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
|
||||
sector_t sector, struct rb_node **ret_parent,
|
||||
|
@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
|
|||
return bfqq;
|
||||
}
|
||||
|
||||
static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
|
||||
{
|
||||
return bfqq->service_from_backlogged > 0 &&
|
||||
time_is_before_jiffies(bfqq->first_IO_time +
|
||||
bfq_merge_time_limit);
|
||||
}
|
||||
|
||||
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
||||
{
|
||||
struct rb_node **p, *parent;
|
||||
|
@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|||
bfqq->pos_root = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* bfqq cannot be merged any longer (see comments in
|
||||
* bfq_setup_cooperator): no point in adding bfqq into the
|
||||
* position tree.
|
||||
*/
|
||||
if (bfq_too_late_for_merging(bfqq))
|
||||
return;
|
||||
|
||||
if (bfq_class_idle(bfqq))
|
||||
return;
|
||||
if (!bfqq->next_rq)
|
||||
|
@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
|
|||
if (old_wr_coeff == 1 && wr_or_deserves_wr) {
|
||||
/* start a weight-raising period */
|
||||
if (interactive) {
|
||||
bfqq->service_from_wr = 0;
|
||||
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
|
||||
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
|
||||
} else {
|
||||
|
@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,
|
|||
rb_erase(&bfqq->pos_node, bfqq->pos_root);
|
||||
bfqq->pos_root = NULL;
|
||||
}
|
||||
} else {
|
||||
bfq_pos_tree_add_move(bfqd, bfqq);
|
||||
}
|
||||
|
||||
if (rq->cmd_flags & REQ_META)
|
||||
|
@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
|
|||
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
|
||||
struct bfq_queue *new_bfqq)
|
||||
{
|
||||
if (bfq_too_late_for_merging(new_bfqq))
|
||||
return false;
|
||||
|
||||
if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
|
||||
(bfqq->ioprio_class != new_bfqq->ioprio_class))
|
||||
return false;
|
||||
|
@ -1956,20 +2123,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this function returns true, then bfqq cannot be merged. The idea
|
||||
* is that true cooperation happens very early after processes start
|
||||
* to do I/O. Usually, late cooperations are just accidental false
|
||||
* positives. In case bfqq is weight-raised, such false positives
|
||||
* would evidently degrade latency guarantees for bfqq.
|
||||
*/
|
||||
static bool wr_from_too_long(struct bfq_queue *bfqq)
|
||||
{
|
||||
return bfqq->wr_coeff > 1 &&
|
||||
time_is_before_jiffies(bfqq->last_wr_start_finish +
|
||||
msecs_to_jiffies(100));
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to schedule a merge of bfqq with the currently in-service
|
||||
* queue or with a close queue among the scheduled queues. Return
|
||||
|
@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
|
|||
* to maintain. Besides, in such a critical condition as an out of memory,
|
||||
* the benefits of queue merging may be little relevant, or even negligible.
|
||||
*
|
||||
* Weight-raised queues can be merged only if their weight-raising
|
||||
* period has just started. In fact cooperating processes are usually
|
||||
* started together. Thus, with this filter we avoid false positives
|
||||
* that would jeopardize low-latency guarantees.
|
||||
*
|
||||
* WARNING: queue merging may impair fairness among non-weight raised
|
||||
* queues, for at least two reasons: 1) the original weight of a
|
||||
* merged queue may change during the merged state, 2) even being the
|
||||
|
@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
{
|
||||
struct bfq_queue *in_service_bfqq, *new_bfqq;
|
||||
|
||||
/*
|
||||
* Prevent bfqq from being merged if it has been created too
|
||||
* long ago. The idea is that true cooperating processes, and
|
||||
* thus their associated bfq_queues, are supposed to be
|
||||
* created shortly after each other. This is the case, e.g.,
|
||||
* for KVM/QEMU and dump I/O threads. Basing on this
|
||||
* assumption, the following filtering greatly reduces the
|
||||
* probability that two non-cooperating processes, which just
|
||||
* happen to do close I/O for some short time interval, have
|
||||
* their queues merged by mistake.
|
||||
*/
|
||||
if (bfq_too_late_for_merging(bfqq))
|
||||
return NULL;
|
||||
|
||||
if (bfqq->new_bfqq)
|
||||
return bfqq->new_bfqq;
|
||||
|
||||
if (!io_struct ||
|
||||
wr_from_too_long(bfqq) ||
|
||||
unlikely(bfqq == &bfqd->oom_bfqq))
|
||||
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
|
||||
return NULL;
|
||||
|
||||
/* If there is only one backlogged queue, don't search. */
|
||||
|
@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
|
||||
in_service_bfqq = bfqd->in_service_queue;
|
||||
|
||||
if (!in_service_bfqq || in_service_bfqq == bfqq
|
||||
|| wr_from_too_long(in_service_bfqq) ||
|
||||
unlikely(in_service_bfqq == &bfqd->oom_bfqq))
|
||||
goto check_scheduled;
|
||||
|
||||
if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
|
||||
if (in_service_bfqq && in_service_bfqq != bfqq &&
|
||||
likely(in_service_bfqq != &bfqd->oom_bfqq) &&
|
||||
bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
|
||||
bfqq->entity.parent == in_service_bfqq->entity.parent &&
|
||||
bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
|
||||
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
|
||||
|
@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
* queues. The only thing we need is that the bio/request is not
|
||||
* NULL, as we need it to establish whether a cooperator exists.
|
||||
*/
|
||||
check_scheduled:
|
||||
new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
|
||||
bfq_io_struct_pos(io_struct, request));
|
||||
|
||||
if (new_bfqq && !wr_from_too_long(new_bfqq) &&
|
||||
likely(new_bfqq != &bfqd->oom_bfqq) &&
|
||||
if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
|
||||
bfq_may_be_close_cooperator(bfqq, new_bfqq))
|
||||
return bfq_setup_merge(bfqq, new_bfqq);
|
||||
|
||||
|
@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
|
|||
bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
|
||||
bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
|
||||
if (unlikely(bfq_bfqq_just_created(bfqq) &&
|
||||
!bfq_bfqq_in_large_burst(bfqq))) {
|
||||
!bfq_bfqq_in_large_burst(bfqq) &&
|
||||
bfqq->bfqd->low_latency)) {
|
||||
/*
|
||||
* bfqq being merged right after being created: bfqq
|
||||
* would have deserved interactive weight raising, but
|
||||
|
@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
* whereas soft_rt_next_start is set to infinity for applications that do
|
||||
* not.
|
||||
*
|
||||
* Unfortunately, even a greedy application may happen to behave in an
|
||||
* isochronous way if the CPU load is high. In fact, the application may
|
||||
* stop issuing requests while the CPUs are busy serving other processes,
|
||||
* then restart, then stop again for a while, and so on. In addition, if
|
||||
* the disk achieves a low enough throughput with the request pattern
|
||||
* issued by the application (e.g., because the request pattern is random
|
||||
* and/or the device is slow), then the application may meet the above
|
||||
* bandwidth requirement too. To prevent such a greedy application to be
|
||||
* deemed as soft real-time, a further rule is used in the computation of
|
||||
* soft_rt_next_start: soft_rt_next_start must be higher than the current
|
||||
* time plus the maximum time for which the arrival of a request is waited
|
||||
* for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
|
||||
* This filters out greedy applications, as the latter issue instead their
|
||||
* next request as soon as possible after the last one has been completed
|
||||
* (in contrast, when a batch of requests is completed, a soft real-time
|
||||
* application spends some time processing data).
|
||||
* Unfortunately, even a greedy (i.e., I/O-bound) application may
|
||||
* happen to meet, occasionally or systematically, both the above
|
||||
* bandwidth and isochrony requirements. This may happen at least in
|
||||
* the following circumstances. First, if the CPU load is high. The
|
||||
* application may stop issuing requests while the CPUs are busy
|
||||
* serving other processes, then restart, then stop again for a while,
|
||||
* and so on. The other circumstances are related to the storage
|
||||
* device: the storage device is highly loaded or reaches a low-enough
|
||||
* throughput with the I/O of the application (e.g., because the I/O
|
||||
* is random and/or the device is slow). In all these cases, the
|
||||
* I/O of the application may be simply slowed down enough to meet
|
||||
* the bandwidth and isochrony requirements. To reduce the probability
|
||||
* that greedy applications are deemed as soft real-time in these
|
||||
* corner cases, a further rule is used in the computation of
|
||||
* soft_rt_next_start: the return value of this function is forced to
|
||||
* be higher than the maximum between the following two quantities.
|
||||
*
|
||||
* Unfortunately, the last filter may easily generate false positives if
|
||||
* only bfqd->bfq_slice_idle is used as a reference time interval and one
|
||||
* or both the following cases occur:
|
||||
* 1) HZ is so low that the duration of a jiffy is comparable to or higher
|
||||
* than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
|
||||
* HZ=100.
|
||||
* (a) Current time plus: (1) the maximum time for which the arrival
|
||||
* of a request is waited for when a sync queue becomes idle,
|
||||
* namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
|
||||
* postpone for a moment the reason for adding a few extra
|
||||
* jiffies; we get back to it after next item (b). Lower-bounding
|
||||
* the return value of this function with the current time plus
|
||||
* bfqd->bfq_slice_idle tends to filter out greedy applications,
|
||||
* because the latter issue their next request as soon as possible
|
||||
* after the last one has been completed. In contrast, a soft
|
||||
* real-time application spends some time processing data, after a
|
||||
* batch of its requests has been completed.
|
||||
*
|
||||
* (b) Current value of bfqq->soft_rt_next_start. As pointed out
|
||||
* above, greedy applications may happen to meet both the
|
||||
* bandwidth and isochrony requirements under heavy CPU or
|
||||
* storage-device load. In more detail, in these scenarios, these
|
||||
* applications happen, only for limited time periods, to do I/O
|
||||
* slowly enough to meet all the requirements described so far,
|
||||
* including the filtering in above item (a). These slow-speed
|
||||
* time intervals are usually interspersed between other time
|
||||
* intervals during which these applications do I/O at a very high
|
||||
* speed. Fortunately, exactly because of the high speed of the
|
||||
* I/O in the high-speed intervals, the values returned by this
|
||||
* function happen to be so high, near the end of any such
|
||||
* high-speed interval, to be likely to fall *after* the end of
|
||||
* the low-speed time interval that follows. These high values are
|
||||
* stored in bfqq->soft_rt_next_start after each invocation of
|
||||
* this function. As a consequence, if the last value of
|
||||
* bfqq->soft_rt_next_start is constantly used to lower-bound the
|
||||
* next value that this function may return, then, from the very
|
||||
* beginning of a low-speed interval, bfqq->soft_rt_next_start is
|
||||
* likely to be constantly kept so high that any I/O request
|
||||
* issued during the low-speed interval is considered as arriving
|
||||
* too soon for the application to be deemed as soft
|
||||
* real-time. Then, in the high-speed interval that follows, the
|
||||
* application will not be deemed as soft real-time, just because
|
||||
* it will do I/O at a high speed. And so on.
|
||||
*
|
||||
* Getting back to the filtering in item (a), in the following two
|
||||
* cases this filtering might be easily passed by a greedy
|
||||
* application, if the reference quantity was just
|
||||
* bfqd->bfq_slice_idle:
|
||||
* 1) HZ is so low that the duration of a jiffy is comparable to or
|
||||
* higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
|
||||
* devices with HZ=100. The time granularity may be so coarse
|
||||
* that the approximation, in jiffies, of bfqd->bfq_slice_idle
|
||||
* is rather lower than the exact value.
|
||||
* 2) jiffies, instead of increasing at a constant rate, may stop increasing
|
||||
* for a while, then suddenly 'jump' by several units to recover the lost
|
||||
* increments. This seems to happen, e.g., inside virtual machines.
|
||||
* To address this issue, we do not use as a reference time interval just
|
||||
* bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
|
||||
* particular we add the minimum number of jiffies for which the filter
|
||||
* seems to be quite precise also in embedded systems and KVM/QEMU virtual
|
||||
* machines.
|
||||
* To address this issue, in the filtering in (a) we do not use as a
|
||||
* reference time interval just bfqd->bfq_slice_idle, but
|
||||
* bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
|
||||
* minimum number of jiffies for which the filter seems to be quite
|
||||
* precise also in embedded systems and KVM/QEMU virtual machines.
|
||||
*/
|
||||
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq)
|
||||
{
|
||||
return max(bfqq->last_idle_bklogged +
|
||||
HZ * bfqq->service_from_backlogged /
|
||||
bfqd->bfq_wr_max_softrt_rate,
|
||||
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
|
||||
return max3(bfqq->soft_rt_next_start,
|
||||
bfqq->last_idle_bklogged +
|
||||
HZ * bfqq->service_from_backlogged /
|
||||
bfqd->bfq_wr_max_softrt_rate,
|
||||
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2999,17 +3197,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
|
|||
*/
|
||||
slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
|
||||
|
||||
/*
|
||||
* Increase service_from_backlogged before next statement,
|
||||
* because the possible next invocation of
|
||||
* bfq_bfqq_charge_time would likely inflate
|
||||
* entity->service. In contrast, service_from_backlogged must
|
||||
* contain real service, to enable the soft real-time
|
||||
* heuristic to correctly compute the bandwidth consumed by
|
||||
* bfqq.
|
||||
*/
|
||||
bfqq->service_from_backlogged += entity->service;
|
||||
|
||||
/*
|
||||
* As above explained, charge slow (typically seeky) and
|
||||
* timed-out queues with the time and not the service
|
||||
|
@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
|||
bfqq->entity.prio_changed = 1;
|
||||
}
|
||||
}
|
||||
if (bfqq->wr_coeff > 1 &&
|
||||
bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
|
||||
bfqq->service_from_wr > max_service_from_wr) {
|
||||
/* see comments on max_service_from_wr */
|
||||
bfq_bfqq_end_wr(bfqq);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* To improve latency (for this or other queues), immediately
|
||||
|
@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
}
|
||||
|
||||
/*
|
||||
* We exploit the put_rq_private hook to decrement
|
||||
* rq_in_driver, but put_rq_private will not be
|
||||
* We exploit the bfq_finish_request hook to decrement
|
||||
* rq_in_driver, but bfq_finish_request will not be
|
||||
* invoked on this request. So, to avoid unbalance,
|
||||
* just start this request, without incrementing
|
||||
* rq_in_driver. As a negative consequence,
|
||||
|
@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
* bfq_schedule_dispatch to be invoked uselessly.
|
||||
*
|
||||
* As for implementing an exact solution, the
|
||||
* put_request hook, if defined, is probably invoked
|
||||
* also on this request. So, by exploiting this hook,
|
||||
* we could 1) increment rq_in_driver here, and 2)
|
||||
* decrement it in put_request. Such a solution would
|
||||
* let the value of the counter be always accurate,
|
||||
* but it would entail using an extra interface
|
||||
* function. This cost seems higher than the benefit,
|
||||
* being the frequency of non-elevator-private
|
||||
* bfq_finish_request hook, if defined, is probably
|
||||
* invoked also on this request. So, by exploiting
|
||||
* this hook, we could 1) increment rq_in_driver here,
|
||||
* and 2) decrement it in bfq_finish_request. Such a
|
||||
* solution would let the value of the counter be
|
||||
* always accurate, but it would entail using an extra
|
||||
* interface function. This cost seems higher than the
|
||||
* benefit, being the frequency of non-elevator-private
|
||||
* requests very low.
|
||||
*/
|
||||
goto start_rq;
|
||||
|
@ -3689,35 +3882,16 @@ exit:
|
|||
return rq;
|
||||
}
|
||||
|
||||
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
static void bfq_update_dispatch_stats(struct request_queue *q,
|
||||
struct request *rq,
|
||||
struct bfq_queue *in_serv_queue,
|
||||
bool idle_timer_disabled)
|
||||
{
|
||||
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
|
||||
struct request *rq;
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
struct bfq_queue *in_serv_queue, *bfqq;
|
||||
bool waiting_rq, idle_timer_disabled;
|
||||
#endif
|
||||
struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
in_serv_queue = bfqd->in_service_queue;
|
||||
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
|
||||
|
||||
rq = __bfq_dispatch_request(hctx);
|
||||
|
||||
idle_timer_disabled =
|
||||
waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
|
||||
|
||||
#else
|
||||
rq = __bfq_dispatch_request(hctx);
|
||||
#endif
|
||||
spin_unlock_irq(&bfqd->lock);
|
||||
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
bfqq = rq ? RQ_BFQQ(rq) : NULL;
|
||||
if (!idle_timer_disabled && !bfqq)
|
||||
return rq;
|
||||
return;
|
||||
|
||||
/*
|
||||
* rq and bfqq are guaranteed to exist until this function
|
||||
|
@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
* In addition, the following queue lock guarantees that
|
||||
* bfqq_group(bfqq) exists as well.
|
||||
*/
|
||||
spin_lock_irq(hctx->queue->queue_lock);
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (idle_timer_disabled)
|
||||
/*
|
||||
* Since the idle timer has been disabled,
|
||||
|
@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
bfqg_stats_set_start_empty_time(bfqg);
|
||||
bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
|
||||
}
|
||||
spin_unlock_irq(hctx->queue->queue_lock);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
#else
|
||||
/*
 * Empty stand-in compiled in the #else configuration: it accepts the same
 * four arguments as the real function and does nothing.
 */
static inline void bfq_update_dispatch_stats(struct request_queue *q,
|
||||
struct request *rq,
|
||||
struct bfq_queue *in_serv_queue,
|
||||
bool idle_timer_disabled) {}
|
||||
#endif
|
||||
|
||||
/*
 * Select the next request to dispatch and update the dispatch statistics.
 *
 * With bfqd->lock held, this records whether there is an in-service queue
 * for which bfq_bfqq_wait_request() is true, calls __bfq_dispatch_request(),
 * and then re-evaluates bfq_bfqq_wait_request() on that same queue:
 * idle_timer_disabled ends up true only if the predicate held before the
 * call and no longer holds after it. The lock is released before
 * bfq_update_dispatch_stats() is invoked with these values.
 *
 * Returns the request returned by __bfq_dispatch_request().
 */
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
|
||||
struct request *rq;
|
||||
struct bfq_queue *in_serv_queue;
|
||||
bool waiting_rq, idle_timer_disabled;
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
|
||||
/* Snapshot taken before dispatching; compared with the state afterwards. */
in_serv_queue = bfqd->in_service_queue;
|
||||
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
|
||||
|
||||
rq = __bfq_dispatch_request(hctx);
|
||||
|
||||
/* waiting_rq being true implies in_serv_queue is non-NULL here. */
idle_timer_disabled =
|
||||
waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
|
||||
|
||||
spin_unlock_irq(&bfqd->lock);
|
||||
|
||||
/* Called only after bfqd->lock has been dropped. */
bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
|
||||
idle_timer_disabled);
|
||||
|
||||
return rq;
|
||||
}
|
||||
|
||||
|
@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
bfqq->split_time = bfq_smallest_from_now();
|
||||
|
||||
/*
|
||||
* Set to the value for which bfqq will not be deemed as
|
||||
* soft rt when it becomes backlogged.
|
||||
* To not forget the possibly high bandwidth consumed by a
|
||||
* process/queue in the recent past,
|
||||
* bfq_bfqq_softrt_next_start() returns a value at least equal
|
||||
* to the current value of bfqq->soft_rt_next_start (see
|
||||
* comments on bfq_bfqq_softrt_next_start). Set
|
||||
* soft_rt_next_start to now, to mean that bfqq has consumed
|
||||
* no bandwidth so far.
|
||||
*/
|
||||
bfqq->soft_rt_next_start = bfq_greatest_from_now();
|
||||
bfqq->soft_rt_next_start = jiffies;
|
||||
|
||||
/* first request is almost certainly seeky */
|
||||
bfqq->seek_history = 1;
|
||||
|
@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
|
|||
return idle_timer_disabled;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
/*
 * Update the statistics of bfqq's group for an inserted request.
 *
 * Does nothing when @bfqq is NULL. Otherwise, under q->queue_lock, it calls
 * bfqg_stats_update_io_add() on bfqq_group(bfqq) with @cmd_flags and, if
 * @idle_timer_disabled is true, also bfqg_stats_update_idle_time().
 */
static void bfq_update_insert_stats(struct request_queue *q,
|
||||
struct bfq_queue *bfqq,
|
||||
bool idle_timer_disabled,
|
||||
unsigned int cmd_flags)
|
||||
{
|
||||
if (!bfqq)
|
||||
return;
|
||||
|
||||
/*
|
||||
* bfqq still exists, because it can disappear only after
|
||||
* either it is merged with another queue, or the process it
|
||||
* is associated with exits. But both actions must be taken by
|
||||
* the same process currently executing this flow of
|
||||
* instructions.
|
||||
*
|
||||
* In addition, the following queue lock guarantees that
|
||||
* bfqq_group(bfqq) exists as well.
|
||||
*/
|
||||
spin_lock_irq(q->queue_lock);
|
||||
bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
|
||||
if (idle_timer_disabled)
|
||||
bfqg_stats_update_idle_time(bfqq_group(bfqq));
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
#else
|
||||
/*
 * Empty stand-in compiled in the #else configuration: it accepts the same
 * four arguments as the real function and does nothing.
 */
static inline void bfq_update_insert_stats(struct request_queue *q,
|
||||
struct bfq_queue *bfqq,
|
||||
bool idle_timer_disabled,
|
||||
unsigned int cmd_flags) {}
|
||||
#endif
|
||||
|
||||
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
||||
bool at_head)
|
||||
{
|
||||
struct request_queue *q = hctx->queue;
|
||||
struct bfq_data *bfqd = q->elevator->elevator_data;
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
struct bfq_queue *bfqq = RQ_BFQQ(rq);
|
||||
bool idle_timer_disabled = false;
|
||||
unsigned int cmd_flags;
|
||||
#endif
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
if (blk_mq_sched_try_insert_merge(q, rq)) {
|
||||
|
@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
|||
else
|
||||
list_add_tail(&rq->queuelist, &bfqd->dispatch);
|
||||
} else {
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
|
||||
/*
|
||||
* Update bfqq, because, if a queue merge has occurred
|
||||
|
@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
|||
* redirected into a new queue.
|
||||
*/
|
||||
bfqq = RQ_BFQQ(rq);
|
||||
#else
|
||||
__bfq_insert_request(bfqd, rq);
|
||||
#endif
|
||||
|
||||
if (rq_mergeable(rq)) {
|
||||
elv_rqhash_add(q, rq);
|
||||
|
@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
|||
}
|
||||
}
|
||||
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
/*
|
||||
* Cache cmd_flags before releasing scheduler lock, because rq
|
||||
* may disappear afterwards (for example, because of a request
|
||||
* merge).
|
||||
*/
|
||||
cmd_flags = rq->cmd_flags;
|
||||
#endif
|
||||
|
||||
spin_unlock_irq(&bfqd->lock);
|
||||
|
||||
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
|
||||
if (!bfqq)
|
||||
return;
|
||||
/*
|
||||
* bfqq still exists, because it can disappear only after
|
||||
* either it is merged with another queue, or the process it
|
||||
* is associated with exits. But both actions must be taken by
|
||||
* the same process currently executing this flow of
|
||||
* instruction.
|
||||
*
|
||||
* In addition, the following queue lock guarantees that
|
||||
* bfqq_group(bfqq) exists as well.
|
||||
*/
|
||||
spin_lock_irq(q->queue_lock);
|
||||
bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
|
||||
if (idle_timer_disabled)
|
||||
bfqg_stats_update_idle_time(bfqq_group(bfqq));
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
#endif
|
||||
bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
|
||||
cmd_flags);
|
||||
}
|
||||
|
||||
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
|
||||
|
@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
|||
bfq_schedule_dispatch(bfqd);
|
||||
}
|
||||
|
||||
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
|
||||
static void bfq_finish_request_body(struct bfq_queue *bfqq)
|
||||
{
|
||||
bfqq->allocated--;
|
||||
|
||||
|
@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)
|
|||
spin_lock_irqsave(&bfqd->lock, flags);
|
||||
|
||||
bfq_completed_request(bfqq, bfqd);
|
||||
bfq_put_rq_priv_body(bfqq);
|
||||
bfq_finish_request_body(bfqq);
|
||||
|
||||
spin_unlock_irqrestore(&bfqd->lock, flags);
|
||||
} else {
|
||||
|
@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)
|
|||
bfqg_stats_update_io_remove(bfqq_group(bfqq),
|
||||
rq->cmd_flags);
|
||||
}
|
||||
bfq_put_rq_priv_body(bfqq);
|
||||
bfq_finish_request_body(bfqq);
|
||||
}
|
||||
|
||||
rq->elv.priv[0] = NULL;
|
||||
|
@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
|
|||
hrtimer_cancel(&bfqd->idle_slice_timer);
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
/* release oom-queue reference to root group */
|
||||
bfqg_and_blkg_put(bfqd->root_group);
|
||||
|
||||
blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
|
||||
#else
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
|
@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
|
|||
|
||||
static struct elevator_type iosched_bfq_mq = {
|
||||
.ops.mq = {
|
||||
.limit_depth = bfq_limit_depth,
|
||||
.prepare_request = bfq_prepare_request,
|
||||
.finish_request = bfq_finish_request,
|
||||
.exit_icq = bfq_exit_icq,
|
||||
|
|
|
@ -337,6 +337,11 @@ struct bfq_queue {
|
|||
* last transition from idle to backlogged.
|
||||
*/
|
||||
unsigned long service_from_backlogged;
|
||||
/*
|
||||
* Cumulative service received from the @bfq_queue since its
|
||||
* last transition to weight-raised state.
|
||||
*/
|
||||
unsigned long service_from_wr;
|
||||
|
||||
/*
|
||||
* Value of wr start time when switching to soft rt
|
||||
|
@ -344,6 +349,8 @@ struct bfq_queue {
|
|||
unsigned long wr_start_at_switch_to_srt;
|
||||
|
||||
unsigned long split_time; /* time of last split */
|
||||
|
||||
unsigned long first_IO_time; /* time of first I/O for this queue */
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -627,6 +634,18 @@ struct bfq_data {
|
|||
struct bfq_io_cq *bio_bic;
|
||||
/* bfqq associated with the task issuing current bio for merging */
|
||||
struct bfq_queue *bio_bfqq;
|
||||
|
||||
/*
|
||||
* Cached sbitmap shift, used to compute depth limits in
|
||||
* bfq_update_depths.
|
||||
*/
|
||||
unsigned int sb_shift;
|
||||
|
||||
/*
|
||||
* Depth limits used in bfq_limit_depth (see comments on the
|
||||
* function)
|
||||
*/
|
||||
unsigned int word_depths[2][2];
|
||||
};
|
||||
|
||||
enum bfqq_state_flags {
|
||||
|
|
|
@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
|
|||
struct bfq_entity *entity = &bfqq->entity;
|
||||
struct bfq_service_tree *st;
|
||||
|
||||
if (!bfqq->service_from_backlogged)
|
||||
bfqq->first_IO_time = jiffies;
|
||||
|
||||
if (bfqq->wr_coeff > 1)
|
||||
bfqq->service_from_wr += served;
|
||||
|
||||
bfqq->service_from_backlogged += served;
|
||||
for_each_entity(entity) {
|
||||
st = bfq_entity_service_tree(entity);
|
||||
|
||||
|
|
|
@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
|
|||
/**
|
||||
* __bio_integrity_endio - Integrity I/O completion function
|
||||
* @bio: Protected bio
|
||||
* @error: Pointer to errno
|
||||
*
|
||||
* Description: Completion for integrity I/O
|
||||
*
|
||||
|
|
30
block/bio.c
30
block/bio.c
|
@ -970,34 +970,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
|
|||
}
|
||||
EXPORT_SYMBOL(bio_advance);
|
||||
|
||||
/**
|
||||
* bio_alloc_pages - allocates a single page for each bvec in a bio
|
||||
* @bio: bio to allocate pages for
|
||||
* @gfp_mask: flags for allocation
|
||||
*
|
||||
* Allocates pages up to @bio->bi_vcnt.
|
||||
*
|
||||
* Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
|
||||
* freed.
|
||||
*/
|
||||
int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
|
||||
{
|
||||
int i;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bio_for_each_segment_all(bv, bio, i) {
|
||||
bv->bv_page = alloc_page(gfp_mask);
|
||||
if (!bv->bv_page) {
|
||||
while (--bv >= bio->bi_io_vec)
|
||||
__free_page(bv->bv_page);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(bio_alloc_pages);
|
||||
|
||||
/**
|
||||
* bio_copy_data - copy contents of data buffers from one chain of bios to
|
||||
* another
|
||||
|
@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
|
|||
bio_advance(bio, split->bi_iter.bi_size);
|
||||
|
||||
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
|
||||
bio_set_flag(bio, BIO_TRACE_COMPLETION);
|
||||
bio_set_flag(split, BIO_TRACE_COMPLETION);
|
||||
|
||||
return split;
|
||||
}
|
||||
|
|
|
@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
|
|||
rq->start_time = jiffies;
|
||||
set_start_time_ns(rq);
|
||||
rq->part = NULL;
|
||||
seqcount_init(&rq->gstate_seq);
|
||||
u64_stats_init(&rq->aborted_gstate_sync);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_rq_init);
|
||||
|
||||
|
@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
|
|||
queue_flag_set(QUEUE_FLAG_DEAD, q);
|
||||
spin_unlock_irq(lock);
|
||||
|
||||
/*
|
||||
* Make sure all in-progress dispatches have completed, because
|
||||
* blk_freeze_queue() can only complete all requests, and
|
||||
* dispatch may still be in progress since we dispatch requests
|
||||
* from more than one context.
|
||||
*/
|
||||
if (q->mq_ops)
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
/* for synchronous bio-based driver finish in-flight integrity i/o */
|
||||
blk_flush_integrity();
|
||||
|
||||
|
@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
|
|||
|
||||
lockdep_assert_held(q->queue_lock);
|
||||
|
||||
blk_req_zone_write_unlock(req);
|
||||
blk_pm_put_request(req);
|
||||
|
||||
elv_completed_request(q, req);
|
||||
|
@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
|
|||
|
||||
#endif /* CONFIG_FAIL_MAKE_REQUEST */
|
||||
|
||||
/*
 * Detect a write bio aimed at a partition whose policy field is set.
 *
 * Returns true, after logging a KERN_ERR message that names the device and
 * the partition number, when @part->policy is non-zero and op_is_write()
 * holds for the bio's operation. Returns false in every other case. The
 * logged message describes such a target as a read-only block device.
 */
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
|
||||
{
|
||||
if (part->policy && op_is_write(bio_op(bio))) {
|
||||
/* Scratch buffer that bio_devname() fills with the device name. */
char b[BDEVNAME_SIZE];
|
||||
|
||||
printk(KERN_ERR
|
||||
"generic_make_request: Trying to write "
|
||||
"to read-only block-device %s (partno %d)\n",
|
||||
bio_devname(bio, b), part->partno);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remap block n of partition p to block n+start(p) of the disk.
|
||||
*/
|
||||
|
@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
|
|||
struct hd_struct *p;
|
||||
int ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
|
||||
if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
|
||||
bio_check_ro(bio, p))) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Zone reset does not include bi_size so bio_sectors() is always 0.
|
||||
* Include a test for the reset op code and perform the remap if needed.
|
||||
*/
|
||||
if (!bio->bi_partno ||
|
||||
(!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
|
||||
return 0;
|
||||
if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
|
||||
goto out;
|
||||
|
||||
rcu_read_lock();
|
||||
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
|
||||
if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
|
||||
bio->bi_iter.bi_sector += p->start_sect;
|
||||
bio->bi_partno = 0;
|
||||
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
|
||||
bio->bi_iter.bi_sector - p->start_sect);
|
||||
} else {
|
||||
printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
|
||||
ret = -EIO;
|
||||
}
|
||||
bio->bi_iter.bi_sector += p->start_sect;
|
||||
bio->bi_partno = 0;
|
||||
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
|
||||
bio->bi_iter.bi_sector - p->start_sect);
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
|
|||
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
|
||||
* if queue is not a request based queue.
|
||||
*/
|
||||
|
||||
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
|
||||
goto not_supported;
|
||||
|
||||
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
|
||||
goto end_io;
|
||||
|
||||
if (blk_partition_remap(bio))
|
||||
goto end_io;
|
||||
if (!bio->bi_partno) {
|
||||
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
|
||||
goto end_io;
|
||||
} else {
|
||||
if (blk_partition_remap(bio))
|
||||
goto end_io;
|
||||
}
|
||||
|
||||
if (bio_check_eod(bio, nr_sectors))
|
||||
goto end_io;
|
||||
|
@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
|
|||
* bypass a potential scheduler on the bottom device for
|
||||
* insert.
|
||||
*/
|
||||
blk_mq_request_bypass_insert(rq, true);
|
||||
return BLK_STS_OK;
|
||||
return blk_mq_request_issue_directly(rq);
|
||||
}
|
||||
|
||||
spin_lock_irqsave(q->queue_lock, flags);
|
||||
|
@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
|
|||
wbt_issue(req->q->rq_wb, &req->issue_stat);
|
||||
}
|
||||
|
||||
BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
|
||||
BUG_ON(blk_rq_is_complete(req));
|
||||
blk_add_timer(req);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_start_request);
|
||||
|
@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
|
|||
}
|
||||
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
|
||||
|
||||
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
|
||||
unsigned long delay)
|
||||
{
|
||||
return queue_delayed_work(kblockd_workqueue, dwork, delay);
|
||||
}
|
||||
EXPORT_SYMBOL(kblockd_schedule_delayed_work);
|
||||
|
||||
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
|
||||
unsigned long delay)
|
||||
{
|
||||
return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
|
||||
}
|
||||
EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
|
||||
|
||||
/**
|
||||
* blk_start_plug - initialize blk_plug and track it inside the task_struct
|
||||
* @plug: The &struct blk_plug that needs to be initialized
|
||||
|
|
|
@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
|
|||
* be reused after dying flag is set
|
||||
*/
|
||||
if (q->mq_ops) {
|
||||
blk_mq_sched_insert_request(rq, at_head, true, false, false);
|
||||
blk_mq_sched_insert_request(rq, at_head, true, false);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
|||
if (!q)
|
||||
return -ENXIO;
|
||||
|
||||
if (bdev_read_only(bdev))
|
||||
return -EPERM;
|
||||
|
||||
if (flags & BLKDEV_DISCARD_SECURE) {
|
||||
if (!blk_queue_secure_erase(q))
|
||||
return -EOPNOTSUPP;
|
||||
|
@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
|
|||
if (!q)
|
||||
return -ENXIO;
|
||||
|
||||
if (bdev_read_only(bdev))
|
||||
return -EPERM;
|
||||
|
||||
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
|
||||
if ((sector | nr_sects) & bs_mask)
|
||||
return -EINVAL;
|
||||
|
@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
|
|||
if (!q)
|
||||
return -ENXIO;
|
||||
|
||||
if (bdev_read_only(bdev))
|
||||
return -EPERM;
|
||||
|
||||
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
|
||||
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
|
||||
|
||||
|
@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
|
|||
if (!q)
|
||||
return -ENXIO;
|
||||
|
||||
if (bdev_read_only(bdev))
|
||||
return -EPERM;
|
||||
|
||||
while (nr_sects != 0) {
|
||||
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
|
||||
gfp_mask);
|
||||
|
|
|
@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
|
|||
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
|
||||
struct bio *bio = NULL;
|
||||
struct iov_iter i;
|
||||
int ret;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!iter_is_iovec(iter))
|
||||
goto fail;
|
||||
|
@ -148,7 +148,7 @@ unmap_rq:
|
|||
__blk_rq_unmap_user(bio);
|
||||
fail:
|
||||
rq->bio = NULL;
|
||||
return -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_rq_map_user_iov);
|
||||
|
||||
|
|
|
@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
|||
nsegs++;
|
||||
sectors = max_sectors;
|
||||
}
|
||||
if (sectors)
|
||||
goto split;
|
||||
/* Make this single bvec as the 1st segment */
|
||||
goto split;
|
||||
}
|
||||
|
||||
if (bvprvp && blk_queue_cluster(q)) {
|
||||
|
@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
|||
bvprvp = &bvprv;
|
||||
sectors += bv.bv_len >> 9;
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
continue;
|
||||
}
|
||||
new_segment:
|
||||
if (nsegs == queue_max_segments(q))
|
||||
goto split;
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
|
||||
nsegs++;
|
||||
bvprv = bv;
|
||||
bvprvp = &bvprv;
|
||||
seg_size = bv.bv_len;
|
||||
sectors += bv.bv_len >> 9;
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
}
|
||||
|
||||
do_split = false;
|
||||
|
@ -174,6 +171,8 @@ split:
|
|||
bio = new;
|
||||
}
|
||||
|
||||
if (nsegs == 1 && seg_size > front_seg_size)
|
||||
front_seg_size = seg_size;
|
||||
bio->bi_seg_front_size = front_seg_size;
|
||||
if (seg_size > bio->bi_seg_back_size)
|
||||
bio->bi_seg_back_size = seg_size;
|
||||
|
|
|
@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
|
|||
RQF_NAME(HASHED),
|
||||
RQF_NAME(STATS),
|
||||
RQF_NAME(SPECIAL_PAYLOAD),
|
||||
RQF_NAME(ZONE_WRITE_LOCKED),
|
||||
RQF_NAME(MQ_TIMEOUT_EXPIRED),
|
||||
RQF_NAME(MQ_POLL_SLEPT),
|
||||
};
|
||||
#undef RQF_NAME
|
||||
|
||||
#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
|
||||
static const char *const rqaf_name[] = {
|
||||
RQAF_NAME(COMPLETE),
|
||||
RQAF_NAME(STARTED),
|
||||
RQAF_NAME(POLL_SLEPT),
|
||||
};
|
||||
#undef RQAF_NAME
|
||||
|
||||
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
|
||||
{
|
||||
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
|
||||
|
@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
|
|||
seq_puts(m, ", .rq_flags=");
|
||||
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
|
||||
ARRAY_SIZE(rqf_name));
|
||||
seq_puts(m, ", .atomic_flags=");
|
||||
blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
|
||||
seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
|
||||
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
|
||||
rq->internal_tag);
|
||||
if (mq_ops->show_rq)
|
||||
|
@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
|
|||
const struct show_busy_params *params = data;
|
||||
|
||||
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
|
||||
test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
|
||||
blk_mq_rq_state(rq) != MQ_RQ_IDLE)
|
||||
__blk_mq_debugfs_rq_show(params->m,
|
||||
list_entry_rq(&rq->queuelist));
|
||||
}
|
||||
|
@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
|
|||
const struct blk_mq_debugfs_attr *attr = m->private;
|
||||
void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
|
||||
|
||||
if (!attr->write)
|
||||
/*
|
||||
* Attributes that only implement .seq_ops are read-only and 'attr' is
|
||||
* the same as 'data' in this case.
|
||||
*/
|
||||
if (attr == data || !attr->write)
|
||||
return -EPERM;
|
||||
|
||||
return attr->write(data, buf, count, ppos);
|
||||
|
|
|
@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
|
|||
WRITE_ONCE(hctx->dispatch_from, ctx);
|
||||
}
|
||||
|
||||
/* return true if hw queue need to be run again */
|
||||
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
@ -428,7 +427,7 @@ done:
|
|||
}
|
||||
|
||||
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
|
||||
bool run_queue, bool async, bool can_block)
|
||||
bool run_queue, bool async)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
|
|
@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
|
|||
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
|
||||
bool run_queue, bool async, bool can_block);
|
||||
bool run_queue, bool async);
|
||||
void blk_mq_sched_insert_requests(struct request_queue *q,
|
||||
struct blk_mq_ctx *ctx,
|
||||
struct list_head *list, bool run_queue_async);
|
||||
|
|
|
@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
||||
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
|||
q->mq_sysfs_init_done = false;
|
||||
}
|
||||
|
||||
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
||||
{
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
__blk_mq_unregister_dev(dev, q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
}
|
||||
|
||||
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
|
||||
|
|
|
@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
|||
ws = bt_wait_ptr(bt, data->hctx);
|
||||
drop_ctx = data->ctx == NULL;
|
||||
do {
|
||||
prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
|
||||
tag = __blk_mq_get_tag(data, bt);
|
||||
if (tag != -1)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We're out of tags on this hardware queue, kick any
|
||||
* pending IO submits before going to sleep waiting for
|
||||
|
@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
|||
if (tag != -1)
|
||||
break;
|
||||
|
||||
prepare_to_wait_exclusive(&ws->wait, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
tag = __blk_mq_get_tag(data, bt);
|
||||
if (tag != -1)
|
||||
break;
|
||||
|
||||
if (data->ctx)
|
||||
blk_mq_put_ctx(data->ctx);
|
||||
|
||||
|
|
669
block/blk-mq.c
669
block/blk-mq.c
File diff suppressed because it is too large
Load Diff
|
@ -27,6 +27,20 @@ struct blk_mq_ctx {
|
|||
struct kobject kobj;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/*
|
||||
* Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
|
||||
* and the upper bits the generation number.
|
||||
*/
|
||||
enum mq_rq_state {
|
||||
/* State values; each one fits in the low MQ_RQ_STATE_BITS bits. */
MQ_RQ_IDLE = 0,
|
||||
MQ_RQ_IN_FLIGHT = 1,
|
||||
MQ_RQ_COMPLETE = 2,
|
||||
|
||||
/*
 * Field layout: with MQ_RQ_STATE_BITS == 2, MQ_RQ_STATE_MASK evaluates
 * to 3 and MQ_RQ_GEN_INC to 4, i.e. the lowest bit above the state field.
 */
MQ_RQ_STATE_BITS = 2,
|
||||
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
|
||||
MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
|
||||
};
|
||||
|
||||
void blk_mq_freeze_queue(struct request_queue *q);
|
||||
void blk_mq_free_queue(struct request_queue *q);
|
||||
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
|
||||
|
@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
|
|||
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
|
||||
struct list_head *list);
|
||||
|
||||
/* Used by blk_insert_cloned_request() to issue request directly */
|
||||
blk_status_t blk_mq_request_issue_directly(struct request *rq);
|
||||
|
||||
/*
|
||||
* CPU -> queue mappings
|
||||
*/
|
||||
|
@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
|
|||
extern void blk_mq_sysfs_unregister(struct request_queue *q);
|
||||
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
|
||||
|
||||
void blk_mq_release(struct request_queue *q);
|
||||
|
||||
/**
|
||||
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
|
||||
* @rq: target request.
*
* Return: @rq->gstate, read once with READ_ONCE(), masked with
* MQ_RQ_STATE_MASK so that only the state bits remain.
|
||||
*/
|
||||
static inline int blk_mq_rq_state(struct request *rq)
|
||||
{
|
||||
return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
|
||||
* @rq: target request.
|
||||
* @state: new state to set.
|
||||
*
|
||||
* Set @rq's state to @state. The caller is responsible for ensuring that
|
||||
* there are no other updaters. A request can transition into IN_FLIGHT
|
||||
* only from IDLE and doing so increments the generation number.
*
* The IDLE precondition is only checked with WARN_ON_ONCE(); the new value
* is stored with a single WRITE_ONCE() whether or not the check fires.
|
||||
*/
|
||||
static inline void blk_mq_rq_update_state(struct request *rq,
|
||||
enum mq_rq_state state)
|
||||
{
|
||||
u64 old_val = READ_ONCE(rq->gstate);
|
||||
/* Replace the state bits; all bits above the state mask are kept. */
u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
|
||||
|
||||
if (state == MQ_RQ_IN_FLIGHT) {
|
||||
WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
|
||||
/* Entering IN_FLIGHT also adds MQ_RQ_GEN_INC to the stored value. */
new_val += MQ_RQ_GEN_INC;
|
||||
}
|
||||
|
||||
/* avoid exposing interim values */
|
||||
WRITE_ONCE(rq->gstate, new_val);
|
||||
}
|
||||
|
||||
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
|
||||
unsigned int cpu)
|
||||
{
|
||||
|
|
|
@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
|
|||
.release = blk_release_queue,
|
||||
};
|
||||
|
||||
/**
|
||||
* blk_register_queue - register a block layer queue with sysfs
|
||||
* @disk: Disk of which the request queue should be registered with sysfs.
|
||||
*/
|
||||
int blk_register_queue(struct gendisk *disk)
|
||||
{
|
||||
int ret;
|
||||
|
@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
|
|||
if (q->request_fn || (q->mq_ops && q->elevator)) {
|
||||
ret = elv_register_queue(q);
|
||||
if (ret) {
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
kobject_uevent(&q->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(dev);
|
||||
kobject_put(&dev->kobj);
|
||||
goto unlock;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
ret = 0;
|
||||
|
@ -921,7 +926,15 @@ unlock:
|
|||
mutex_unlock(&q->sysfs_lock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_register_queue);
|
||||
|
||||
/**
|
||||
* blk_unregister_queue - counterpart of blk_register_queue()
|
||||
* @disk: Disk of which the request queue should be unregistered from sysfs.
|
||||
*
|
||||
* Note: the caller is responsible for guaranteeing that this function is called
|
||||
* after blk_register_queue() has finished.
|
||||
*/
|
||||
void blk_unregister_queue(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
|
@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
|
|||
if (WARN_ON(!q))
|
||||
return;
|
||||
|
||||
/* Return early if disk->queue was never registered. */
|
||||
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Since sysfs_remove_dir() prevents adding new directory entries
|
||||
* before removal of existing entries starts, protect against
|
||||
* concurrent elv_iosched_store() calls.
|
||||
*/
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
wbt_exit(q);
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
|
||||
/*
|
||||
* Remove the sysfs attributes before unregistering the queue data
|
||||
* structures that can be modified through sysfs.
|
||||
*/
|
||||
if (q->mq_ops)
|
||||
blk_mq_unregister_dev(disk_to_dev(disk), q);
|
||||
|
||||
if (q->request_fn || (q->mq_ops && q->elevator))
|
||||
elv_unregister_queue(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
kobject_uevent(&q->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(disk_to_dev(disk));
|
||||
|
||||
wbt_exit(q);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
if (q->request_fn || (q->mq_ops && q->elevator))
|
||||
elv_unregister_queue(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
kobject_put(&disk_to_dev(disk)->kobj);
|
||||
}
|
||||
|
|
|
@ -216,9 +216,9 @@ struct throtl_data
|
|||
|
||||
unsigned int scale;
|
||||
|
||||
struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
|
||||
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
|
||||
struct latency_bucket __percpu *latency_buckets;
|
||||
struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
|
||||
struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
|
||||
struct latency_bucket __percpu *latency_buckets[2];
|
||||
unsigned long last_calculate_time;
|
||||
unsigned long filtered_latency;
|
||||
|
||||
|
@ -1510,11 +1510,21 @@ static struct cftype throtl_legacy_files[] = {
|
|||
.private = (unsigned long)&blkcg_policy_throtl,
|
||||
.seq_show = blkg_print_stat_bytes,
|
||||
},
|
||||
{
|
||||
.name = "throttle.io_service_bytes_recursive",
|
||||
.private = (unsigned long)&blkcg_policy_throtl,
|
||||
.seq_show = blkg_print_stat_bytes_recursive,
|
||||
},
|
||||
{
|
||||
.name = "throttle.io_serviced",
|
||||
.private = (unsigned long)&blkcg_policy_throtl,
|
||||
.seq_show = blkg_print_stat_ios,
|
||||
},
|
||||
{
|
||||
.name = "throttle.io_serviced_recursive",
|
||||
.private = (unsigned long)&blkcg_policy_throtl,
|
||||
.seq_show = blkg_print_stat_ios_recursive,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
|
@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
|
|||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
static void throtl_update_latency_buckets(struct throtl_data *td)
|
||||
{
|
||||
struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
|
||||
int i, cpu;
|
||||
unsigned long last_latency = 0;
|
||||
unsigned long latency;
|
||||
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
|
||||
int i, cpu, rw;
|
||||
unsigned long last_latency[2] = { 0 };
|
||||
unsigned long latency[2];
|
||||
|
||||
if (!blk_queue_nonrot(td->queue))
|
||||
return;
|
||||
|
@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
|
|||
td->last_calculate_time = jiffies;
|
||||
|
||||
memset(avg_latency, 0, sizeof(avg_latency));
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
|
||||
struct latency_bucket *tmp = &td->tmp_buckets[i];
|
||||
for (rw = READ; rw <= WRITE; rw++) {
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
|
||||
struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct latency_bucket *bucket;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct latency_bucket *bucket;
|
||||
|
||||
/* this isn't race free, but ok in practice */
|
||||
bucket = per_cpu_ptr(td->latency_buckets, cpu);
|
||||
tmp->total_latency += bucket[i].total_latency;
|
||||
tmp->samples += bucket[i].samples;
|
||||
bucket[i].total_latency = 0;
|
||||
bucket[i].samples = 0;
|
||||
}
|
||||
/* this isn't race free, but ok in practice */
|
||||
bucket = per_cpu_ptr(td->latency_buckets[rw],
|
||||
cpu);
|
||||
tmp->total_latency += bucket[i].total_latency;
|
||||
tmp->samples += bucket[i].samples;
|
||||
bucket[i].total_latency = 0;
|
||||
bucket[i].samples = 0;
|
||||
}
|
||||
|
||||
if (tmp->samples >= 32) {
|
||||
int samples = tmp->samples;
|
||||
if (tmp->samples >= 32) {
|
||||
int samples = tmp->samples;
|
||||
|
||||
latency = tmp->total_latency;
|
||||
latency[rw] = tmp->total_latency;
|
||||
|
||||
tmp->total_latency = 0;
|
||||
tmp->samples = 0;
|
||||
latency /= samples;
|
||||
if (latency == 0)
|
||||
continue;
|
||||
avg_latency[i].latency = latency;
|
||||
tmp->total_latency = 0;
|
||||
tmp->samples = 0;
|
||||
latency[rw] /= samples;
|
||||
if (latency[rw] == 0)
|
||||
continue;
|
||||
avg_latency[rw][i].latency = latency[rw];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
|
||||
if (!avg_latency[i].latency) {
|
||||
if (td->avg_buckets[i].latency < last_latency)
|
||||
td->avg_buckets[i].latency = last_latency;
|
||||
continue;
|
||||
for (rw = READ; rw <= WRITE; rw++) {
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
|
||||
if (!avg_latency[rw][i].latency) {
|
||||
if (td->avg_buckets[rw][i].latency < last_latency[rw])
|
||||
td->avg_buckets[rw][i].latency =
|
||||
last_latency[rw];
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!td->avg_buckets[rw][i].valid)
|
||||
latency[rw] = avg_latency[rw][i].latency;
|
||||
else
|
||||
latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
|
||||
avg_latency[rw][i].latency) >> 3;
|
||||
|
||||
td->avg_buckets[rw][i].latency = max(latency[rw],
|
||||
last_latency[rw]);
|
||||
td->avg_buckets[rw][i].valid = true;
|
||||
last_latency[rw] = td->avg_buckets[rw][i].latency;
|
||||
}
|
||||
|
||||
if (!td->avg_buckets[i].valid)
|
||||
latency = avg_latency[i].latency;
|
||||
else
|
||||
latency = (td->avg_buckets[i].latency * 7 +
|
||||
avg_latency[i].latency) >> 3;
|
||||
|
||||
td->avg_buckets[i].latency = max(latency, last_latency);
|
||||
td->avg_buckets[i].valid = true;
|
||||
last_latency = td->avg_buckets[i].latency;
|
||||
}
|
||||
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
|
||||
throtl_log(&td->service_queue,
|
||||
"Latency bucket %d: latency=%ld, valid=%d", i,
|
||||
td->avg_buckets[i].latency, td->avg_buckets[i].valid);
|
||||
"Latency bucket %d: read latency=%ld, read valid=%d, "
|
||||
"write latency=%ld, write valid=%d", i,
|
||||
td->avg_buckets[READ][i].latency,
|
||||
td->avg_buckets[READ][i].valid,
|
||||
td->avg_buckets[WRITE][i].latency,
|
||||
td->avg_buckets[WRITE][i].valid);
|
||||
}
|
||||
#else
|
||||
static inline void throtl_update_latency_buckets(struct throtl_data *td)
|
||||
|
@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
|
|||
struct latency_bucket *latency;
|
||||
int index;
|
||||
|
||||
if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
|
||||
if (!td || td->limit_index != LIMIT_LOW ||
|
||||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
|
||||
!blk_queue_nonrot(td->queue))
|
||||
return;
|
||||
|
||||
index = request_bucket_index(size);
|
||||
|
||||
latency = get_cpu_ptr(td->latency_buckets);
|
||||
latency = get_cpu_ptr(td->latency_buckets[op]);
|
||||
latency[index].total_latency += time;
|
||||
latency[index].samples++;
|
||||
put_cpu_ptr(td->latency_buckets);
|
||||
put_cpu_ptr(td->latency_buckets[op]);
|
||||
}
|
||||
|
||||
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
|
||||
|
@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
|
|||
unsigned long finish_time;
|
||||
unsigned long start_time;
|
||||
unsigned long lat;
|
||||
int rw = bio_data_dir(bio);
|
||||
|
||||
tg = bio->bi_cg_private;
|
||||
if (!tg)
|
||||
|
@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
|
|||
|
||||
bucket = request_bucket_index(
|
||||
blk_stat_size(&bio->bi_issue_stat));
|
||||
threshold = tg->td->avg_buckets[bucket].latency +
|
||||
threshold = tg->td->avg_buckets[rw][bucket].latency +
|
||||
tg->latency_target;
|
||||
if (lat > threshold)
|
||||
tg->bad_bio_cnt++;
|
||||
|
@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
|
|||
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
|
||||
if (!td)
|
||||
return -ENOMEM;
|
||||
td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
|
||||
td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
|
||||
LATENCY_BUCKET_SIZE, __alignof__(u64));
|
||||
if (!td->latency_buckets) {
|
||||
if (!td->latency_buckets[READ]) {
|
||||
kfree(td);
|
||||
return -ENOMEM;
|
||||
}
|
||||
td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
|
||||
LATENCY_BUCKET_SIZE, __alignof__(u64));
|
||||
if (!td->latency_buckets[WRITE]) {
|
||||
free_percpu(td->latency_buckets[READ]);
|
||||
kfree(td);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
|
|||
/* activate policy */
|
||||
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
|
||||
if (ret) {
|
||||
free_percpu(td->latency_buckets);
|
||||
free_percpu(td->latency_buckets[READ]);
|
||||
free_percpu(td->latency_buckets[WRITE]);
|
||||
kfree(td);
|
||||
}
|
||||
return ret;
|
||||
|
@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
|
|||
BUG_ON(!q->td);
|
||||
throtl_shutdown_wq(q);
|
||||
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
|
||||
free_percpu(q->td->latency_buckets);
|
||||
free_percpu(q->td->latency_buckets[READ]);
|
||||
free_percpu(q->td->latency_buckets[WRITE]);
|
||||
kfree(q->td);
|
||||
}
|
||||
|
||||
|
@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
|
|||
} else {
|
||||
td->throtl_slice = DFL_THROTL_SLICE_HD;
|
||||
td->filtered_latency = LATENCY_FILTERED_HD;
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
|
||||
td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
|
||||
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
|
||||
td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
|
||||
td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
|
||||
}
|
||||
}
|
||||
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
/* if no low limit, use previous default */
|
||||
td->throtl_slice = DFL_THROTL_SLICE_HD;
|
||||
#endif
|
||||
|
||||
td->track_bio_latency = !q->mq_ops && !q->request_fn;
|
||||
td->track_bio_latency = !queue_is_rq_based(q);
|
||||
if (!td->track_bio_latency)
|
||||
blk_stat_enable_accounting(q);
|
||||
}
|
||||
|
|
|
@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
|
|||
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
|
||||
unsigned int *next_set)
|
||||
{
|
||||
if (time_after_eq(jiffies, rq->deadline)) {
|
||||
const unsigned long deadline = blk_rq_deadline(rq);
|
||||
|
||||
if (time_after_eq(jiffies, deadline)) {
|
||||
list_del_init(&rq->timeout_list);
|
||||
|
||||
/*
|
||||
|
@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
|
|||
*/
|
||||
if (!blk_mark_rq_complete(rq))
|
||||
blk_rq_timed_out(rq);
|
||||
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
|
||||
*next_timeout = rq->deadline;
|
||||
} else if (!*next_set || time_after(*next_timeout, deadline)) {
|
||||
*next_timeout = deadline;
|
||||
*next_set = 1;
|
||||
}
|
||||
}
|
||||
|
@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
|
|||
*/
|
||||
void blk_abort_request(struct request *req)
|
||||
{
|
||||
if (blk_mark_rq_complete(req))
|
||||
return;
|
||||
|
||||
if (req->q->mq_ops) {
|
||||
blk_mq_rq_timed_out(req, false);
|
||||
/*
|
||||
* All we need to ensure is that timeout scan takes place
|
||||
* immediately and that scan sees the new timeout value.
|
||||
* No need for fancy synchronizations.
|
||||
*/
|
||||
blk_rq_set_deadline(req, jiffies);
|
||||
mod_timer(&req->q->timeout, 0);
|
||||
} else {
|
||||
if (blk_mark_rq_complete(req))
|
||||
return;
|
||||
blk_delete_timer(req);
|
||||
blk_rq_timed_out(req);
|
||||
}
|
||||
|
@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
|
|||
if (!req->timeout)
|
||||
req->timeout = q->rq_timeout;
|
||||
|
||||
WRITE_ONCE(req->deadline, jiffies + req->timeout);
|
||||
blk_rq_set_deadline(req, jiffies + req->timeout);
|
||||
req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
|
||||
|
||||
/*
|
||||
* Only the non-mq case needs to add the request to a protected list.
|
||||
|
@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)
|
|||
* than an existing one, modify the timer. Round up to next nearest
|
||||
* second.
|
||||
*/
|
||||
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
|
||||
expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
|
||||
|
||||
if (!timer_pending(&q->timeout) ||
|
||||
time_before(expiry, q->timeout.expires)) {
|
||||
|
|
|
@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
|
|||
return sector & ~zone_mask;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if a request is a write request that needs zone write locking.
|
||||
*/
|
||||
bool blk_req_needs_zone_write_lock(struct request *rq)
|
||||
{
|
||||
if (!rq->q->seq_zones_wlock)
|
||||
return false;
|
||||
|
||||
if (blk_rq_is_passthrough(rq))
|
||||
return false;
|
||||
|
||||
switch (req_op(rq)) {
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_WRITE:
|
||||
return blk_rq_zone_is_seq(rq);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
|
||||
|
||||
void __blk_req_zone_write_lock(struct request *rq)
|
||||
{
|
||||
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
|
||||
rq->q->seq_zones_wlock)))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
|
||||
rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
|
||||
|
||||
void __blk_req_zone_write_unlock(struct request *rq)
|
||||
{
|
||||
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
|
||||
if (rq->q->seq_zones_wlock)
|
||||
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
|
||||
rq->q->seq_zones_wlock));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
|
||||
|
||||
/*
|
||||
* Check that a zone report belongs to the partition.
|
||||
* If yes, fix its start sector and write pointer, copy it in the
|
||||
|
|
46
block/blk.h
46
block/blk.h
|
@ -119,34 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io);
|
|||
void blk_account_io_completion(struct request *req, unsigned int bytes);
|
||||
void blk_account_io_done(struct request *req);
|
||||
|
||||
/*
|
||||
* Internal atomic flags for request handling
|
||||
*/
|
||||
enum rq_atomic_flags {
|
||||
/*
|
||||
* Keep these two bits first - not because we depend on the
|
||||
* value of them, but we do depend on them being in the same
|
||||
* byte of storage to ensure ordering on writes. Keeping them
|
||||
* first will achieve that nicely.
|
||||
*/
|
||||
REQ_ATOM_COMPLETE = 0,
|
||||
REQ_ATOM_STARTED,
|
||||
|
||||
REQ_ATOM_POLL_SLEPT,
|
||||
};
|
||||
|
||||
/*
|
||||
* EH timer and IO completion will both attempt to 'grab' the request, make
|
||||
* sure that only one of them succeeds
|
||||
* sure that only one of them succeeds. Steal the bottom bit of the
|
||||
* __deadline field for this.
|
||||
*/
|
||||
static inline int blk_mark_rq_complete(struct request *rq)
|
||||
{
|
||||
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
|
||||
return test_and_set_bit(0, &rq->__deadline);
|
||||
}
|
||||
|
||||
static inline void blk_clear_rq_complete(struct request *rq)
|
||||
{
|
||||
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
|
||||
clear_bit(0, &rq->__deadline);
|
||||
}
|
||||
|
||||
static inline bool blk_rq_is_complete(struct request *rq)
|
||||
{
|
||||
return test_bit(0, &rq->__deadline);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
|
|||
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
|
||||
}
|
||||
|
||||
int elv_register_queue(struct request_queue *q);
|
||||
void elv_unregister_queue(struct request_queue *q);
|
||||
|
||||
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
|
||||
|
||||
#ifdef CONFIG_FAIL_IO_TIMEOUT
|
||||
|
@ -245,6 +238,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
|
|||
q->last_merge = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Steal a bit from this field for legacy IO path atomic IO marking. Note that
|
||||
* setting the deadline clears the bottom bit, potentially clearing the
|
||||
* completed bit. The user has to be OK with this (current ones are fine).
|
||||
*/
|
||||
static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
|
||||
{
|
||||
rq->__deadline = time & ~0x1UL;
|
||||
}
|
||||
|
||||
static inline unsigned long blk_rq_deadline(struct request *rq)
|
||||
{
|
||||
return rq->__deadline & ~0x1UL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal io_context interface
|
||||
*/
|
||||
|
|
|
@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
|
|||
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
|
||||
{
|
||||
unsigned char *vfrom;
|
||||
struct bio_vec tovec, *fromvec = from->bi_io_vec;
|
||||
struct bio_vec tovec, fromvec;
|
||||
struct bvec_iter iter;
|
||||
/*
|
||||
* The bio of @from is created by bounce, so we can iterate
|
||||
* its bvec from start to end, but the @from->bi_iter can't be
|
||||
* trusted because it might be changed by splitting.
|
||||
*/
|
||||
struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
|
||||
|
||||
bio_for_each_segment(tovec, to, iter) {
|
||||
if (tovec.bv_page != fromvec->bv_page) {
|
||||
fromvec = bio_iter_iovec(from, from_iter);
|
||||
if (tovec.bv_page != fromvec.bv_page) {
|
||||
/*
|
||||
* fromvec->bv_offset and fromvec->bv_len might have
|
||||
* been modified by the block layer, so use the original
|
||||
* copy, bounce_copy_vec already uses tovec->bv_len
|
||||
*/
|
||||
vfrom = page_address(fromvec->bv_page) +
|
||||
vfrom = page_address(fromvec.bv_page) +
|
||||
tovec.bv_offset;
|
||||
|
||||
bounce_copy_vec(&tovec, vfrom);
|
||||
flush_dcache_page(tovec.bv_page);
|
||||
}
|
||||
|
||||
fromvec++;
|
||||
bio_advance_iter(from, &from_iter, tovec.bv_len);
|
||||
}
|
||||
}
|
||||
|
||||
static void bounce_end_io(struct bio *bio, mempool_t *pool)
|
||||
{
|
||||
struct bio *bio_orig = bio->bi_private;
|
||||
struct bio_vec *bvec, *org_vec;
|
||||
struct bio_vec *bvec, orig_vec;
|
||||
int i;
|
||||
int start = bio_orig->bi_iter.bi_idx;
|
||||
struct bvec_iter orig_iter = bio_orig->bi_iter;
|
||||
|
||||
/*
|
||||
* free up bounce indirect pages used
|
||||
*/
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
org_vec = bio_orig->bi_io_vec + i + start;
|
||||
|
||||
if (bvec->bv_page == org_vec->bv_page)
|
||||
continue;
|
||||
|
||||
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
|
||||
mempool_free(bvec->bv_page, pool);
|
||||
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
|
||||
if (bvec->bv_page != orig_vec.bv_page) {
|
||||
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
|
||||
mempool_free(bvec->bv_page, pool);
|
||||
}
|
||||
bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
|
||||
}
|
||||
|
||||
bio_orig->bi_status = bio->bi_status;
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
|
||||
/**
|
||||
* bsg_teardown_job - routine to teardown a bsg job
|
||||
* @job: bsg_job that is to be torn down
|
||||
* @kref: kref inside bsg_job that is to be torn down
|
||||
*/
|
||||
static void bsg_teardown_job(struct kref *kref)
|
||||
{
|
||||
|
@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
|
|||
* @name: device to give bsg device
|
||||
* @job_fn: bsg job handler
|
||||
* @dd_job_size: size of LLD data needed for each job
|
||||
* @release: @dev release function
|
||||
*/
|
||||
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
|
||||
bsg_job_fn *job_fn, int dd_job_size,
|
||||
|
|
40
block/bsg.c
40
block/bsg.c
|
@ -32,6 +32,9 @@
|
|||
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
|
||||
#define BSG_VERSION "0.4"
|
||||
|
||||
#define bsg_dbg(bd, fmt, ...) \
|
||||
pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
|
||||
|
||||
struct bsg_device {
|
||||
struct request_queue *queue;
|
||||
spinlock_t lock;
|
||||
|
@ -55,14 +58,6 @@ enum {
|
|||
#define BSG_DEFAULT_CMDS 64
|
||||
#define BSG_MAX_DEVS 32768
|
||||
|
||||
#undef BSG_DEBUG
|
||||
|
||||
#ifdef BSG_DEBUG
|
||||
#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
|
||||
#else
|
||||
#define dprintk(fmt, args...)
|
||||
#endif
|
||||
|
||||
static DEFINE_MUTEX(bsg_mutex);
|
||||
static DEFINE_IDR(bsg_minor_idr);
|
||||
|
||||
|
@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
|
|||
|
||||
bc->bd = bd;
|
||||
INIT_LIST_HEAD(&bc->list);
|
||||
dprintk("%s: returning free cmd %p\n", bd->name, bc);
|
||||
bsg_dbg(bd, "returning free cmd %p\n", bc);
|
||||
return bc;
|
||||
out:
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
|
|||
if (!bcd->class_dev)
|
||||
return ERR_PTR(-ENXIO);
|
||||
|
||||
dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
|
||||
bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
|
||||
(unsigned long long) hdr->dout_xferp,
|
||||
hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
|
||||
hdr->din_xfer_len);
|
||||
|
||||
|
@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
|
|||
struct bsg_device *bd = bc->bd;
|
||||
unsigned long flags;
|
||||
|
||||
dprintk("%s: finished rq %p bc %p, bio %p\n",
|
||||
bd->name, rq, bc, bc->bio);
|
||||
bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
|
||||
rq, bc, bc->bio);
|
||||
|
||||
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
|
||||
|
||||
|
@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
|
|||
list_add_tail(&bc->list, &bd->busy_list);
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
|
||||
bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
|
||||
|
||||
rq->end_io_data = bc;
|
||||
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
|
||||
|
@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
|
|||
}
|
||||
} while (1);
|
||||
|
||||
dprintk("%s: returning done %p\n", bd->name, bc);
|
||||
bsg_dbg(bd, "returning done %p\n", bc);
|
||||
|
||||
return bc;
|
||||
}
|
||||
|
@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
|
|||
struct scsi_request *req = scsi_req(rq);
|
||||
int ret = 0;
|
||||
|
||||
dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
|
||||
pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
|
||||
/*
|
||||
* fill in all the output members
|
||||
*/
|
||||
|
@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
|
|||
struct bsg_command *bc;
|
||||
int ret, tret;
|
||||
|
||||
dprintk("%s: entered\n", bd->name);
|
||||
bsg_dbg(bd, "entered\n");
|
||||
|
||||
/*
|
||||
* wait for all commands to complete
|
||||
|
@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
|
|||
int ret;
|
||||
ssize_t bytes_read;
|
||||
|
||||
dprintk("%s: read %zd bytes\n", bd->name, count);
|
||||
bsg_dbg(bd, "read %zd bytes\n", count);
|
||||
|
||||
bsg_set_block(bd, file);
|
||||
|
||||
|
@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
|
|||
ssize_t bytes_written;
|
||||
int ret;
|
||||
|
||||
dprintk("%s: write %zd bytes\n", bd->name, count);
|
||||
bsg_dbg(bd, "write %zd bytes\n", count);
|
||||
|
||||
if (unlikely(uaccess_kernel()))
|
||||
return -EINVAL;
|
||||
|
@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
|
|||
if (!bytes_written || err_block_err(ret))
|
||||
bytes_written = ret;
|
||||
|
||||
dprintk("%s: returning %zd\n", bd->name, bytes_written);
|
||||
bsg_dbg(bd, "returning %zd\n", bytes_written);
|
||||
return bytes_written;
|
||||
}
|
||||
|
||||
|
@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
|
|||
hlist_del(&bd->dev_list);
|
||||
mutex_unlock(&bsg_mutex);
|
||||
|
||||
dprintk("%s: tearing down\n", bd->name);
|
||||
bsg_dbg(bd, "tearing down\n");
|
||||
|
||||
/*
|
||||
* close can always block
|
||||
|
@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
|
|||
struct file *file)
|
||||
{
|
||||
struct bsg_device *bd;
|
||||
#ifdef BSG_DEBUG
|
||||
unsigned char buf[32];
|
||||
#endif
|
||||
|
||||
if (!blk_queue_scsi_passthrough(rq)) {
|
||||
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
|
||||
|
@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
|
|||
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
|
||||
|
||||
strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
|
||||
dprintk("bound to <%s>, max queue %d\n",
|
||||
bsg_dbg(bd, "bound to <%s>, max queue %d\n",
|
||||
format_dev_t(buf, inode->i_rdev), bd->max_queue);
|
||||
|
||||
mutex_unlock(&bsg_mutex);
|
||||
|
|
|
@ -50,8 +50,6 @@ struct deadline_data {
|
|||
int front_merges;
|
||||
};
|
||||
|
||||
static void deadline_move_request(struct deadline_data *, struct request *);
|
||||
|
||||
static inline struct rb_root *
|
||||
deadline_rb_root(struct deadline_data *dd, struct request *rq)
|
||||
{
|
||||
|
@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
|
|||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
const int data_dir = rq_data_dir(rq);
|
||||
|
||||
/*
|
||||
* This may be a requeue of a write request that has locked its
|
||||
* target zone. If it is the case, this releases the zone lock.
|
||||
*/
|
||||
blk_req_zone_write_unlock(rq);
|
||||
|
||||
deadline_add_rq_rb(dd, rq);
|
||||
|
||||
/*
|
||||
|
@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
|
|||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
||||
/*
|
||||
* For a zoned block device, write requests must write lock their
|
||||
* target zone.
|
||||
*/
|
||||
blk_req_zone_write_lock(rq);
|
||||
|
||||
deadline_remove_request(q, rq);
|
||||
elv_dispatch_add_tail(q, rq);
|
||||
}
|
||||
|
@ -230,6 +240,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* For the specified data direction, return the next request to dispatch using
|
||||
* arrival ordered lists.
|
||||
*/
|
||||
static struct request *
|
||||
deadline_fifo_request(struct deadline_data *dd, int data_dir)
|
||||
{
|
||||
struct request *rq;
|
||||
|
||||
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
||||
return NULL;
|
||||
|
||||
if (list_empty(&dd->fifo_list[data_dir]))
|
||||
return NULL;
|
||||
|
||||
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
||||
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
||||
return rq;
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
|
||||
*/
|
||||
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
return rq;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* For the specified data direction, return the next request to dispatch using
|
||||
* sector position sorted lists.
|
||||
*/
|
||||
static struct request *
|
||||
deadline_next_request(struct deadline_data *dd, int data_dir)
|
||||
{
|
||||
struct request *rq;
|
||||
|
||||
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
||||
return NULL;
|
||||
|
||||
rq = dd->next_rq[data_dir];
|
||||
if (!rq)
|
||||
return NULL;
|
||||
|
||||
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
||||
return rq;
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
|
||||
*/
|
||||
while (rq) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
return rq;
|
||||
rq = deadline_latter_request(rq);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* deadline_dispatch_requests selects the best request according to
|
||||
* read/write expire, fifo_batch, etc
|
||||
|
@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
|
|||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
const int reads = !list_empty(&dd->fifo_list[READ]);
|
||||
const int writes = !list_empty(&dd->fifo_list[WRITE]);
|
||||
struct request *rq;
|
||||
struct request *rq, *next_rq;
|
||||
int data_dir;
|
||||
|
||||
/*
|
||||
* batches are currently reads XOR writes
|
||||
*/
|
||||
if (dd->next_rq[WRITE])
|
||||
rq = dd->next_rq[WRITE];
|
||||
else
|
||||
rq = dd->next_rq[READ];
|
||||
rq = deadline_next_request(dd, WRITE);
|
||||
if (!rq)
|
||||
rq = deadline_next_request(dd, READ);
|
||||
|
||||
if (rq && dd->batching < dd->fifo_batch)
|
||||
/* we have a next request and are still entitled to batch */
|
||||
|
@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
|
|||
if (reads) {
|
||||
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
|
||||
|
||||
if (writes && (dd->starved++ >= dd->writes_starved))
|
||||
if (deadline_fifo_request(dd, WRITE) &&
|
||||
(dd->starved++ >= dd->writes_starved))
|
||||
goto dispatch_writes;
|
||||
|
||||
data_dir = READ;
|
||||
|
@ -291,21 +364,29 @@ dispatch_find_request:
|
|||
/*
|
||||
* we are not running a batch, find best request for selected data_dir
|
||||
*/
|
||||
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
|
||||
next_rq = deadline_next_request(dd, data_dir);
|
||||
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
|
||||
/*
|
||||
* A deadline has expired, the last request was in the other
|
||||
* direction, or we have run out of higher-sectored requests.
|
||||
* Start again from the request with the earliest expiry time.
|
||||
*/
|
||||
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
||||
rq = deadline_fifo_request(dd, data_dir);
|
||||
} else {
|
||||
/*
|
||||
* The last req was the same dir and we have a next request in
|
||||
* sort order. No expired requests so continue on from here.
|
||||
*/
|
||||
rq = dd->next_rq[data_dir];
|
||||
rq = next_rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* For a zoned block device, if we only have writes queued and none of
|
||||
* them can be dispatched, rq will be NULL.
|
||||
*/
|
||||
if (!rq)
|
||||
return 0;
|
||||
|
||||
dd->batching = 0;
|
||||
|
||||
dispatch_request:
|
||||
|
@ -318,6 +399,16 @@ dispatch_request:
|
|||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* For zoned block devices, write unlock the target zone of completed
|
||||
* write requests.
|
||||
*/
|
||||
static void
|
||||
deadline_completed_request(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
blk_req_zone_write_unlock(rq);
|
||||
}
|
||||
|
||||
static void deadline_exit_queue(struct elevator_queue *e)
|
||||
{
|
||||
struct deadline_data *dd = e->elevator_data;
|
||||
|
@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
|
|||
.elevator_merged_fn = deadline_merged_request,
|
||||
.elevator_merge_req_fn = deadline_merged_requests,
|
||||
.elevator_dispatch_fn = deadline_dispatch_requests,
|
||||
.elevator_completed_req_fn = deadline_completed_request,
|
||||
.elevator_add_req_fn = deadline_add_request,
|
||||
.elevator_former_req_fn = elv_rb_former_request,
|
||||
.elevator_latter_req_fn = elv_rb_latter_request,
|
||||
|
|
|
@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
|
|||
struct elevator_queue *e = q->elevator;
|
||||
int error;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
|
||||
if (!error) {
|
||||
struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
|
@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
|
|||
}
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL(elv_register_queue);
|
||||
|
||||
void elv_unregister_queue(struct request_queue *q)
|
||||
{
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (q) {
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
|
@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
|
|||
wbt_enable_default(q);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(elv_unregister_queue);
|
||||
|
||||
int elv_register(struct elevator_type *e)
|
||||
{
|
||||
|
@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
|
|||
{
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
if (q->elevator) {
|
||||
if (q->elevator->registered)
|
||||
|
@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
|
|||
blk_add_trace_msg(q, "elv switch: none");
|
||||
|
||||
out:
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
return ret;
|
||||
}
|
||||
|
@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
|
|||
bool old_registered = false;
|
||||
int err;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (q->mq_ops)
|
||||
return elevator_switch_mq(q, new_e);
|
||||
|
||||
|
|
|
@ -629,16 +629,18 @@ exit:
|
|||
}
|
||||
|
||||
/**
|
||||
* device_add_disk - add partitioning information to kernel list
|
||||
* __device_add_disk - add disk information to kernel list
|
||||
* @parent: parent device for the disk
|
||||
* @disk: per-device partitioning information
|
||||
* @register_queue: register the queue if set to true
|
||||
*
|
||||
* This function registers the partitioning information in @disk
|
||||
* with the kernel.
|
||||
*
|
||||
* FIXME: error handling
|
||||
*/
|
||||
void device_add_disk(struct device *parent, struct gendisk *disk)
|
||||
static void __device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
bool register_queue)
|
||||
{
|
||||
dev_t devt;
|
||||
int retval;
|
||||
|
@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
|
|||
exact_match, exact_lock, disk);
|
||||
}
|
||||
register_disk(parent, disk);
|
||||
blk_register_queue(disk);
|
||||
if (register_queue)
|
||||
blk_register_queue(disk);
|
||||
|
||||
/*
|
||||
* Take an extra ref on queue which will be put on disk_release()
|
||||
|
@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
|
|||
disk_add_events(disk);
|
||||
blk_integrity_add(disk);
|
||||
}
|
||||
|
||||
/**
* device_add_disk - add a disk, asking for its queue to be registered
* @parent: parent device for the disk
* @disk: per-device partitioning information
*
* Thin wrapper: forwards to __device_add_disk() with register_queue
* set to true.
*/
void device_add_disk(struct device *parent, struct gendisk *disk)
|
||||
{
|
||||
__device_add_disk(parent, disk, true);
|
||||
}
|
||||
EXPORT_SYMBOL(device_add_disk);
|
||||
|
||||
/**
* device_add_disk_no_queue_reg - add a disk without queue registration
* @parent: parent device for the disk
* @disk: per-device partitioning information
*
* Thin wrapper: forwards to __device_add_disk() with register_queue
* set to false.
* NOTE(review): callers presumably register the queue themselves at a
* later point - confirm against the users of this export.
*/
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
|
||||
{
|
||||
__device_add_disk(parent, disk, false);
|
||||
}
|
||||
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
|
||||
|
||||
void del_gendisk(struct gendisk *disk)
|
||||
{
|
||||
struct disk_part_iter piter;
|
||||
|
@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
|
|||
* Unregister bdi before releasing device numbers (as they can
|
||||
* get reused and we'd get clashes in sysfs).
|
||||
*/
|
||||
bdi_unregister(disk->queue->backing_dev_info);
|
||||
if (!(disk->flags & GENHD_FL_HIDDEN))
|
||||
bdi_unregister(disk->queue->backing_dev_info);
|
||||
blk_unregister_queue(disk);
|
||||
} else {
|
||||
WARN_ON(1);
|
||||
|
|
|
@ -59,6 +59,7 @@ struct deadline_data {
|
|||
int front_merges;
|
||||
|
||||
spinlock_t lock;
|
||||
spinlock_t zone_lock;
|
||||
struct list_head dispatch;
|
||||
};
|
||||
|
||||
|
@ -191,14 +192,84 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* For the specified data direction, return the next request to
|
||||
* dispatch using arrival ordered lists.
*
* @dd: scheduler data holding the per-direction FIFO lists
* @data_dir: READ or WRITE; any other value triggers a one-time warning
*
* Returns the selected request, or NULL when the direction is invalid,
* the FIFO is empty, or (zoned device, writes) no queued write passes
* blk_req_can_dispatch_to_zone().
|
||||
*/
|
||||
static struct request *
|
||||
deadline_fifo_request(struct deadline_data *dd, int data_dir)
|
||||
{
|
||||
struct request *rq;
|
||||
unsigned long flags;
|
||||
|
||||
/* Only the two FIFO directions are valid indexes. */
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
||||
return NULL;
|
||||
|
||||
if (list_empty(&dd->fifo_list[data_dir]))
|
||||
return NULL;
|
||||
|
||||
/* Head of the FIFO; reads and non-zoned queues use it unconditionally. */
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
||||
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
||||
return rq;
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
*
* data_dir can only be WRITE from here on (READ returned above and
* other values were rejected), so the scan walks the WRITE FIFO from
* its head under zone_lock and stops at the first dispatchable entry.
|
||||
*/
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
goto out;
|
||||
}
|
||||
/* Loop ran to completion: nothing in the FIFO is dispatchable. */
rq = NULL;
|
||||
out:
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
|
||||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* For the specified data direction, return the next request to
|
||||
* dispatch using sector position sorted lists.
*
* @dd: scheduler data holding the cached next_rq[] pointers
* @data_dir: READ or WRITE; any other value triggers a one-time warning
*
* Returns the selected request, or NULL when the direction is invalid,
* there is no cached next request, or (zoned device, writes) the walk
* via deadline_latter_request() runs out before finding a request that
* passes blk_req_can_dispatch_to_zone().
|
||||
*/
|
||||
static struct request *
|
||||
deadline_next_request(struct deadline_data *dd, int data_dir)
|
||||
{
|
||||
struct request *rq;
|
||||
unsigned long flags;
|
||||
|
||||
/* Only the two next_rq[] directions are valid indexes. */
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
||||
return NULL;
|
||||
|
||||
rq = dd->next_rq[data_dir];
|
||||
if (!rq)
|
||||
return NULL;
|
||||
|
||||
/* Reads and non-zoned queues take the cached request as is. */
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
||||
return rq;
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
*
* Starting from the cached request, step to the following request
* until one is dispatchable or the walk yields NULL; zone_lock is
* held for the whole walk.
|
||||
*/
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
while (rq) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
break;
|
||||
rq = deadline_latter_request(rq);
|
||||
}
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
|
||||
/* Either the first dispatchable write found, or NULL. */
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* deadline_dispatch_requests selects the best request according to
|
||||
* read/write expire, fifo_batch, etc
|
||||
*/
|
||||
static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
||||
static struct request *__dd_dispatch_request(struct deadline_data *dd)
|
||||
{
|
||||
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
|
||||
struct request *rq;
|
||||
struct request *rq, *next_rq;
|
||||
bool reads, writes;
|
||||
int data_dir;
|
||||
|
||||
|
@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
/*
|
||||
* batches are currently reads XOR writes
|
||||
*/
|
||||
if (dd->next_rq[WRITE])
|
||||
rq = dd->next_rq[WRITE];
|
||||
else
|
||||
rq = dd->next_rq[READ];
|
||||
rq = deadline_next_request(dd, WRITE);
|
||||
if (!rq)
|
||||
rq = deadline_next_request(dd, READ);
|
||||
|
||||
if (rq && dd->batching < dd->fifo_batch)
|
||||
/* we have a next request and are still entitled to batch */
|
||||
|
@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
if (reads) {
|
||||
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
|
||||
|
||||
if (writes && (dd->starved++ >= dd->writes_starved))
|
||||
if (deadline_fifo_request(dd, WRITE) &&
|
||||
(dd->starved++ >= dd->writes_starved))
|
||||
goto dispatch_writes;
|
||||
|
||||
data_dir = READ;
|
||||
|
@ -260,21 +331,29 @@ dispatch_find_request:
|
|||
/*
|
||||
* we are not running a batch, find best request for selected data_dir
|
||||
*/
|
||||
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
|
||||
next_rq = deadline_next_request(dd, data_dir);
|
||||
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
|
||||
/*
|
||||
* A deadline has expired, the last request was in the other
|
||||
* direction, or we have run out of higher-sectored requests.
|
||||
* Start again from the request with the earliest expiry time.
|
||||
*/
|
||||
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
||||
rq = deadline_fifo_request(dd, data_dir);
|
||||
} else {
|
||||
/*
|
||||
* The last req was the same dir and we have a next request in
|
||||
* sort order. No expired requests so continue on from here.
|
||||
*/
|
||||
rq = dd->next_rq[data_dir];
|
||||
rq = next_rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* For a zoned block device, if we only have writes queued and none of
|
||||
* them can be dispatched, rq will be NULL.
|
||||
*/
|
||||
if (!rq)
|
||||
return NULL;
|
||||
|
||||
dd->batching = 0;
|
||||
|
||||
dispatch_request:
|
||||
|
@ -284,17 +363,27 @@ dispatch_request:
|
|||
dd->batching++;
|
||||
deadline_move_request(dd, rq);
|
||||
done:
|
||||
/*
|
||||
* If the request needs its target zone locked, do it.
|
||||
*/
|
||||
blk_req_zone_write_lock(rq);
|
||||
rq->rq_flags |= RQF_STARTED;
|
||||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* One confusing aspect here is that we get called for a specific
|
||||
* hardware queue, but we may return a request that is for a
|
||||
* different hardware queue. This is because mq-deadline has shared
|
||||
* state for all hardware queues, in terms of sorting, FIFOs, etc.
|
||||
*/
|
||||
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
|
||||
struct request *rq;
|
||||
|
||||
spin_lock(&dd->lock);
|
||||
rq = __dd_dispatch_request(hctx);
|
||||
rq = __dd_dispatch_request(dd);
|
||||
spin_unlock(&dd->lock);
|
||||
|
||||
return rq;
|
||||
|
@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
|
|||
dd->front_merges = 1;
|
||||
dd->fifo_batch = fifo_batch;
|
||||
spin_lock_init(&dd->lock);
|
||||
spin_lock_init(&dd->zone_lock);
|
||||
INIT_LIST_HEAD(&dd->dispatch);
|
||||
|
||||
q->elevator = eq;
|
||||
|
@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
|||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
const int data_dir = rq_data_dir(rq);
|
||||
|
||||
/*
|
||||
* This may be a requeue of a write request that has locked its
|
||||
* target zone. If it is the case, this releases the zone lock.
|
||||
*/
|
||||
blk_req_zone_write_unlock(rq);
|
||||
|
||||
if (blk_mq_sched_try_insert_merge(q, rq))
|
||||
return;
|
||||
|
||||
|
@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
|
|||
spin_unlock(&dd->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* For zoned block devices, write unlock the target zone of
|
||||
* completed write requests. Do this while holding the zone lock
|
||||
* spinlock so that the zone is never unlocked while deadline_fifo_request()
|
||||
* or deadline_next_request() is executing.
*
* @rq: the completed request; its queue and scheduler data are reached
*      through rq->q.
|
||||
*/
|
||||
static void dd_completed_request(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
||||
/* Non-zoned queues need no work here. */
if (blk_queue_is_zoned(q)) {
|
||||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
unsigned long flags;
|
||||
|
||||
/* Same lock the dispatch-side zone scans take. */
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
blk_req_zone_write_unlock(rq);
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
|
||||
|
@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
|
|||
.ops.mq = {
|
||||
.insert_requests = dd_insert_requests,
|
||||
.dispatch_request = dd_dispatch_request,
|
||||
.completed_request = dd_completed_request,
|
||||
.next_request = elv_rb_latter_request,
|
||||
.former_request = elv_rb_former_request,
|
||||
.bio_merge = dd_bio_merge,
|
||||
|
|
|
@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
|
|||
continue;
|
||||
bsd_start = le32_to_cpu(p->p_offset);
|
||||
bsd_size = le32_to_cpu(p->p_size);
|
||||
if (memcmp(flavour, "bsd\0", 4) == 0)
|
||||
/* FreeBSD has relative offset if C partition offset is zero */
|
||||
if (memcmp(flavour, "bsd\0", 4) == 0 &&
|
||||
le32_to_cpu(l->d_partitions[2].p_offset) == 0)
|
||||
bsd_start += offset;
|
||||
if (offset == bsd_start && size == bsd_size)
|
||||
/* full parent partition, we have it already */
|
||||
|
|
|
@ -384,9 +384,10 @@ out_put_request:
|
|||
|
||||
/**
|
||||
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
|
||||
* @file: file this ioctl operates on (optional)
|
||||
* @q: request queue to send scsi commands down
|
||||
* @disk: gendisk to operate on (optional)
|
||||
* @mode: mode used to open the file through which the ioctl has been
|
||||
* submitted
|
||||
* @sic: userspace structure describing the command to perform
|
||||
*
|
||||
* Send down the scsi command described by @sic to the device below
|
||||
|
@ -415,10 +416,10 @@ out_put_request:
|
|||
* Positive numbers returned are the compacted SCSI error codes (4
|
||||
* bytes in one int) where the lowest byte is the SCSI status.
|
||||
*/
|
||||
#define OMAX_SB_LEN 16 /* For backward compatibility */
|
||||
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
|
||||
struct scsi_ioctl_command __user *sic)
|
||||
{
|
||||
enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
|
||||
struct request *rq;
|
||||
struct scsi_request *req;
|
||||
int err;
|
||||
|
@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
|
|||
if (bd && bd == bd->bd_contains)
|
||||
return 0;
|
||||
|
||||
/* Actually none of these is particularly useful on a partition,
|
||||
* but they are safe.
|
||||
*/
|
||||
switch (cmd) {
|
||||
case SCSI_IOCTL_GET_IDLUN:
|
||||
case SCSI_IOCTL_GET_BUS_NUMBER:
|
||||
case SCSI_IOCTL_GET_PCI:
|
||||
case SCSI_IOCTL_PROBE_HOST:
|
||||
case SG_GET_VERSION_NUM:
|
||||
case SG_SET_TIMEOUT:
|
||||
case SG_GET_TIMEOUT:
|
||||
case SG_GET_RESERVED_SIZE:
|
||||
case SG_SET_RESERVED_SIZE:
|
||||
case SG_EMULATED_HOST:
|
||||
return 0;
|
||||
case CDROM_GET_CAPABILITY:
|
||||
/* Keep this until we remove the printk below. udev sends it
|
||||
* and we do not want to spam dmesg about it. CD-ROMs do
|
||||
* not have partitions, so we get here only for disks.
|
||||
*/
|
||||
return -ENOIOCTLCMD;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (capable(CAP_SYS_RAWIO))
|
||||
return 0;
|
||||
|
||||
/* In particular, rule out all resets and host-specific ioctls. */
|
||||
printk_ratelimited(KERN_WARNING
|
||||
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
|
||||
|
||||
return -ENOIOCTLCMD;
|
||||
}
|
||||
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
|
||||
|
|
|
@ -106,6 +106,7 @@ config CRYPTO_KPP
|
|||
config CRYPTO_ACOMP2
|
||||
tristate
|
||||
select CRYPTO_ALGAPI2
|
||||
select SGL_ALLOC
|
||||
|
||||
config CRYPTO_ACOMP
|
||||
tristate
|
||||
|
|
|
@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
* Free a scatterlist built by crypto_scomp_sg_alloc(): release the page
* behind every entry that has one, then free the table itself.
* A NULL @sgl is accepted and ignored.
*
* NOTE(review): the loop below passes sgl as both the list head and the
* iteration cursor, so the pointer handed to kfree() afterwards is
* whatever the cursor holds once the loop ends, not necessarily the
* start of the table. Confirm against the for_each_sg() definition
* whether the table allocation is actually released.
*/
static void crypto_scomp_sg_free(struct scatterlist *sgl)
|
||||
{
|
||||
int i, n;
|
||||
struct page *page;
|
||||
|
||||
if (!sgl)
|
||||
return;
|
||||
|
||||
/* Entry count is taken from the list itself. */
n = sg_nents(sgl);
|
||||
for_each_sg(sgl, sgl, n, i) {
|
||||
page = sg_page(sgl);
|
||||
/* Entries without a page are skipped (partially built lists). */
if (page)
|
||||
__free_page(page);
|
||||
}
|
||||
|
||||
kfree(sgl);
|
||||
}
|
||||
|
||||
/*
* Allocate a scatterlist covering @size bytes, one freshly allocated
* page per entry (each entry spans PAGE_SIZE at offset 0).
*
* @size: number of bytes to cover; rounded up to whole pages
* @gfp:  allocation flags used for both the table and the pages
*
* Returns the new table, or NULL if the table or any page could not be
* allocated (anything already built is released first).
*
* NOTE(review): size == 0 makes (size - 1) wrap before the shift;
* callers presumably never pass 0 - confirm.
*/
static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp)
|
||||
{
|
||||
struct scatterlist *sgl;
|
||||
struct page *page;
|
||||
int i, n;
|
||||
|
||||
/* Number of pages needed, rounding up. */
n = ((size - 1) >> PAGE_SHIFT) + 1;
|
||||
|
||||
sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp);
|
||||
if (!sgl)
|
||||
return NULL;
|
||||
|
||||
sg_init_table(sgl, n);
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
page = alloc_page(gfp);
|
||||
if (!page)
|
||||
goto err;
|
||||
sg_set_page(sgl + i, page, PAGE_SIZE, 0);
|
||||
}
|
||||
|
||||
return sgl;
|
||||
|
||||
err:
|
||||
/* Mark the end at the failing entry before handing the list to the free helper. */
sg_mark_end(sgl + i);
|
||||
crypto_scomp_sg_free(sgl);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
|
||||
{
|
||||
struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
|
||||
|
@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
|
|||
scratch_dst, &req->dlen, *ctx);
|
||||
if (!ret) {
|
||||
if (!req->dst) {
|
||||
req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC);
|
||||
req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
|
||||
if (!req->dst)
|
||||
goto out;
|
||||
}
|
||||
|
@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm)
|
|||
|
||||
crt->compress = scomp_acomp_compress;
|
||||
crt->decompress = scomp_acomp_decompress;
|
||||
crt->dst_free = crypto_scomp_sg_free;
|
||||
crt->dst_free = sgl_free;
|
||||
crt->reqsize = sizeof(void *);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
|
|||
|
||||
#ifdef DAC960_GAM_MINOR
|
||||
|
||||
/*
|
||||
* DAC960_gam_ioctl is the ioctl function for performing RAID operations.
|
||||
*/
|
||||
|
||||
static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
||||
unsigned long Argument)
|
||||
static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo)
|
||||
{
|
||||
long ErrorCode = 0;
|
||||
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
|
||||
|
||||
mutex_lock(&DAC960_mutex);
|
||||
switch (Request)
|
||||
{
|
||||
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
|
||||
ErrorCode = DAC960_ControllerCount;
|
||||
break;
|
||||
case DAC960_IOCTL_GET_CONTROLLER_INFO:
|
||||
{
|
||||
DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
|
||||
(DAC960_ControllerInfo_T __user *) Argument;
|
||||
DAC960_ControllerInfo_T ControllerInfo;
|
||||
DAC960_Controller_T *Controller;
|
||||
int ControllerNumber;
|
||||
long ErrorCode;
|
||||
|
||||
if (UserSpaceControllerInfo == NULL)
|
||||
ErrorCode = -EINVAL;
|
||||
else ErrorCode = get_user(ControllerNumber,
|
||||
&UserSpaceControllerInfo->ControllerNumber);
|
||||
if (ErrorCode != 0)
|
||||
break;
|
||||
goto out;
|
||||
ErrorCode = -ENXIO;
|
||||
if (ControllerNumber < 0 ||
|
||||
ControllerNumber > DAC960_ControllerCount - 1) {
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
Controller = DAC960_Controllers[ControllerNumber];
|
||||
if (Controller == NULL)
|
||||
break;
|
||||
goto out;
|
||||
memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
|
||||
ControllerInfo.ControllerNumber = ControllerNumber;
|
||||
ControllerInfo.FirmwareType = Controller->FirmwareType;
|
||||
|
@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
|
||||
ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
|
||||
sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
|
||||
break;
|
||||
}
|
||||
case DAC960_IOCTL_V1_EXECUTE_COMMAND:
|
||||
{
|
||||
DAC960_V1_UserCommand_T __user *UserSpaceUserCommand =
|
||||
(DAC960_V1_UserCommand_T __user *) Argument;
|
||||
out:
|
||||
return ErrorCode;
|
||||
}
|
||||
|
||||
static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand)
|
||||
{
|
||||
DAC960_V1_UserCommand_T UserCommand;
|
||||
DAC960_Controller_T *Controller;
|
||||
DAC960_Command_T *Command = NULL;
|
||||
|
@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
int ControllerNumber, DataTransferLength;
|
||||
unsigned char *DataTransferBuffer = NULL;
|
||||
dma_addr_t DataTransferBufferDMA;
|
||||
long ErrorCode;
|
||||
|
||||
if (UserSpaceUserCommand == NULL) {
|
||||
ErrorCode = -EINVAL;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
if (copy_from_user(&UserCommand, UserSpaceUserCommand,
|
||||
sizeof(DAC960_V1_UserCommand_T))) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
ControllerNumber = UserCommand.ControllerNumber;
|
||||
ErrorCode = -ENXIO;
|
||||
if (ControllerNumber < 0 ||
|
||||
ControllerNumber > DAC960_ControllerCount - 1)
|
||||
break;
|
||||
goto out;
|
||||
Controller = DAC960_Controllers[ControllerNumber];
|
||||
if (Controller == NULL)
|
||||
break;
|
||||
goto out;
|
||||
ErrorCode = -EINVAL;
|
||||
if (Controller->FirmwareType != DAC960_V1_Controller)
|
||||
break;
|
||||
goto out;
|
||||
CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
|
||||
DataTransferLength = UserCommand.DataTransferLength;
|
||||
if (CommandOpcode & 0x80)
|
||||
break;
|
||||
goto out;
|
||||
if (CommandOpcode == DAC960_V1_DCDB)
|
||||
{
|
||||
if (copy_from_user(&DCDB, UserCommand.DCDB,
|
||||
sizeof(DAC960_V1_DCDB_T))) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
if (DCDB.Channel >= DAC960_V1_MaxChannels)
|
||||
break;
|
||||
goto out;
|
||||
if (!((DataTransferLength == 0 &&
|
||||
DCDB.Direction
|
||||
== DAC960_V1_DCDB_NoDataTransfer) ||
|
||||
|
@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
(DataTransferLength < 0 &&
|
||||
DCDB.Direction
|
||||
== DAC960_V1_DCDB_DataTransferSystemToDevice)))
|
||||
break;
|
||||
goto out;
|
||||
if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
|
||||
!= abs(DataTransferLength))
|
||||
break;
|
||||
goto out;
|
||||
DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
|
||||
sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
|
||||
if (DCDB_IOBUF == NULL) {
|
||||
ErrorCode = -ENOMEM;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
ErrorCode = -ENOMEM;
|
||||
|
@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
DataTransferLength,
|
||||
&DataTransferBufferDMA);
|
||||
if (DataTransferBuffer == NULL)
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
else if (DataTransferLength < 0)
|
||||
{
|
||||
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
|
||||
-DataTransferLength, &DataTransferBufferDMA);
|
||||
if (DataTransferBuffer == NULL)
|
||||
break;
|
||||
goto out;
|
||||
if (copy_from_user(DataTransferBuffer,
|
||||
UserCommand.DataTransferBuffer,
|
||||
-DataTransferLength)) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (CommandOpcode == DAC960_V1_DCDB)
|
||||
|
@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
if (DCDB_IOBUF != NULL)
|
||||
pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
|
||||
DCDB_IOBUF, DCDB_IOBUFDMA);
|
||||
break;
|
||||
}
|
||||
case DAC960_IOCTL_V2_EXECUTE_COMMAND:
|
||||
{
|
||||
DAC960_V2_UserCommand_T __user *UserSpaceUserCommand =
|
||||
(DAC960_V2_UserCommand_T __user *) Argument;
|
||||
out:
|
||||
return ErrorCode;
|
||||
}
|
||||
|
||||
static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand)
|
||||
{
|
||||
DAC960_V2_UserCommand_T UserCommand;
|
||||
DAC960_Controller_T *Controller;
|
||||
DAC960_Command_T *Command = NULL;
|
||||
|
@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
dma_addr_t DataTransferBufferDMA;
|
||||
unsigned char *RequestSenseBuffer = NULL;
|
||||
dma_addr_t RequestSenseBufferDMA;
|
||||
long ErrorCode = -EINVAL;
|
||||
|
||||
ErrorCode = -EINVAL;
|
||||
if (UserSpaceUserCommand == NULL)
|
||||
break;
|
||||
goto out;
|
||||
if (copy_from_user(&UserCommand, UserSpaceUserCommand,
|
||||
sizeof(DAC960_V2_UserCommand_T))) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
ErrorCode = -ENXIO;
|
||||
ControllerNumber = UserCommand.ControllerNumber;
|
||||
if (ControllerNumber < 0 ||
|
||||
ControllerNumber > DAC960_ControllerCount - 1)
|
||||
break;
|
||||
goto out;
|
||||
Controller = DAC960_Controllers[ControllerNumber];
|
||||
if (Controller == NULL)
|
||||
break;
|
||||
goto out;
|
||||
if (Controller->FirmwareType != DAC960_V2_Controller){
|
||||
ErrorCode = -EINVAL;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
DataTransferLength = UserCommand.DataTransferLength;
|
||||
ErrorCode = -ENOMEM;
|
||||
|
@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
DataTransferLength,
|
||||
&DataTransferBufferDMA);
|
||||
if (DataTransferBuffer == NULL)
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
else if (DataTransferLength < 0)
|
||||
{
|
||||
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
|
||||
-DataTransferLength, &DataTransferBufferDMA);
|
||||
if (DataTransferBuffer == NULL)
|
||||
break;
|
||||
goto out;
|
||||
if (copy_from_user(DataTransferBuffer,
|
||||
UserCommand.DataTransferBuffer,
|
||||
-DataTransferLength)) {
|
||||
|
@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
if (RequestSenseBuffer != NULL)
|
||||
pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
|
||||
RequestSenseBuffer, RequestSenseBufferDMA);
|
||||
break;
|
||||
}
|
||||
case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
|
||||
{
|
||||
DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus =
|
||||
(DAC960_V2_GetHealthStatus_T __user *) Argument;
|
||||
out:
|
||||
return ErrorCode;
|
||||
}
|
||||
|
||||
static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus)
|
||||
{
|
||||
DAC960_V2_GetHealthStatus_T GetHealthStatus;
|
||||
DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
|
||||
DAC960_Controller_T *Controller;
|
||||
int ControllerNumber;
|
||||
long ErrorCode;
|
||||
|
||||
if (UserSpaceGetHealthStatus == NULL) {
|
||||
ErrorCode = -EINVAL;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
|
||||
sizeof(DAC960_V2_GetHealthStatus_T))) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
ErrorCode = -ENXIO;
|
||||
ControllerNumber = GetHealthStatus.ControllerNumber;
|
||||
if (ControllerNumber < 0 ||
|
||||
ControllerNumber > DAC960_ControllerCount - 1)
|
||||
break;
|
||||
goto out;
|
||||
Controller = DAC960_Controllers[ControllerNumber];
|
||||
if (Controller == NULL)
|
||||
break;
|
||||
goto out;
|
||||
if (Controller->FirmwareType != DAC960_V2_Controller) {
|
||||
ErrorCode = -EINVAL;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
if (copy_from_user(&HealthStatusBuffer,
|
||||
GetHealthStatus.HealthStatusBuffer,
|
||||
sizeof(DAC960_V2_HealthStatusBuffer_T))) {
|
||||
ErrorCode = -EFAULT;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
|
||||
!(Controller->V2.HealthStatusBuffer->StatusChangeCounter
|
||||
|
@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
DAC960_MonitoringTimerInterval);
|
||||
if (ErrorCode == -ERESTARTSYS) {
|
||||
ErrorCode = -EINTR;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
|
||||
Controller->V2.HealthStatusBuffer,
|
||||
|
@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
|||
ErrorCode = -EFAULT;
|
||||
else
|
||||
ErrorCode = 0;
|
||||
}
|
||||
|
||||
out:
|
||||
return ErrorCode;
|
||||
}
|
||||
|
||||
/*
|
||||
* DAC960_gam_ioctl is the ioctl function for performing RAID operations.
|
||||
*/
|
||||
|
||||
static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
|
||||
unsigned long Argument)
|
||||
{
|
||||
long ErrorCode = 0;
|
||||
void __user *argp = (void __user *)Argument;
|
||||
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
|
||||
|
||||
mutex_lock(&DAC960_mutex);
|
||||
switch (Request)
|
||||
{
|
||||
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
|
||||
ErrorCode = DAC960_ControllerCount;
|
||||
break;
|
||||
case DAC960_IOCTL_GET_CONTROLLER_INFO:
|
||||
ErrorCode = DAC960_gam_get_controller_info(argp);
|
||||
break;
|
||||
case DAC960_IOCTL_V1_EXECUTE_COMMAND:
|
||||
ErrorCode = DAC960_gam_v1_execute_command(argp);
|
||||
break;
|
||||
case DAC960_IOCTL_V2_EXECUTE_COMMAND:
|
||||
ErrorCode = DAC960_gam_v2_execute_command(argp);
|
||||
break;
|
||||
case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
|
||||
ErrorCode = DAC960_gam_v2_get_health_status(argp);
|
||||
break;
|
||||
default:
|
||||
ErrorCode = -ENOTTY;
|
||||
|
|
|
@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
|
|||
tristate "Null test block driver"
|
||||
select CONFIGFS_FS
|
||||
|
||||
config BLK_DEV_NULL_BLK_FAULT_INJECTION
|
||||
bool "Support fault injection for Null test block driver"
|
||||
depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
|
||||
|
||||
config BLK_DEV_FD
|
||||
tristate "Normal floppy disk support"
|
||||
depends on ARCH_MAY_HAVE_PC_FDC
|
||||
|
|
|
@ -112,8 +112,7 @@ enum frame_flags {
|
|||
struct frame {
|
||||
struct list_head head;
|
||||
u32 tag;
|
||||
struct timeval sent; /* high-res time packet was sent */
|
||||
u32 sent_jiffs; /* low-res jiffies-based sent time */
|
||||
ktime_t sent; /* high-res time packet was sent */
|
||||
ulong waited;
|
||||
ulong waited_total;
|
||||
struct aoetgt *t; /* parent target I belong to */
|
||||
|
|
|
@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
|
|||
|
||||
skb = skb_clone(f->skb, GFP_ATOMIC);
|
||||
if (skb) {
|
||||
do_gettimeofday(&f->sent);
|
||||
f->sent_jiffs = (u32) jiffies;
|
||||
f->sent = ktime_get();
|
||||
__skb_queue_head_init(&queue);
|
||||
__skb_queue_tail(&queue, skb);
|
||||
aoenet_xmit(&queue);
|
||||
|
@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
|
|||
skb = skb_clone(skb, GFP_ATOMIC);
|
||||
if (skb == NULL)
|
||||
return;
|
||||
do_gettimeofday(&f->sent);
|
||||
f->sent_jiffs = (u32) jiffies;
|
||||
f->sent = ktime_get();
|
||||
__skb_queue_head_init(&queue);
|
||||
__skb_queue_tail(&queue, skb);
|
||||
aoenet_xmit(&queue);
|
||||
|
@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
|
|||
static int
|
||||
tsince_hr(struct frame *f)
|
||||
{
|
||||
struct timeval now;
|
||||
int n;
|
||||
u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
|
||||
|
||||
do_gettimeofday(&now);
|
||||
n = now.tv_usec - f->sent.tv_usec;
|
||||
n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
|
||||
/* delta is normally under 4.2 seconds, avoid 64-bit division */
|
||||
if (likely(delta <= UINT_MAX))
|
||||
return (u32)delta / NSEC_PER_USEC;
|
||||
|
||||
if (n < 0)
|
||||
n = -n;
|
||||
/* avoid overflow after 71 minutes */
|
||||
if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
|
||||
return INT_MAX;
|
||||
|
||||
/* For relatively long periods, use jiffies to avoid
|
||||
* discrepancies caused by updates to the system time.
|
||||
*
|
||||
* On system with HZ of 1000, 32-bits is over 49 days
|
||||
* worth of jiffies, or over 71 minutes worth of usecs.
|
||||
*
|
||||
* Jiffies overflow is handled by subtraction of unsigned ints:
|
||||
* (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
|
||||
* $3 = 4
|
||||
* (gdb)
|
||||
*/
|
||||
if (n > USEC_PER_SEC / 4) {
|
||||
n = ((u32) jiffies) - f->sent_jiffs;
|
||||
n *= USEC_PER_SEC / HZ;
|
||||
}
|
||||
|
||||
return n;
|
||||
return div_u64(delta, NSEC_PER_USEC);
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
|
|||
nf->waited = 0;
|
||||
nf->waited_total = f->waited_total;
|
||||
nf->sent = f->sent;
|
||||
nf->sent_jiffs = f->sent_jiffs;
|
||||
f->skb = skb;
|
||||
|
||||
return nf;
|
||||
|
@ -633,8 +614,7 @@ probe(struct aoetgt *t)
|
|||
|
||||
skb = skb_clone(f->skb, GFP_ATOMIC);
|
||||
if (skb) {
|
||||
do_gettimeofday(&f->sent);
|
||||
f->sent_jiffs = (u32) jiffies;
|
||||
f->sent = ktime_get();
|
||||
__skb_queue_head_init(&queue);
|
||||
__skb_queue_tail(&queue, skb);
|
||||
aoenet_xmit(&queue);
|
||||
|
@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
|
|||
d->timer.function = rexmit_timer;
|
||||
|
||||
skb = skb_clone(skb, GFP_ATOMIC);
|
||||
if (skb) {
|
||||
do_gettimeofday(&f->sent);
|
||||
f->sent_jiffs = (u32) jiffies;
|
||||
}
|
||||
if (skb)
|
||||
f->sent = ktime_get();
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
|
|
@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
|
|||
struct drbd_bm_aio_ctx *ctx = bio->bi_private;
|
||||
struct drbd_device *device = ctx->device;
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
|
||||
unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
|
||||
|
||||
if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
|
||||
!bm_test_page_unchanged(b->bm_pages[idx]))
|
||||
|
|
|
@ -12,9 +12,9 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/lightnvm.h>
|
||||
#include <linux/configfs.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/fault-inject.h>
|
||||
|
||||
#define SECTOR_SHIFT 9
|
||||
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
|
||||
|
@ -27,6 +27,10 @@
|
|||
#define TICKS_PER_SEC 50ULL
|
||||
#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
|
||||
static DECLARE_FAULT_ATTR(null_timeout_attr);
|
||||
#endif
|
||||
|
||||
static inline u64 mb_per_tick(int mbps)
|
||||
{
|
||||
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
|
||||
|
@ -107,7 +111,6 @@ struct nullb_device {
|
|||
unsigned int hw_queue_depth; /* queue depth */
|
||||
unsigned int index; /* index of the disk, only valid with a disk */
|
||||
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
|
||||
bool use_lightnvm; /* register as a LightNVM device */
|
||||
bool blocking; /* blocking blk-mq device */
|
||||
bool use_per_node_hctx; /* use per-node allocation for hardware context */
|
||||
bool power; /* power on/off the device */
|
||||
|
@ -121,7 +124,6 @@ struct nullb {
|
|||
unsigned int index;
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk;
|
||||
struct nvm_dev *ndev;
|
||||
struct blk_mq_tag_set *tag_set;
|
||||
struct blk_mq_tag_set __tag_set;
|
||||
unsigned int queue_depth;
|
||||
|
@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list);
|
|||
static struct mutex lock;
|
||||
static int null_major;
|
||||
static DEFINE_IDA(nullb_indexes);
|
||||
static struct kmem_cache *ppa_cache;
|
||||
static struct blk_mq_tag_set tag_set;
|
||||
|
||||
enum {
|
||||
|
@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE;
|
|||
module_param_named(home_node, g_home_node, int, S_IRUGO);
|
||||
MODULE_PARM_DESC(home_node, "Home node for the device");
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
|
||||
static char g_timeout_str[80];
|
||||
module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
|
||||
#endif
|
||||
|
||||
static int g_queue_mode = NULL_Q_MQ;
|
||||
|
||||
static int null_param_store_val(const char *str, int *val, int min, int max)
|
||||
|
@ -208,10 +214,6 @@ static int nr_devices = 1;
|
|||
module_param(nr_devices, int, S_IRUGO);
|
||||
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
|
||||
|
||||
static bool g_use_lightnvm;
|
||||
module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
|
||||
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
|
||||
|
||||
static bool g_blocking;
|
||||
module_param_named(blocking, g_blocking, bool, S_IRUGO);
|
||||
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
|
||||
|
@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint);
|
|||
NULLB_DEVICE_ATTR(irqmode, uint);
|
||||
NULLB_DEVICE_ATTR(hw_queue_depth, uint);
|
||||
NULLB_DEVICE_ATTR(index, uint);
|
||||
NULLB_DEVICE_ATTR(use_lightnvm, bool);
|
||||
NULLB_DEVICE_ATTR(blocking, bool);
|
||||
NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
|
||||
NULLB_DEVICE_ATTR(memory_backed, bool);
|
||||
|
@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
|||
&nullb_device_attr_irqmode,
|
||||
&nullb_device_attr_hw_queue_depth,
|
||||
&nullb_device_attr_index,
|
||||
&nullb_device_attr_use_lightnvm,
|
||||
&nullb_device_attr_blocking,
|
||||
&nullb_device_attr_use_per_node_hctx,
|
||||
&nullb_device_attr_power,
|
||||
|
@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void)
|
|||
dev->blocksize = g_bs;
|
||||
dev->irqmode = g_irqmode;
|
||||
dev->hw_queue_depth = g_hw_queue_depth;
|
||||
dev->use_lightnvm = g_use_lightnvm;
|
||||
dev->blocking = g_blocking;
|
||||
dev->use_per_node_hctx = g_use_per_node_hctx;
|
||||
return dev;
|
||||
|
@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
|
|||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
|
||||
{
|
||||
pr_info("null: rq %p timed out\n", rq);
|
||||
return BLK_EH_HANDLED;
|
||||
}
|
||||
|
||||
static int null_rq_prep_fn(struct request_queue *q, struct request *req)
|
||||
{
|
||||
struct nullb *nullb = q->queuedata;
|
||||
|
@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
|
|||
return BLKPREP_DEFER;
|
||||
}
|
||||
|
||||
static bool should_timeout_request(struct request *rq)
|
||||
{
|
||||
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
|
||||
if (g_timeout_str[0])
|
||||
return should_fail(&null_timeout_attr, 1);
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void null_request_fn(struct request_queue *q)
|
||||
{
|
||||
struct request *rq;
|
||||
|
@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q)
|
|||
while ((rq = blk_fetch_request(q)) != NULL) {
|
||||
struct nullb_cmd *cmd = rq->special;
|
||||
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
null_handle_cmd(cmd);
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (!should_timeout_request(rq)) {
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
null_handle_cmd(cmd);
|
||||
spin_lock_irq(q->queue_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
|
||||
{
|
||||
pr_info("null: rq %p timed out\n", rq);
|
||||
return BLK_EH_HANDLED;
|
||||
}
|
||||
|
||||
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
const struct blk_mq_queue_data *bd)
|
||||
{
|
||||
|
@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||
|
||||
blk_mq_start_request(bd->rq);
|
||||
|
||||
return null_handle_cmd(cmd);
|
||||
if (!should_timeout_request(bd->rq))
|
||||
return null_handle_cmd(cmd);
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static const struct blk_mq_ops null_mq_ops = {
|
||||
.queue_rq = null_queue_rq,
|
||||
.complete = null_softirq_done_fn,
|
||||
.timeout = null_timeout_rq,
|
||||
};
|
||||
|
||||
static void cleanup_queue(struct nullb_queue *nq)
|
||||
|
@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb)
|
|||
kfree(nullb->queues);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM
|
||||
|
||||
static void null_lnvm_end_io(struct request *rq, blk_status_t status)
|
||||
{
|
||||
struct nvm_rq *rqd = rq->end_io_data;
|
||||
|
||||
/* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
|
||||
rqd->error = status ? -EIO : 0;
|
||||
nvm_end_io(rqd);
|
||||
|
||||
blk_put_request(rq);
|
||||
}
|
||||
|
||||
static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
|
||||
{
|
||||
struct request_queue *q = dev->q;
|
||||
struct request *rq;
|
||||
struct bio *bio = rqd->bio;
|
||||
|
||||
rq = blk_mq_alloc_request(q,
|
||||
op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
|
||||
if (IS_ERR(rq))
|
||||
return -ENOMEM;
|
||||
|
||||
blk_init_request_from_bio(rq, bio);
|
||||
|
||||
rq->end_io_data = rqd;
|
||||
|
||||
blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
|
||||
{
|
||||
struct nullb *nullb = dev->q->queuedata;
|
||||
sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
|
||||
sector_t blksize;
|
||||
struct nvm_id_group *grp;
|
||||
|
||||
id->ver_id = 0x1;
|
||||
id->vmnt = 0;
|
||||
id->cap = 0x2;
|
||||
id->dom = 0x1;
|
||||
|
||||
id->ppaf.blk_offset = 0;
|
||||
id->ppaf.blk_len = 16;
|
||||
id->ppaf.pg_offset = 16;
|
||||
id->ppaf.pg_len = 16;
|
||||
id->ppaf.sect_offset = 32;
|
||||
id->ppaf.sect_len = 8;
|
||||
id->ppaf.pln_offset = 40;
|
||||
id->ppaf.pln_len = 8;
|
||||
id->ppaf.lun_offset = 48;
|
||||
id->ppaf.lun_len = 8;
|
||||
id->ppaf.ch_offset = 56;
|
||||
id->ppaf.ch_len = 8;
|
||||
|
||||
sector_div(size, nullb->dev->blocksize); /* convert size to pages */
|
||||
size >>= 8; /* concert size to pgs pr blk */
|
||||
grp = &id->grp;
|
||||
grp->mtype = 0;
|
||||
grp->fmtype = 0;
|
||||
grp->num_ch = 1;
|
||||
grp->num_pg = 256;
|
||||
blksize = size;
|
||||
size >>= 16;
|
||||
grp->num_lun = size + 1;
|
||||
sector_div(blksize, grp->num_lun);
|
||||
grp->num_blk = blksize;
|
||||
grp->num_pln = 1;
|
||||
|
||||
grp->fpg_sz = nullb->dev->blocksize;
|
||||
grp->csecs = nullb->dev->blocksize;
|
||||
grp->trdt = 25000;
|
||||
grp->trdm = 25000;
|
||||
grp->tprt = 500000;
|
||||
grp->tprm = 500000;
|
||||
grp->tbet = 1500000;
|
||||
grp->tbem = 1500000;
|
||||
grp->mpos = 0x010101; /* single plane rwe */
|
||||
grp->cpar = nullb->dev->hw_queue_depth;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
|
||||
{
|
||||
mempool_t *virtmem_pool;
|
||||
|
||||
virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
|
||||
if (!virtmem_pool) {
|
||||
pr_err("null_blk: Unable to create virtual memory pool\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return virtmem_pool;
|
||||
}
|
||||
|
||||
static void null_lnvm_destroy_dma_pool(void *pool)
|
||||
{
|
||||
mempool_destroy(pool);
|
||||
}
|
||||
|
||||
static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
|
||||
gfp_t mem_flags, dma_addr_t *dma_handler)
|
||||
{
|
||||
return mempool_alloc(pool, mem_flags);
|
||||
}
|
||||
|
||||
static void null_lnvm_dev_dma_free(void *pool, void *entry,
|
||||
dma_addr_t dma_handler)
|
||||
{
|
||||
mempool_free(entry, pool);
|
||||
}
|
||||
|
||||
static struct nvm_dev_ops null_lnvm_dev_ops = {
|
||||
.identity = null_lnvm_id,
|
||||
.submit_io = null_lnvm_submit_io,
|
||||
|
||||
.create_dma_pool = null_lnvm_create_dma_pool,
|
||||
.destroy_dma_pool = null_lnvm_destroy_dma_pool,
|
||||
.dev_dma_alloc = null_lnvm_dev_dma_alloc,
|
||||
.dev_dma_free = null_lnvm_dev_dma_free,
|
||||
|
||||
/* Simulate nvme protocol restriction */
|
||||
.max_phys_sect = 64,
|
||||
};
|
||||
|
||||
static int null_nvm_register(struct nullb *nullb)
|
||||
{
|
||||
struct nvm_dev *dev;
|
||||
int rv;
|
||||
|
||||
dev = nvm_alloc_dev(0);
|
||||
if (!dev)
|
||||
return -ENOMEM;
|
||||
|
||||
dev->q = nullb->q;
|
||||
memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
|
||||
dev->ops = &null_lnvm_dev_ops;
|
||||
|
||||
rv = nvm_register(dev);
|
||||
if (rv) {
|
||||
kfree(dev);
|
||||
return rv;
|
||||
}
|
||||
nullb->ndev = dev;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void null_nvm_unregister(struct nullb *nullb)
|
||||
{
|
||||
nvm_unregister(nullb->ndev);
|
||||
}
|
||||
#else
|
||||
static int null_nvm_register(struct nullb *nullb)
|
||||
{
|
||||
pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
static void null_nvm_unregister(struct nullb *nullb) {}
|
||||
#endif /* CONFIG_NVM */
|
||||
|
||||
static void null_del_dev(struct nullb *nullb)
|
||||
{
|
||||
struct nullb_device *dev = nullb->dev;
|
||||
|
@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb)
|
|||
|
||||
list_del_init(&nullb->list);
|
||||
|
||||
if (dev->use_lightnvm)
|
||||
null_nvm_unregister(nullb);
|
||||
else
|
||||
del_gendisk(nullb->disk);
|
||||
del_gendisk(nullb->disk);
|
||||
|
||||
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
|
||||
hrtimer_cancel(&nullb->bw_timer);
|
||||
|
@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb)
|
|||
if (dev->queue_mode == NULL_Q_MQ &&
|
||||
nullb->tag_set == &nullb->__tag_set)
|
||||
blk_mq_free_tag_set(nullb->tag_set);
|
||||
if (!dev->use_lightnvm)
|
||||
put_disk(nullb->disk);
|
||||
put_disk(nullb->disk);
|
||||
cleanup_queues(nullb);
|
||||
if (null_cache_active(nullb))
|
||||
null_free_device_storage(nullb->dev, true);
|
||||
|
@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev)
|
|||
{
|
||||
dev->blocksize = round_down(dev->blocksize, 512);
|
||||
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
|
||||
if (dev->use_lightnvm && dev->blocksize != 4096)
|
||||
dev->blocksize = 4096;
|
||||
|
||||
if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
|
||||
dev->queue_mode = NULL_Q_MQ;
|
||||
|
||||
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
|
||||
if (dev->submit_queues != nr_online_nodes)
|
||||
|
@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev)
|
|||
dev->mbps = 0;
|
||||
}
|
||||
|
||||
static bool null_setup_fault(void)
|
||||
{
|
||||
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
|
||||
if (!g_timeout_str[0])
|
||||
return true;
|
||||
|
||||
if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
|
||||
return false;
|
||||
|
||||
null_timeout_attr.verbose = 0;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
static int null_add_dev(struct nullb_device *dev)
|
||||
{
|
||||
struct nullb *nullb;
|
||||
|
@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev)
|
|||
if (rv)
|
||||
goto out_cleanup_queues;
|
||||
|
||||
if (!null_setup_fault())
|
||||
goto out_cleanup_queues;
|
||||
|
||||
nullb->tag_set->timeout = 5 * HZ;
|
||||
nullb->q = blk_mq_init_queue(nullb->tag_set);
|
||||
if (IS_ERR(nullb->q)) {
|
||||
rv = -ENOMEM;
|
||||
|
@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev)
|
|||
rv = -ENOMEM;
|
||||
goto out_cleanup_queues;
|
||||
}
|
||||
|
||||
if (!null_setup_fault())
|
||||
goto out_cleanup_blk_queue;
|
||||
|
||||
blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
|
||||
blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
|
||||
blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
|
||||
nullb->q->rq_timeout = 5 * HZ;
|
||||
rv = init_driver_queues(nullb);
|
||||
if (rv)
|
||||
goto out_cleanup_blk_queue;
|
||||
|
@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev)
|
|||
|
||||
sprintf(nullb->disk_name, "nullb%d", nullb->index);
|
||||
|
||||
if (dev->use_lightnvm)
|
||||
rv = null_nvm_register(nullb);
|
||||
else
|
||||
rv = null_gendisk_register(nullb);
|
||||
|
||||
rv = null_gendisk_register(nullb);
|
||||
if (rv)
|
||||
goto out_cleanup_blk_queue;
|
||||
|
||||
|
@ -1938,18 +1812,6 @@ static int __init null_init(void)
|
|||
g_bs = PAGE_SIZE;
|
||||
}
|
||||
|
||||
if (g_use_lightnvm && g_bs != 4096) {
|
||||
pr_warn("null_blk: LightNVM only supports 4k block size\n");
|
||||
pr_warn("null_blk: defaults block size to 4k\n");
|
||||
g_bs = 4096;
|
||||
}
|
||||
|
||||
if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
|
||||
pr_warn("null_blk: LightNVM only supported for blk-mq\n");
|
||||
pr_warn("null_blk: defaults queue mode to blk-mq\n");
|
||||
g_queue_mode = NULL_Q_MQ;
|
||||
}
|
||||
|
||||
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
|
||||
if (g_submit_queues != nr_online_nodes) {
|
||||
pr_warn("null_blk: submit_queues param is set to %u.\n",
|
||||
|
@ -1982,16 +1844,6 @@ static int __init null_init(void)
|
|||
goto err_conf;
|
||||
}
|
||||
|
||||
if (g_use_lightnvm) {
|
||||
ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
|
||||
0, 0, NULL);
|
||||
if (!ppa_cache) {
|
||||
pr_err("null_blk: unable to create ppa cache\n");
|
||||
ret = -ENOMEM;
|
||||
goto err_ppa;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nr_devices; i++) {
|
||||
dev = null_alloc_dev();
|
||||
if (!dev) {
|
||||
|
@ -2015,8 +1867,6 @@ err_dev:
|
|||
null_del_dev(nullb);
|
||||
null_free_dev(dev);
|
||||
}
|
||||
kmem_cache_destroy(ppa_cache);
|
||||
err_ppa:
|
||||
unregister_blkdev(null_major, "nullb");
|
||||
err_conf:
|
||||
configfs_unregister_subsystem(&nullb_subsys);
|
||||
|
@ -2047,8 +1897,6 @@ static void __exit null_exit(void)
|
|||
|
||||
if (g_queue_mode == NULL_Q_MQ && shared_tags)
|
||||
blk_mq_free_tag_set(&tag_set);
|
||||
|
||||
kmem_cache_destroy(ppa_cache);
|
||||
}
|
||||
|
||||
module_init(null_init);
|
||||
|
|
|
@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
|
|||
bdev = bdget(dev);
|
||||
if (!bdev)
|
||||
return -ENOMEM;
|
||||
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
|
||||
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
|
||||
bdput(bdev);
|
||||
return -EINVAL;
|
||||
}
|
||||
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
|
||||
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
|
||||
blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* This is safe, since we have a reference from open(). */
|
||||
__module_get(THIS_MODULE);
|
||||
|
@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
|
|||
pd->pkt_dev = MKDEV(pktdev_major, idx);
|
||||
ret = pkt_new_dev(pd, dev);
|
||||
if (ret)
|
||||
goto out_new_dev;
|
||||
goto out_mem2;
|
||||
|
||||
/* inherit events of the host device */
|
||||
disk->events = pd->bdev->bd_disk->events;
|
||||
|
@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
|
|||
mutex_unlock(&ctl_mutex);
|
||||
return 0;
|
||||
|
||||
out_new_dev:
|
||||
blk_cleanup_queue(disk->queue);
|
||||
out_mem2:
|
||||
put_disk(disk);
|
||||
out_mem:
|
||||
|
|
|
@ -1,278 +0,0 @@
|
|||
/*
|
||||
* Disk Array driver for Compaq SMART2 Controllers
|
||||
* Copyright 1998 Compaq Computer Corporation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* Questions/Comments/Bugfixes to iss_storagedev@hp.com
|
||||
*
|
||||
* If you want to make changes, improve or add functionality to this
|
||||
* driver, you'll probably need the Compaq Array Controller Interface
|
||||
* Specificiation (Document number ECG086/1198)
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file contains the controller communication implementation for
|
||||
* Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
|
||||
* this should support:
|
||||
*
|
||||
* PCI:
|
||||
* SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
|
||||
* Integerated SMART Array Controller, SMART-4200, SMART-4250ES
|
||||
*
|
||||
* EISA:
|
||||
* SMART-2/E, SMART, IAES, IDA-2, IDA
|
||||
*/
|
||||
|
||||
/*
|
||||
* Memory mapped FIFO interface (SMART 42xx cards)
|
||||
*/
|
||||
static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
|
||||
{
|
||||
writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
|
||||
}
|
||||
|
||||
/*
|
||||
* This card is the opposite of the other cards.
|
||||
* 0 turns interrupts on...
|
||||
* 0x08 turns them off...
|
||||
*/
|
||||
static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
|
||||
{
|
||||
if (val)
|
||||
{ /* Turn interrupts on */
|
||||
writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
|
||||
} else /* Turn them off */
|
||||
{
|
||||
writel( S42XX_INTR_OFF,
|
||||
h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For older cards FIFO Full = 0.
|
||||
* On this card 0 means there is room, anything else FIFO Full.
|
||||
*
|
||||
*/
|
||||
static unsigned long smart4_fifo_full(ctlr_info_t *h)
|
||||
{
|
||||
|
||||
return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
|
||||
}
|
||||
|
||||
/* This type of controller returns -1 if the fifo is empty,
|
||||
* Not 0 like the others.
|
||||
* And we need to let it know we read a value out
|
||||
*/
|
||||
static unsigned long smart4_completed(ctlr_info_t *h)
|
||||
{
|
||||
long register_value
|
||||
= readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
|
||||
|
||||
/* Fifo is empty */
|
||||
if( register_value == 0xffffffff)
|
||||
return 0;
|
||||
|
||||
/* Need to let it know we got the reply */
|
||||
/* We do this by writing a 0 to the port we just read from */
|
||||
writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
|
||||
|
||||
return ((unsigned long) register_value);
|
||||
}
|
||||
|
||||
/*
|
||||
* This hardware returns interrupt pending at a different place and
|
||||
* it does not tell us if the fifo is empty, we will have check
|
||||
* that by getting a 0 back from the command_completed call.
|
||||
*/
|
||||
static unsigned long smart4_intr_pending(ctlr_info_t *h)
|
||||
{
|
||||
unsigned long register_value =
|
||||
readl(h->vaddr + S42XX_INTR_STATUS);
|
||||
|
||||
if( register_value & S42XX_INTR_PENDING)
|
||||
return FIFO_NOT_EMPTY;
|
||||
return 0 ;
|
||||
}
|
||||
|
||||
static struct access_method smart4_access = {
|
||||
smart4_submit_command,
|
||||
smart4_intr_mask,
|
||||
smart4_fifo_full,
|
||||
smart4_intr_pending,
|
||||
smart4_completed,
|
||||
};
|
||||
|
||||
/*
|
||||
* Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
|
||||
*/
|
||||
static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
|
||||
{
|
||||
writel(c->busaddr, h->vaddr + COMMAND_FIFO);
|
||||
}
|
||||
|
||||
static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
|
||||
{
|
||||
writel(val, h->vaddr + INTR_MASK);
|
||||
}
|
||||
|
||||
static unsigned long smart2_fifo_full(ctlr_info_t *h)
|
||||
{
|
||||
return readl(h->vaddr + COMMAND_FIFO);
|
||||
}
|
||||
|
||||
static unsigned long smart2_completed(ctlr_info_t *h)
|
||||
{
|
||||
return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
|
||||
}
|
||||
|
||||
static unsigned long smart2_intr_pending(ctlr_info_t *h)
|
||||
{
|
||||
return readl(h->vaddr + INTR_PENDING);
|
||||
}
|
||||
|
||||
static struct access_method smart2_access = {
|
||||
smart2_submit_command,
|
||||
smart2_intr_mask,
|
||||
smart2_fifo_full,
|
||||
smart2_intr_pending,
|
||||
smart2_completed,
|
||||
};
|
||||
|
||||
/*
|
||||
* IO access for SMART-2/E cards
|
||||
*/
|
||||
static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
|
||||
{
|
||||
outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
|
||||
}
|
||||
|
||||
static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
|
||||
{
|
||||
outl(val, h->io_mem_addr + INTR_MASK);
|
||||
}
|
||||
|
||||
static unsigned long smart2e_fifo_full(ctlr_info_t *h)
|
||||
{
|
||||
return inl(h->io_mem_addr + COMMAND_FIFO);
|
||||
}
|
||||
|
||||
static unsigned long smart2e_completed(ctlr_info_t *h)
|
||||
{
|
||||
return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
|
||||
}
|
||||
|
||||
static unsigned long smart2e_intr_pending(ctlr_info_t *h)
|
||||
{
|
||||
return inl(h->io_mem_addr + INTR_PENDING);
|
||||
}
|
||||
|
||||
static struct access_method smart2e_access = {
|
||||
smart2e_submit_command,
|
||||
smart2e_intr_mask,
|
||||
smart2e_fifo_full,
|
||||
smart2e_intr_pending,
|
||||
smart2e_completed,
|
||||
};
|
||||
|
||||
/*
|
||||
* IO access for older SMART-1 type cards
|
||||
*/
|
||||
#define SMART1_SYSTEM_MASK 0xC8E
|
||||
#define SMART1_SYSTEM_DOORBELL 0xC8F
|
||||
#define SMART1_LOCAL_MASK 0xC8C
|
||||
#define SMART1_LOCAL_DOORBELL 0xC8D
|
||||
#define SMART1_INTR_MASK 0xC89
|
||||
#define SMART1_LISTADDR 0xC90
|
||||
#define SMART1_LISTLEN 0xC94
|
||||
#define SMART1_TAG 0xC97
|
||||
#define SMART1_COMPLETE_ADDR 0xC98
|
||||
#define SMART1_LISTSTATUS 0xC9E
|
||||
|
||||
#define CHANNEL_BUSY 0x01
|
||||
#define CHANNEL_CLEAR 0x02
|
||||
|
||||
static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
|
||||
{
|
||||
/*
|
||||
* This __u16 is actually a bunch of control flags on SMART
|
||||
* and below. We want them all to be zero.
|
||||
*/
|
||||
c->hdr.size = 0;
|
||||
|
||||
outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
|
||||
|
||||
outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
|
||||
outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
|
||||
|
||||
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
|
||||
}
|
||||
|
||||
static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
|
||||
{
|
||||
if (val == 1) {
|
||||
outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
|
||||
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
|
||||
outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
|
||||
outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
|
||||
} else {
|
||||
outb(0, h->io_mem_addr + 0xC8E);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long smart1_fifo_full(ctlr_info_t *h)
|
||||
{
|
||||
unsigned char chan;
|
||||
chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
|
||||
return chan;
|
||||
}
|
||||
|
||||
static unsigned long smart1_completed(ctlr_info_t *h)
|
||||
{
|
||||
unsigned char status;
|
||||
unsigned long cmd;
|
||||
|
||||
if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
|
||||
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
|
||||
|
||||
cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
|
||||
status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
|
||||
|
||||
outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
|
||||
|
||||
/*
|
||||
* this is x86 (actually compaq x86) only, so it's ok
|
||||
*/
|
||||
if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
|
||||
} else {
|
||||
cmd = 0;
|
||||
}
|
||||
return cmd;
|
||||
}
|
||||
|
||||
static unsigned long smart1_intr_pending(ctlr_info_t *h)
|
||||
{
|
||||
unsigned char chan;
|
||||
chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
|
||||
return chan;
|
||||
}
|
||||
|
||||
static struct access_method smart1_access = {
|
||||
smart1_submit_command,
|
||||
smart1_intr_mask,
|
||||
smart1_fifo_full,
|
||||
smart1_intr_pending,
|
||||
smart1_completed,
|
||||
};
|
|
@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
|
|||
|
||||
static void zram_page_end_io(struct bio *bio)
|
||||
{
|
||||
struct page *page = bio->bi_io_vec[0].bv_page;
|
||||
struct page *page = bio_first_page_all(bio);
|
||||
|
||||
page_endio(page, op_is_write(bio_op(bio)),
|
||||
blk_status_to_errno(bio->bi_status));
|
||||
|
|
|
@ -27,13 +27,6 @@ config NVM_DEBUG
|
|||
|
||||
It is required to create/remove targets without IOCTLs.
|
||||
|
||||
config NVM_RRPC
|
||||
tristate "Round-robin Hybrid Open-Channel SSD target"
|
||||
---help---
|
||||
Allows an open-channel SSD to be exposed as a block device to the
|
||||
host. The target is implemented using a linear mapping table and
|
||||
cost-based garbage collection. It is optimized for 4K IO sizes.
|
||||
|
||||
config NVM_PBLK
|
||||
tristate "Physical Block Device Open-Channel SSD target"
|
||||
---help---
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#
|
||||
|
||||
obj-$(CONFIG_NVM) := core.o
|
||||
obj-$(CONFIG_NVM_RRPC) += rrpc.o
|
||||
obj-$(CONFIG_NVM_PBLK) += pblk.o
|
||||
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
|
||||
pblk-write.o pblk-cache.o pblk-read.o \
|
||||
|
|
|
@ -45,12 +45,6 @@ struct nvm_dev_map {
|
|||
int nr_chnls;
|
||||
};
|
||||
|
||||
struct nvm_area {
|
||||
struct list_head list;
|
||||
sector_t begin;
|
||||
sector_t end; /* end is excluded */
|
||||
};
|
||||
|
||||
static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
|
||||
{
|
||||
struct nvm_target *tgt;
|
||||
|
@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static bool nvm_target_exists(const char *name)
|
||||
{
|
||||
struct nvm_dev *dev;
|
||||
struct nvm_target *tgt;
|
||||
bool ret = false;
|
||||
|
||||
down_write(&nvm_lock);
|
||||
list_for_each_entry(dev, &nvm_devices, devices) {
|
||||
mutex_lock(&dev->mlock);
|
||||
list_for_each_entry(tgt, &dev->targets, list) {
|
||||
if (!strcmp(name, tgt->disk->disk_name)) {
|
||||
ret = true;
|
||||
mutex_unlock(&dev->mlock);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&dev->mlock);
|
||||
}
|
||||
|
||||
out:
|
||||
up_write(&nvm_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
|
||||
{
|
||||
int i;
|
||||
|
@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
|
|||
if (clear) {
|
||||
for (j = 0; j < ch_map->nr_luns; j++) {
|
||||
int lun = j + lun_offs[j];
|
||||
int lunid = (ch * dev->geo.luns_per_chnl) + lun;
|
||||
int lunid = (ch * dev->geo.nr_luns) + lun;
|
||||
|
||||
WARN_ON(!test_and_clear_bit(lunid,
|
||||
dev->lun_map));
|
||||
|
@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
|
|||
}
|
||||
|
||||
static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
|
||||
int lun_begin, int lun_end)
|
||||
u16 lun_begin, u16 lun_end,
|
||||
u16 op)
|
||||
{
|
||||
struct nvm_tgt_dev *tgt_dev = NULL;
|
||||
struct nvm_dev_map *dev_rmap = dev->rmap;
|
||||
|
@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
|
|||
struct ppa_addr *luns;
|
||||
int nr_luns = lun_end - lun_begin + 1;
|
||||
int luns_left = nr_luns;
|
||||
int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
|
||||
int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
|
||||
int bch = lun_begin / dev->geo.luns_per_chnl;
|
||||
int blun = lun_begin % dev->geo.luns_per_chnl;
|
||||
int nr_chnls = nr_luns / dev->geo.nr_luns;
|
||||
int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
|
||||
int bch = lun_begin / dev->geo.nr_luns;
|
||||
int blun = lun_begin % dev->geo.nr_luns;
|
||||
int lunid = 0;
|
||||
int lun_balanced = 1;
|
||||
int prev_nr_luns;
|
||||
|
@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
|
|||
if (!luns)
|
||||
goto err_luns;
|
||||
|
||||
prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
|
||||
dev->geo.luns_per_chnl : luns_left;
|
||||
prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
|
||||
dev->geo.nr_luns : luns_left;
|
||||
for (i = 0; i < nr_chnls; i++) {
|
||||
struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
|
||||
int *lun_roffs = ch_rmap->lun_offs;
|
||||
struct nvm_ch_map *ch_map = &dev_map->chnls[i];
|
||||
int *lun_offs;
|
||||
int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
|
||||
dev->geo.luns_per_chnl : luns_left;
|
||||
int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
|
||||
dev->geo.nr_luns : luns_left;
|
||||
|
||||
if (lun_balanced && prev_nr_luns != luns_in_chnl)
|
||||
lun_balanced = 0;
|
||||
|
@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
|
|||
memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
|
||||
/* Target device only owns a portion of the physical device */
|
||||
tgt_dev->geo.nr_chnls = nr_chnls;
|
||||
tgt_dev->geo.nr_luns = nr_luns;
|
||||
tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
|
||||
tgt_dev->geo.all_luns = nr_luns;
|
||||
tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
|
||||
tgt_dev->geo.op = op;
|
||||
tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
|
||||
tgt_dev->q = dev->q;
|
||||
tgt_dev->map = dev_map;
|
||||
|
@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = {
|
|||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
|
||||
static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
|
||||
{
|
||||
struct nvm_tgt_type *tmp, *tt = NULL;
|
||||
struct nvm_tgt_type *tt;
|
||||
|
||||
if (lock)
|
||||
down_write(&nvm_tgtt_lock);
|
||||
list_for_each_entry(tt, &nvm_tgt_types, list)
|
||||
if (!strcmp(name, tt->name))
|
||||
return tt;
|
||||
|
||||
list_for_each_entry(tmp, &nvm_tgt_types, list)
|
||||
if (!strcmp(name, tmp->name)) {
|
||||
tt = tmp;
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct nvm_tgt_type *nvm_find_target_type(const char *name)
|
||||
{
|
||||
struct nvm_tgt_type *tt;
|
||||
|
||||
down_write(&nvm_tgtt_lock);
|
||||
tt = __nvm_find_target_type(name);
|
||||
up_write(&nvm_tgtt_lock);
|
||||
|
||||
if (lock)
|
||||
up_write(&nvm_tgtt_lock);
|
||||
return tt;
|
||||
}
|
||||
|
||||
static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
|
||||
int lun_end)
|
||||
{
|
||||
if (lun_begin > lun_end || lun_end >= geo->all_luns) {
|
||||
pr_err("nvm: lun out of bound (%u:%u > %u)\n",
|
||||
lun_begin, lun_end, geo->all_luns - 1);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __nvm_config_simple(struct nvm_dev *dev,
|
||||
struct nvm_ioctl_create_simple *s)
|
||||
{
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
if (s->lun_begin == -1 && s->lun_end == -1) {
|
||||
s->lun_begin = 0;
|
||||
s->lun_end = geo->all_luns - 1;
|
||||
}
|
||||
|
||||
return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
|
||||
}
|
||||
|
||||
static int __nvm_config_extended(struct nvm_dev *dev,
|
||||
struct nvm_ioctl_create_extended *e)
|
||||
{
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
|
||||
e->lun_begin = 0;
|
||||
e->lun_end = dev->geo.all_luns - 1;
|
||||
}
|
||||
|
||||
/* op not set falls into target's default */
|
||||
if (e->op == 0xFFFF)
|
||||
e->op = NVM_TARGET_DEFAULT_OP;
|
||||
|
||||
if (e->op < NVM_TARGET_MIN_OP ||
|
||||
e->op > NVM_TARGET_MAX_OP) {
|
||||
pr_err("nvm: invalid over provisioning value\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
|
||||
}
|
||||
|
||||
static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
|
||||
{
|
||||
struct nvm_ioctl_create_simple *s = &create->conf.s;
|
||||
struct nvm_ioctl_create_extended e;
|
||||
struct request_queue *tqueue;
|
||||
struct gendisk *tdisk;
|
||||
struct nvm_tgt_type *tt;
|
||||
|
@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
|
|||
void *targetdata;
|
||||
int ret;
|
||||
|
||||
tt = nvm_find_target_type(create->tgttype, 1);
|
||||
switch (create->conf.type) {
|
||||
case NVM_CONFIG_TYPE_SIMPLE:
|
||||
ret = __nvm_config_simple(dev, &create->conf.s);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
e.lun_begin = create->conf.s.lun_begin;
|
||||
e.lun_end = create->conf.s.lun_end;
|
||||
e.op = NVM_TARGET_DEFAULT_OP;
|
||||
break;
|
||||
case NVM_CONFIG_TYPE_EXTENDED:
|
||||
ret = __nvm_config_extended(dev, &create->conf.e);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
e = create->conf.e;
|
||||
break;
|
||||
default:
|
||||
pr_err("nvm: config type not valid\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
tt = nvm_find_target_type(create->tgttype);
|
||||
if (!tt) {
|
||||
pr_err("nvm: target type %s not found\n", create->tgttype);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
mutex_lock(&dev->mlock);
|
||||
t = nvm_find_target(dev, create->tgtname);
|
||||
if (t) {
|
||||
pr_err("nvm: target name already exists.\n");
|
||||
mutex_unlock(&dev->mlock);
|
||||
if (nvm_target_exists(create->tgtname)) {
|
||||
pr_err("nvm: target name already exists (%s)\n",
|
||||
create->tgtname);
|
||||
return -EINVAL;
|
||||
}
|
||||
mutex_unlock(&dev->mlock);
|
||||
|
||||
ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end);
|
||||
ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
|
|||
goto err_reserve;
|
||||
}
|
||||
|
||||
tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
|
||||
tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
|
||||
if (!tgt_dev) {
|
||||
pr_err("nvm: could not create target device\n");
|
||||
ret = -ENOMEM;
|
||||
|
@ -350,7 +441,7 @@ err_dev:
|
|||
err_t:
|
||||
kfree(t);
|
||||
err_reserve:
|
||||
nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
|
||||
nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev)
|
|||
for (i = 0; i < dev->geo.nr_chnls; i++) {
|
||||
struct nvm_ch_map *ch_rmap;
|
||||
int *lun_roffs;
|
||||
int luns_in_chnl = dev->geo.luns_per_chnl;
|
||||
int luns_in_chnl = dev->geo.nr_luns;
|
||||
|
||||
ch_rmap = &rmap->chnls[i];
|
||||
|
||||
|
@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
|||
nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
|
||||
}
|
||||
|
||||
void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
|
||||
int len)
|
||||
{
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct nvm_dev_map *dev_rmap = dev->rmap;
|
||||
u64 i;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
struct nvm_ch_map *ch_rmap;
|
||||
int *lun_roffs;
|
||||
struct ppa_addr gaddr;
|
||||
u64 pba = le64_to_cpu(entries[i]);
|
||||
u64 diff;
|
||||
|
||||
if (!pba)
|
||||
continue;
|
||||
|
||||
gaddr = linear_to_generic_addr(geo, pba);
|
||||
ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
|
||||
lun_roffs = ch_rmap->lun_offs;
|
||||
|
||||
diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
|
||||
(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
|
||||
|
||||
entries[i] -= cpu_to_le64(diff);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_part_to_tgt);
|
||||
|
||||
int nvm_register_tgt_type(struct nvm_tgt_type *tt)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
down_write(&nvm_tgtt_lock);
|
||||
if (nvm_find_target_type(tt->name, 0))
|
||||
if (__nvm_find_target_type(tt->name))
|
||||
ret = -EEXIST;
|
||||
else
|
||||
list_add(&tt->list, &nvm_tgt_types);
|
||||
|
@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
|||
}
|
||||
EXPORT_SYMBOL(nvm_submit_io_sync);
|
||||
|
||||
int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
|
||||
int nr_ppas)
|
||||
{
|
||||
struct nvm_geo *geo = &tgt_dev->geo;
|
||||
struct nvm_rq rqd;
|
||||
int ret;
|
||||
|
||||
memset(&rqd, 0, sizeof(struct nvm_rq));
|
||||
|
||||
rqd.opcode = NVM_OP_ERASE;
|
||||
rqd.flags = geo->plane_mode >> 1;
|
||||
|
||||
ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = nvm_submit_io_sync(tgt_dev, &rqd);
|
||||
if (ret) {
|
||||
pr_err("rrpr: erase I/O submission failed: %d\n", ret);
|
||||
goto free_ppa_list;
|
||||
}
|
||||
|
||||
free_ppa_list:
|
||||
nvm_free_rqd_ppalist(tgt_dev, &rqd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_erase_sync);
|
||||
|
||||
int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
|
||||
nvm_l2p_update_fn *update_l2p, void *priv)
|
||||
{
|
||||
struct nvm_dev *dev = tgt_dev->parent;
|
||||
|
||||
if (!dev->ops->get_l2p_tbl)
|
||||
return 0;
|
||||
|
||||
return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_get_l2p_tbl);
|
||||
|
||||
int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
|
||||
{
|
||||
struct nvm_dev *dev = tgt_dev->parent;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct nvm_area *area, *prev, *next;
|
||||
sector_t begin = 0;
|
||||
sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
|
||||
|
||||
if (len > max_sectors)
|
||||
return -EINVAL;
|
||||
|
||||
area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
|
||||
if (!area)
|
||||
return -ENOMEM;
|
||||
|
||||
prev = NULL;
|
||||
|
||||
spin_lock(&dev->lock);
|
||||
list_for_each_entry(next, &dev->area_list, list) {
|
||||
if (begin + len > next->begin) {
|
||||
begin = next->end;
|
||||
prev = next;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if ((begin + len) > max_sectors) {
|
||||
spin_unlock(&dev->lock);
|
||||
kfree(area);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
area->begin = *lba = begin;
|
||||
area->end = begin + len;
|
||||
|
||||
if (prev) /* insert into sorted order */
|
||||
list_add(&area->list, &prev->list);
|
||||
else
|
||||
list_add(&area->list, &dev->area_list);
|
||||
spin_unlock(&dev->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_get_area);
|
||||
|
||||
void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
|
||||
{
|
||||
struct nvm_dev *dev = tgt_dev->parent;
|
||||
struct nvm_area *area;
|
||||
|
||||
spin_lock(&dev->lock);
|
||||
list_for_each_entry(area, &dev->area_list, list) {
|
||||
if (area->begin != begin)
|
||||
continue;
|
||||
|
||||
list_del(&area->list);
|
||||
spin_unlock(&dev->lock);
|
||||
kfree(area);
|
||||
return;
|
||||
}
|
||||
spin_unlock(&dev->lock);
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_put_area);
|
||||
|
||||
void nvm_end_io(struct nvm_rq *rqd)
|
||||
{
|
||||
struct nvm_tgt_dev *tgt_dev = rqd->dev;
|
||||
|
@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
|
|||
struct nvm_geo *geo = &dev->geo;
|
||||
int blk, offset, pl, blktype;
|
||||
|
||||
if (nr_blks != geo->blks_per_lun * geo->plane_mode)
|
||||
if (nr_blks != geo->nr_chks * geo->plane_mode)
|
||||
return -EINVAL;
|
||||
|
||||
for (blk = 0; blk < geo->blks_per_lun; blk++) {
|
||||
for (blk = 0; blk < geo->nr_chks; blk++) {
|
||||
offset = blk * geo->plane_mode;
|
||||
blktype = blks[offset];
|
||||
|
||||
|
@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
|
|||
blks[blk] = blktype;
|
||||
}
|
||||
|
||||
return geo->blks_per_lun;
|
||||
return geo->nr_chks;
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_bb_tbl_fold);
|
||||
|
||||
|
@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
|
|||
}
|
||||
EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
|
||||
|
||||
static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
|
||||
{
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
int i;
|
||||
|
||||
dev->lps_per_blk = geo->pgs_per_blk;
|
||||
dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
|
||||
if (!dev->lptbl)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Just a linear array */
|
||||
for (i = 0; i < dev->lps_per_blk; i++)
|
||||
dev->lptbl[i] = i;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
|
||||
{
|
||||
int i, p;
|
||||
struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
|
||||
|
||||
if (!mlc->num_pairs)
|
||||
return 0;
|
||||
|
||||
dev->lps_per_blk = mlc->num_pairs;
|
||||
dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
|
||||
if (!dev->lptbl)
|
||||
return -ENOMEM;
|
||||
|
||||
/* The lower page table encoding consists of a list of bytes, where each
|
||||
* has a lower and an upper half. The first half byte maintains the
|
||||
* increment value and every value after is an offset added to the
|
||||
* previous incrementation value
|
||||
*/
|
||||
dev->lptbl[0] = mlc->pairs[0] & 0xF;
|
||||
for (i = 1; i < dev->lps_per_blk; i++) {
|
||||
p = mlc->pairs[i >> 1];
|
||||
if (i & 0x1) /* upper */
|
||||
dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
|
||||
else /* lower */
|
||||
dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvm_core_init(struct nvm_dev *dev)
|
||||
{
|
||||
struct nvm_id *id = &dev->identity;
|
||||
|
@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev)
|
|||
struct nvm_geo *geo = &dev->geo;
|
||||
int ret;
|
||||
|
||||
/* Whole device values */
|
||||
geo->nr_chnls = grp->num_ch;
|
||||
geo->luns_per_chnl = grp->num_lun;
|
||||
|
||||
/* Generic device values */
|
||||
geo->pgs_per_blk = grp->num_pg;
|
||||
geo->blks_per_lun = grp->num_blk;
|
||||
geo->nr_planes = grp->num_pln;
|
||||
geo->fpg_size = grp->fpg_sz;
|
||||
geo->pfpg_size = grp->fpg_sz * grp->num_pln;
|
||||
geo->sec_size = grp->csecs;
|
||||
geo->oob_size = grp->sos;
|
||||
geo->sec_per_pg = grp->fpg_sz / grp->csecs;
|
||||
geo->mccap = grp->mccap;
|
||||
memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
|
||||
|
||||
geo->plane_mode = NVM_PLANE_SINGLE;
|
||||
geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
|
||||
|
||||
if (grp->mpos & 0x020202)
|
||||
geo->plane_mode = NVM_PLANE_DOUBLE;
|
||||
if (grp->mpos & 0x040404)
|
||||
geo->plane_mode = NVM_PLANE_QUAD;
|
||||
|
||||
if (grp->mtype != 0) {
|
||||
pr_err("nvm: memory type not supported\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* calculated values */
|
||||
geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
|
||||
geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk;
|
||||
geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
|
||||
geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
|
||||
/* Whole device values */
|
||||
geo->nr_chnls = grp->num_ch;
|
||||
geo->nr_luns = grp->num_lun;
|
||||
|
||||
dev->total_secs = geo->nr_luns * geo->sec_per_lun;
|
||||
dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns),
|
||||
/* Generic device geometry values */
|
||||
geo->ws_min = grp->ws_min;
|
||||
geo->ws_opt = grp->ws_opt;
|
||||
geo->ws_seq = grp->ws_seq;
|
||||
geo->ws_per_chk = grp->ws_per_chk;
|
||||
geo->nr_chks = grp->num_chk;
|
||||
geo->sec_size = grp->csecs;
|
||||
geo->oob_size = grp->sos;
|
||||
geo->mccap = grp->mccap;
|
||||
geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
|
||||
|
||||
geo->sec_per_chk = grp->clba;
|
||||
geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
|
||||
geo->all_luns = geo->nr_luns * geo->nr_chnls;
|
||||
|
||||
/* 1.2 spec device geometry values */
|
||||
geo->plane_mode = 1 << geo->ws_seq;
|
||||
geo->nr_planes = geo->ws_opt / geo->ws_min;
|
||||
geo->sec_per_pg = geo->ws_min;
|
||||
geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
|
||||
|
||||
dev->total_secs = geo->all_luns * geo->sec_per_lun;
|
||||
dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
|
||||
sizeof(unsigned long), GFP_KERNEL);
|
||||
if (!dev->lun_map)
|
||||
return -ENOMEM;
|
||||
|
||||
switch (grp->fmtype) {
|
||||
case NVM_ID_FMTYPE_SLC:
|
||||
if (nvm_init_slc_tbl(dev, grp)) {
|
||||
ret = -ENOMEM;
|
||||
goto err_fmtype;
|
||||
}
|
||||
break;
|
||||
case NVM_ID_FMTYPE_MLC:
|
||||
if (nvm_init_mlc_tbl(dev, grp)) {
|
||||
ret = -ENOMEM;
|
||||
goto err_fmtype;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
pr_err("nvm: flash type not supported\n");
|
||||
ret = -EINVAL;
|
||||
goto err_fmtype;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&dev->area_list);
|
||||
INIT_LIST_HEAD(&dev->targets);
|
||||
mutex_init(&dev->mlock);
|
||||
|
@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev)
|
|||
dev->ops->destroy_dma_pool(dev->dma_pool);
|
||||
|
||||
nvm_unregister_map(dev);
|
||||
kfree(dev->lptbl);
|
||||
kfree(dev->lun_map);
|
||||
kfree(dev);
|
||||
}
|
||||
|
@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev)
|
|||
|
||||
pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
|
||||
dev->name, geo->sec_per_pg, geo->nr_planes,
|
||||
geo->pgs_per_blk, geo->blks_per_lun,
|
||||
geo->nr_luns, geo->nr_chnls);
|
||||
geo->ws_per_chk, geo->nr_chks,
|
||||
geo->all_luns, geo->nr_chnls);
|
||||
return 0;
|
||||
err:
|
||||
pr_err("nvm: failed to initialize nvm\n");
|
||||
|
@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
|
|||
static int __nvm_configure_create(struct nvm_ioctl_create *create)
|
||||
{
|
||||
struct nvm_dev *dev;
|
||||
struct nvm_ioctl_create_simple *s;
|
||||
|
||||
down_write(&nvm_lock);
|
||||
dev = nvm_find_nvm_dev(create->dev);
|
||||
|
@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
|
||||
pr_err("nvm: config type not valid\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
s = &create->conf.s;
|
||||
|
||||
if (s->lun_begin == -1 && s->lun_end == -1) {
|
||||
s->lun_begin = 0;
|
||||
s->lun_end = dev->geo.nr_luns - 1;
|
||||
}
|
||||
|
||||
if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
|
||||
pr_err("nvm: lun out of bound (%u:%u > %u)\n",
|
||||
s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return nvm_create_tgt(dev, create);
|
||||
}
|
||||
|
||||
|
@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
|
|||
if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
|
||||
return -EFAULT;
|
||||
|
||||
if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
|
||||
create.conf.e.rsv != 0) {
|
||||
pr_err("nvm: reserved config field in use\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
create.dev[DISK_NAME_LEN - 1] = '\0';
|
||||
create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
|
||||
create.tgtname[DISK_NAME_LEN - 1] = '\0';
|
||||
|
|
|
@ -19,12 +19,16 @@
|
|||
|
||||
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
|
||||
{
|
||||
struct request_queue *q = pblk->dev->q;
|
||||
struct pblk_w_ctx w_ctx;
|
||||
sector_t lba = pblk_get_lba(bio);
|
||||
unsigned long start_time = jiffies;
|
||||
unsigned int bpos, pos;
|
||||
int nr_entries = pblk_get_secs(bio);
|
||||
int i, ret;
|
||||
|
||||
generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
|
||||
|
||||
/* Update the write buffer head (mem) with the entries that we can
|
||||
* write. The write in itself cannot fail, so there is no need to
|
||||
* rollback from here on.
|
||||
|
@ -67,6 +71,7 @@ retry:
|
|||
pblk_rl_inserted(&pblk->rl, nr_entries);
|
||||
|
||||
out:
|
||||
generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
|
||||
pblk_write_should_kick(pblk);
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
|
|||
struct pblk_line *line;
|
||||
int pos;
|
||||
|
||||
line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
|
||||
pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
|
||||
line = &pblk->lines[pblk_ppa_to_line(*ppa)];
|
||||
pos = pblk_ppa_to_pos(&dev->geo, *ppa);
|
||||
|
||||
pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
|
||||
line->id, pos);
|
||||
|
@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
|
|||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
int pos = pblk_dev_ppa_to_pos(geo, *ppa);
|
||||
int pos = pblk_ppa_to_pos(geo, *ppa);
|
||||
|
||||
pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
|
||||
atomic_long_inc(&pblk->erase_failed);
|
||||
|
@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
{
|
||||
struct pblk_line *line;
|
||||
|
||||
line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
|
||||
line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
|
||||
atomic_dec(&line->left_seblks);
|
||||
|
||||
if (rqd->error) {
|
||||
|
@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
|
|||
BUG_ON(pblk_ppa_empty(ppa));
|
||||
#endif
|
||||
|
||||
line_id = pblk_tgt_ppa_to_line(ppa);
|
||||
line_id = pblk_ppa_to_line(ppa);
|
||||
line = &pblk->lines[line_id];
|
||||
paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
|
||||
|
||||
|
@ -650,7 +650,7 @@ next_rq:
|
|||
} else {
|
||||
for (i = 0; i < rqd.nr_ppas; ) {
|
||||
struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
|
||||
int pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
int pos = pblk_ppa_to_pos(geo, ppa);
|
||||
int read_type = PBLK_READ_RANDOM;
|
||||
|
||||
if (pblk_io_aligned(pblk, rq_ppas))
|
||||
|
@ -668,7 +668,7 @@ next_rq:
|
|||
}
|
||||
|
||||
ppa = addr_to_gen_ppa(pblk, paddr, id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
}
|
||||
|
||||
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
|
||||
|
@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
|
|||
cmd_op = NVM_OP_PWRITE;
|
||||
flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
|
||||
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
|
||||
} else if (dir == PBLK_READ) {
|
||||
} else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
|
||||
bio_op = REQ_OP_READ;
|
||||
cmd_op = NVM_OP_PREAD;
|
||||
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
|
||||
|
@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
|
|||
if (rqd.error) {
|
||||
if (dir == PBLK_WRITE)
|
||||
pblk_log_write_err(pblk, &rqd);
|
||||
else
|
||||
else if (dir == PBLK_READ)
|
||||
pblk_log_read_err(pblk, &rqd);
|
||||
}
|
||||
|
||||
|
@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
|
|||
{
|
||||
u64 bpaddr = pblk_line_smeta_start(pblk, line);
|
||||
|
||||
return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
|
||||
return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
|
||||
}
|
||||
|
||||
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
|
||||
|
@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
|
|||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
pr_err("pblk: could not sync erase line:%d,blk:%d\n",
|
||||
pblk_dev_ppa_to_line(ppa),
|
||||
pblk_dev_ppa_to_pos(geo, ppa));
|
||||
pblk_ppa_to_line(ppa),
|
||||
pblk_ppa_to_pos(geo, ppa));
|
||||
|
||||
rqd.error = ret;
|
||||
goto out;
|
||||
|
@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
|
|||
|
||||
/* Start metadata */
|
||||
smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
|
||||
smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns);
|
||||
smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
|
||||
|
||||
/* Fill metadata among lines */
|
||||
if (cur) {
|
||||
|
@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
|
|||
lm->sec_per_line);
|
||||
bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
|
||||
lm->sec_per_line);
|
||||
line->sec_in_line -= geo->sec_per_blk;
|
||||
line->sec_in_line -= geo->sec_per_chk;
|
||||
if (bit >= lm->emeta_bb)
|
||||
nr_bb++;
|
||||
}
|
||||
|
@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
|
|||
}
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
pblk_rl_free_lines_dec(&pblk->rl, line);
|
||||
pblk_rl_free_lines_dec(&pblk->rl, line, true);
|
||||
|
||||
if (!pblk_line_init_bb(pblk, line, 0)) {
|
||||
list_add(&line->list, &l_mg->free_list);
|
||||
|
@ -1233,7 +1233,7 @@ retry:
|
|||
l_mg->data_line = retry_line;
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
pblk_rl_free_lines_dec(&pblk->rl, retry_line);
|
||||
pblk_rl_free_lines_dec(&pblk->rl, line, false);
|
||||
|
||||
if (pblk_line_erase(pblk, retry_line))
|
||||
goto retry;
|
||||
|
@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
|
|||
{
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line *line;
|
||||
int is_next = 0;
|
||||
|
||||
spin_lock(&l_mg->free_lock);
|
||||
line = pblk_line_get(pblk);
|
||||
|
@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
|
|||
} else {
|
||||
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
|
||||
l_mg->data_next->type = PBLK_LINETYPE_DATA;
|
||||
is_next = 1;
|
||||
}
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
|
@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
pblk_rl_free_lines_dec(&pblk->rl, line);
|
||||
if (is_next)
|
||||
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
|
||||
|
||||
retry_setup:
|
||||
if (!pblk_line_init_metadata(pblk, line, NULL)) {
|
||||
line = pblk_line_retry(pblk, line);
|
||||
|
@ -1311,6 +1305,8 @@ retry_setup:
|
|||
goto retry_setup;
|
||||
}
|
||||
|
||||
pblk_rl_free_lines_dec(&pblk->rl, line, true);
|
||||
|
||||
return line;
|
||||
}
|
||||
|
||||
|
@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
|
|||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line *cur, *new = NULL;
|
||||
unsigned int left_seblks;
|
||||
int is_next = 0;
|
||||
|
||||
cur = l_mg->data_line;
|
||||
new = l_mg->data_next;
|
||||
|
@ -1444,6 +1439,8 @@ retry_setup:
|
|||
goto retry_setup;
|
||||
}
|
||||
|
||||
pblk_rl_free_lines_dec(&pblk->rl, new, true);
|
||||
|
||||
/* Allocate next line for preparation */
|
||||
spin_lock(&l_mg->free_lock);
|
||||
l_mg->data_next = pblk_line_get(pblk);
|
||||
|
@ -1457,13 +1454,9 @@ retry_setup:
|
|||
} else {
|
||||
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
|
||||
l_mg->data_next->type = PBLK_LINETYPE_DATA;
|
||||
is_next = 1;
|
||||
}
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
if (is_next)
|
||||
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
|
||||
|
||||
out:
|
||||
return new;
|
||||
}
|
||||
|
@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
|
|||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
pr_err("pblk: could not async erase line:%d,blk:%d\n",
|
||||
pblk_dev_ppa_to_line(ppa),
|
||||
pblk_dev_ppa_to_pos(geo, ppa));
|
||||
pblk_ppa_to_line(ppa),
|
||||
pblk_ppa_to_pos(geo, ppa));
|
||||
}
|
||||
|
||||
return err;
|
||||
|
@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
|
|||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct pblk_lun *rlun;
|
||||
int nr_luns = geo->nr_luns;
|
||||
int nr_luns = geo->all_luns;
|
||||
int bit = -1;
|
||||
|
||||
while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
|
||||
|
@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
|
|||
|
||||
/* If the L2P entry maps to a line, the reference is valid */
|
||||
if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
|
||||
int line_id = pblk_dev_ppa_to_line(ppa);
|
||||
int line_id = pblk_ppa_to_line(ppa);
|
||||
struct pblk_line *line = &pblk->lines[line_id];
|
||||
|
||||
kref_get(&line->ref);
|
||||
|
|
|
@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
|||
* the line untouched. TODO: Implement a recovery routine that scans and
|
||||
* moves all sectors on the line.
|
||||
*/
|
||||
lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
|
||||
|
||||
ret = pblk_recov_check_emeta(pblk, emeta_buf);
|
||||
if (ret) {
|
||||
pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
|
||||
goto fail_free_emeta;
|
||||
}
|
||||
|
||||
lba_list = emeta_to_lbas(pblk, emeta_buf);
|
||||
if (!lba_list) {
|
||||
pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
|
||||
goto fail_free_emeta;
|
||||
|
@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If flush_wq == 1 then no lock should be held by the caller since
|
||||
* flush_workqueue can sleep
|
||||
*/
|
||||
static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
|
||||
{
|
||||
pblk->gc.gc_active = 0;
|
||||
pr_debug("pblk: gc stop\n");
|
||||
}
|
||||
|
||||
void pblk_gc_should_stop(struct pblk *pblk)
|
||||
{
|
||||
struct pblk_gc *gc = &pblk->gc;
|
||||
|
||||
if (gc->gc_active && !gc->gc_forced)
|
||||
pblk_gc_stop(pblk, 0);
|
||||
gc->gc_active = 0;
|
||||
}
|
||||
|
||||
void pblk_gc_should_kick(struct pblk *pblk)
|
||||
|
@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk)
|
|||
|
||||
gc->gc_enabled = 0;
|
||||
del_timer_sync(&gc->gc_timer);
|
||||
pblk_gc_stop(pblk, 1);
|
||||
gc->gc_active = 0;
|
||||
|
||||
if (gc->gc_ts)
|
||||
kthread_stop(gc->gc_ts);
|
||||
|
|
|
@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk)
|
|||
}
|
||||
ppaf.ch_len = power_len;
|
||||
|
||||
power_len = get_count_order(geo->luns_per_chnl);
|
||||
if (1 << power_len != geo->luns_per_chnl) {
|
||||
power_len = get_count_order(geo->nr_luns);
|
||||
if (1 << power_len != geo->nr_luns) {
|
||||
pr_err("pblk: supports only power-of-two LUN config.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk)
|
|||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
|
||||
geo->nr_planes * geo->nr_luns;
|
||||
geo->nr_planes * geo->all_luns;
|
||||
|
||||
if (pblk_init_global_caches(pblk))
|
||||
return -ENOMEM;
|
||||
|
@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk)
|
|||
if (!pblk->gen_ws_pool)
|
||||
goto free_page_bio_pool;
|
||||
|
||||
pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
|
||||
pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
|
||||
pblk_rec_cache);
|
||||
if (!pblk->rec_pool)
|
||||
goto free_gen_ws_pool;
|
||||
|
||||
pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns,
|
||||
pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
|
||||
pblk_g_rq_cache);
|
||||
if (!pblk->r_rq_pool)
|
||||
goto free_rec_pool;
|
||||
|
||||
pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns,
|
||||
pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
|
||||
pblk_g_rq_cache);
|
||||
if (!pblk->e_rq_pool)
|
||||
goto free_r_rq_pool;
|
||||
|
||||
pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns,
|
||||
pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
|
||||
pblk_w_rq_cache);
|
||||
if (!pblk->w_rq_pool)
|
||||
goto free_e_rq_pool;
|
||||
|
@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk)
|
|||
mempool_destroy(pblk->e_rq_pool);
|
||||
mempool_destroy(pblk->w_rq_pool);
|
||||
|
||||
pblk_rwb_free(pblk);
|
||||
|
||||
pblk_free_global_caches(pblk);
|
||||
}
|
||||
|
||||
|
@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
|
|||
u8 *blks;
|
||||
int nr_blks, ret;
|
||||
|
||||
nr_blks = geo->blks_per_lun * geo->plane_mode;
|
||||
nr_blks = geo->nr_chks * geo->plane_mode;
|
||||
blks = kmalloc(nr_blks, GFP_KERNEL);
|
||||
if (!blks)
|
||||
return -ENOMEM;
|
||||
|
@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
|
|||
int i, ret;
|
||||
|
||||
/* TODO: Implement unbalanced LUN support */
|
||||
if (geo->luns_per_chnl < 0) {
|
||||
if (geo->nr_luns < 0) {
|
||||
pr_err("pblk: unbalanced LUN config.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
|
||||
pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
|
||||
GFP_KERNEL);
|
||||
if (!pblk->luns)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < geo->nr_luns; i++) {
|
||||
for (i = 0; i < geo->all_luns; i++) {
|
||||
/* Stripe across channels */
|
||||
int ch = i % geo->nr_chnls;
|
||||
int lun_raw = i / geo->nr_chnls;
|
||||
int lunid = lun_raw + ch * geo->luns_per_chnl;
|
||||
int lunid = lun_raw + ch * geo->nr_luns;
|
||||
|
||||
rlun = &pblk->luns[i];
|
||||
rlun->bppa = luns[lunid];
|
||||
|
@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
|
|||
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
sector_t provisioned;
|
||||
int sec_meta, blk_meta;
|
||||
|
||||
pblk->over_pct = 20;
|
||||
if (geo->op == NVM_TARGET_DEFAULT_OP)
|
||||
pblk->op = PBLK_DEFAULT_OP;
|
||||
else
|
||||
pblk->op = geo->op;
|
||||
|
||||
provisioned = nr_free_blks;
|
||||
provisioned *= (100 - pblk->over_pct);
|
||||
provisioned *= (100 - pblk->op);
|
||||
sector_div(provisioned, 100);
|
||||
|
||||
pblk->op_blks = nr_free_blks - provisioned;
|
||||
|
||||
/* Internally pblk manages all free blocks, but all calculations based
|
||||
* on user capacity consider only provisioned blocks
|
||||
*/
|
||||
pblk->rl.total_blocks = nr_free_blks;
|
||||
pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
|
||||
pblk->capacity = provisioned * geo->sec_per_blk;
|
||||
pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
|
||||
|
||||
/* Consider sectors used for metadata */
|
||||
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
|
||||
blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
|
||||
|
||||
pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
|
||||
|
||||
atomic_set(&pblk->rl.free_blocks, nr_free_blks);
|
||||
atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
|
||||
}
|
||||
|
||||
static int pblk_lines_alloc_metadata(struct pblk *pblk)
|
||||
|
@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk)
|
|||
int i, ret;
|
||||
|
||||
pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
|
||||
max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
|
||||
max_write_ppas = pblk->min_write_pgs * geo->all_luns;
|
||||
pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
|
||||
max_write_ppas : nvm_max_phys_sects(dev);
|
||||
pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
|
||||
|
@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
|
||||
div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
|
||||
if (mod) {
|
||||
pr_err("pblk: bad configuration of sectors/pages\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
l_mg->nr_lines = geo->blks_per_lun;
|
||||
l_mg->nr_lines = geo->nr_chks;
|
||||
l_mg->log_line = l_mg->data_line = NULL;
|
||||
l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
|
||||
l_mg->nr_free_lines = 0;
|
||||
bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
|
||||
|
||||
lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
|
||||
lm->blk_per_line = geo->nr_luns;
|
||||
lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
|
||||
lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
|
||||
lm->blk_per_line = geo->all_luns;
|
||||
lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
|
||||
lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
|
||||
lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
|
||||
lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
|
||||
lm->mid_thrs = lm->sec_per_line / 2;
|
||||
lm->high_thrs = lm->sec_per_line / 4;
|
||||
lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs;
|
||||
lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
|
||||
|
||||
/* Calculate necessary pages for smeta. See comment over struct
|
||||
* line_smeta definition
|
||||
|
@ -742,12 +761,12 @@ add_emeta_page:
|
|||
goto add_emeta_page;
|
||||
}
|
||||
|
||||
lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0;
|
||||
lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
|
||||
|
||||
lm->min_blk_line = 1;
|
||||
if (geo->nr_luns > 1)
|
||||
if (geo->all_luns > 1)
|
||||
lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
|
||||
lm->emeta_sec[0], geo->sec_per_blk);
|
||||
lm->emeta_sec[0], geo->sec_per_chk);
|
||||
|
||||
if (lm->min_blk_line > lm->blk_per_line) {
|
||||
pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
|
||||
|
@ -772,7 +791,7 @@ add_emeta_page:
|
|||
goto fail_free_bb_template;
|
||||
}
|
||||
|
||||
bb_distance = (geo->nr_luns) * geo->sec_per_pl;
|
||||
bb_distance = (geo->all_luns) * geo->sec_per_pl;
|
||||
for (i = 0; i < lm->sec_per_line; i += bb_distance)
|
||||
bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
|
||||
|
||||
|
@ -844,7 +863,7 @@ add_emeta_page:
|
|||
pblk_set_provision(pblk, nr_free_blks);
|
||||
|
||||
/* Cleanup per-LUN bad block lists - managed within lines on run-time */
|
||||
for (i = 0; i < geo->nr_luns; i++)
|
||||
for (i = 0; i < geo->all_luns; i++)
|
||||
kfree(pblk->luns[i].bb_list);
|
||||
|
||||
return 0;
|
||||
|
@ -858,7 +877,7 @@ fail_free_bb_template:
|
|||
fail_free_meta:
|
||||
pblk_line_meta_free(pblk);
|
||||
fail:
|
||||
for (i = 0; i < geo->nr_luns; i++)
|
||||
for (i = 0; i < geo->all_luns; i++)
|
||||
kfree(pblk->luns[i].bb_list);
|
||||
|
||||
return ret;
|
||||
|
@ -866,15 +885,19 @@ fail:
|
|||
|
||||
static int pblk_writer_init(struct pblk *pblk)
|
||||
{
|
||||
timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
|
||||
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
|
||||
|
||||
pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
|
||||
if (IS_ERR(pblk->writer_ts)) {
|
||||
pr_err("pblk: could not allocate writer kthread\n");
|
||||
return PTR_ERR(pblk->writer_ts);
|
||||
int err = PTR_ERR(pblk->writer_ts);
|
||||
|
||||
if (err != -EINTR)
|
||||
pr_err("pblk: could not allocate writer kthread (%d)\n",
|
||||
err);
|
||||
return err;
|
||||
}
|
||||
|
||||
timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
|
||||
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk)
|
|||
pblk_pipeline_stop(pblk);
|
||||
pblk_writer_stop(pblk);
|
||||
pblk_rb_sync_l2p(&pblk->rwb);
|
||||
pblk_rwb_free(pblk);
|
||||
pblk_rl_free(&pblk->rl);
|
||||
|
||||
pr_debug("pblk: consistent tear down\n");
|
||||
|
@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
|||
|
||||
ret = pblk_writer_init(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize write thread\n");
|
||||
if (ret != -EINTR)
|
||||
pr_err("pblk: could not initialize write thread\n");
|
||||
goto fail_free_lines;
|
||||
}
|
||||
|
||||
|
@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
|||
|
||||
blk_queue_write_cache(tqueue, true, false);
|
||||
|
||||
tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
|
||||
tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
|
||||
tqueue->limits.discard_alignment = 0;
|
||||
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
|
||||
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
|
||||
|
||||
pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
|
||||
geo->nr_luns, pblk->l_mg.nr_lines,
|
||||
pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
|
||||
tdisk->disk_name,
|
||||
geo->all_luns, pblk->l_mg.nr_lines,
|
||||
(unsigned long long)pblk->rl.nr_secs,
|
||||
pblk->rwb.nr_entries);
|
||||
|
||||
|
|
|
@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
|
|||
return;
|
||||
|
||||
/* Erase blocks that are bad in this line but might not be in next */
|
||||
if (unlikely(ppa_empty(*erase_ppa)) &&
|
||||
if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
|
||||
bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
|
||||
int bit = -1;
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
|
|||
rb->seg_size = (1 << power_seg_sz);
|
||||
rb->nr_entries = (1 << power_size);
|
||||
rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
|
||||
rb->sync_point = EMPTY_ENTRY;
|
||||
rb->flush_point = EMPTY_ENTRY;
|
||||
|
||||
spin_lock_init(&rb->w_lock);
|
||||
spin_lock_init(&rb->s_lock);
|
||||
|
@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
|
|||
up_write(&pblk_rb_lock);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_set(&rb->inflight_sync_point, 0);
|
||||
atomic_set(&rb->inflight_flush_point, 0);
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
|
|||
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
|
||||
entry->cacheline);
|
||||
|
||||
line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
|
||||
line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
|
||||
kref_put(&line->ref, pblk_line_put);
|
||||
clean_wctx(w_ctx);
|
||||
rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
|
||||
|
@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
|
|||
smp_store_release(&entry->w_ctx.flags, flags);
|
||||
}
|
||||
|
||||
static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
|
||||
static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
|
||||
unsigned int pos)
|
||||
{
|
||||
struct pblk_rb_entry *entry;
|
||||
unsigned int subm, sync_point;
|
||||
unsigned int sync, flush_point;
|
||||
|
||||
subm = READ_ONCE(rb->subm);
|
||||
sync = READ_ONCE(rb->sync);
|
||||
|
||||
if (pos == sync)
|
||||
return 0;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_inc(&rb->inflight_sync_point);
|
||||
atomic_inc(&rb->inflight_flush_point);
|
||||
#endif
|
||||
|
||||
if (pos == subm)
|
||||
return 0;
|
||||
flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
|
||||
entry = &rb->entries[flush_point];
|
||||
|
||||
sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
|
||||
entry = &rb->entries[sync_point];
|
||||
pblk_rb_sync_init(rb, NULL);
|
||||
|
||||
/* Protect syncs */
|
||||
smp_store_release(&rb->sync_point, sync_point);
|
||||
/* Protect flush points */
|
||||
smp_store_release(&rb->flush_point, flush_point);
|
||||
|
||||
if (!bio)
|
||||
return 0;
|
||||
if (bio)
|
||||
bio_list_add(&entry->w_ctx.bios, bio);
|
||||
|
||||
spin_lock_irq(&rb->s_lock);
|
||||
bio_list_add(&entry->w_ctx.bios, bio);
|
||||
spin_unlock_irq(&rb->s_lock);
|
||||
pblk_rb_sync_end(rb, NULL);
|
||||
|
||||
return 1;
|
||||
return bio ? 1 : 0;
|
||||
}
|
||||
|
||||
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
|
||||
|
@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
|
|||
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
||||
unsigned int mem = READ_ONCE(rb->mem);
|
||||
|
||||
if (pblk_rb_sync_point_set(rb, NULL, mem))
|
||||
if (pblk_rb_flush_point_set(rb, NULL, mem))
|
||||
return;
|
||||
|
||||
pblk_write_should_kick(pblk);
|
||||
|
@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
|
|||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_long_inc(&pblk->nr_flush);
|
||||
#endif
|
||||
if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
|
||||
if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
|
||||
*io_ret = NVM_IO_OK;
|
||||
}
|
||||
|
||||
|
@ -606,21 +606,6 @@ try:
|
|||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
if (flags & PBLK_FLUSH_ENTRY) {
|
||||
unsigned int sync_point;
|
||||
|
||||
sync_point = READ_ONCE(rb->sync_point);
|
||||
if (sync_point == pos) {
|
||||
/* Protect syncs */
|
||||
smp_store_release(&rb->sync_point, EMPTY_ENTRY);
|
||||
}
|
||||
|
||||
flags &= ~PBLK_FLUSH_ENTRY;
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_dec(&rb->inflight_sync_point);
|
||||
#endif
|
||||
}
|
||||
|
||||
flags &= ~PBLK_WRITTEN_DATA;
|
||||
flags |= PBLK_SUBMITTED_ENTRY;
|
||||
|
||||
|
@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
|
|||
|
||||
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
|
||||
{
|
||||
unsigned int sync;
|
||||
unsigned int i;
|
||||
|
||||
unsigned int sync, flush_point;
|
||||
lockdep_assert_held(&rb->s_lock);
|
||||
|
||||
sync = READ_ONCE(rb->sync);
|
||||
flush_point = READ_ONCE(rb->flush_point);
|
||||
|
||||
for (i = 0; i < nr_entries; i++)
|
||||
sync = (sync + 1) & (rb->nr_entries - 1);
|
||||
if (flush_point != EMPTY_ENTRY) {
|
||||
unsigned int secs_to_flush;
|
||||
|
||||
secs_to_flush = pblk_rb_ring_count(flush_point, sync,
|
||||
rb->nr_entries);
|
||||
if (secs_to_flush < nr_entries) {
|
||||
/* Protect flush points */
|
||||
smp_store_release(&rb->flush_point, EMPTY_ENTRY);
|
||||
}
|
||||
}
|
||||
|
||||
sync = (sync + nr_entries) & (rb->nr_entries - 1);
|
||||
|
||||
/* Protect from counts */
|
||||
smp_store_release(&rb->sync, sync);
|
||||
|
@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
|
|||
return sync;
|
||||
}
|
||||
|
||||
unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
|
||||
/* Calculate how many sectors to submit up to the current flush point. */
|
||||
unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
|
||||
{
|
||||
unsigned int subm, sync_point;
|
||||
unsigned int count;
|
||||
unsigned int subm, sync, flush_point;
|
||||
unsigned int submitted, to_flush;
|
||||
|
||||
/* Protect syncs */
|
||||
sync_point = smp_load_acquire(&rb->sync_point);
|
||||
if (sync_point == EMPTY_ENTRY)
|
||||
/* Protect flush points */
|
||||
flush_point = smp_load_acquire(&rb->flush_point);
|
||||
if (flush_point == EMPTY_ENTRY)
|
||||
return 0;
|
||||
|
||||
/* Protect syncs */
|
||||
sync = smp_load_acquire(&rb->sync);
|
||||
|
||||
subm = READ_ONCE(rb->subm);
|
||||
submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
|
||||
|
||||
/* The sync point itself counts as a sector to sync */
|
||||
count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
|
||||
to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
|
||||
|
||||
return count;
|
||||
return (submitted < to_flush) ? (to_flush - submitted) : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb)
|
|||
|
||||
if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
|
||||
(rb->sync == rb->l2p_update) &&
|
||||
(rb->sync_point == EMPTY_ENTRY)) {
|
||||
(rb->flush_point == EMPTY_ENTRY)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
|||
queued_entries++;
|
||||
spin_unlock_irq(&rb->s_lock);
|
||||
|
||||
if (rb->sync_point != EMPTY_ENTRY)
|
||||
if (rb->flush_point != EMPTY_ENTRY)
|
||||
offset = scnprintf(buf, PAGE_SIZE,
|
||||
"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
|
||||
rb->nr_entries,
|
||||
|
@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
|||
rb->sync,
|
||||
rb->l2p_update,
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_read(&rb->inflight_sync_point),
|
||||
atomic_read(&rb->inflight_flush_point),
|
||||
#else
|
||||
0,
|
||||
#endif
|
||||
rb->sync_point,
|
||||
rb->flush_point,
|
||||
pblk_rb_read_count(rb),
|
||||
pblk_rb_space(rb),
|
||||
pblk_rb_sync_point_count(rb),
|
||||
pblk_rb_flush_point_count(rb),
|
||||
queued_entries);
|
||||
else
|
||||
offset = scnprintf(buf, PAGE_SIZE,
|
||||
|
@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
|||
rb->sync,
|
||||
rb->l2p_update,
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_read(&rb->inflight_sync_point),
|
||||
atomic_read(&rb->inflight_flush_point),
|
||||
#else
|
||||
0,
|
||||
#endif
|
||||
pblk_rb_read_count(rb),
|
||||
pblk_rb_space(rb),
|
||||
pblk_rb_sync_point_count(rb),
|
||||
pblk_rb_flush_point_count(rb),
|
||||
queued_entries);
|
||||
|
||||
return offset;
|
||||
|
|
|
@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
struct ppa_addr ppa = ppa_list[i];
|
||||
struct pblk_line *line;
|
||||
|
||||
line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
|
||||
line = &pblk->lines[pblk_ppa_to_line(ppa)];
|
||||
kref_put(&line->ref, pblk_line_put_wq);
|
||||
}
|
||||
}
|
||||
|
@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio)
|
|||
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
bool put_line)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
|
||||
struct bio *bio = rqd->bio;
|
||||
unsigned long start_time = r_ctx->start_time;
|
||||
|
||||
generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
|
||||
|
||||
if (rqd->error)
|
||||
pblk_log_read_err(pblk, rqd);
|
||||
|
@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
|
|||
__pblk_end_io_read(pblk, rqd, true);
|
||||
}
|
||||
|
||||
static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
unsigned int bio_init_idx,
|
||||
unsigned long *read_bitmap)
|
||||
static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
unsigned int bio_init_idx,
|
||||
unsigned long *read_bitmap)
|
||||
{
|
||||
struct bio *new_bio, *bio = rqd->bio;
|
||||
struct pblk_sec_meta *meta_list = rqd->meta_list;
|
||||
|
@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
|||
i = 0;
|
||||
hole = find_first_zero_bit(read_bitmap, nr_secs);
|
||||
do {
|
||||
int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]);
|
||||
int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
|
||||
struct pblk_line *line = &pblk->lines[line_id];
|
||||
|
||||
kref_put(&line->ref, pblk_line_put);
|
||||
|
@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
|||
return NVM_IO_OK;
|
||||
|
||||
err:
|
||||
pr_err("pblk: failed to perform partial read\n");
|
||||
|
||||
/* Free allocated pages in new bio */
|
||||
pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
|
||||
__pblk_end_io_read(pblk, rqd, false);
|
||||
|
@ -357,6 +363,7 @@ retry:
|
|||
int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct request_queue *q = dev->q;
|
||||
sector_t blba = pblk_get_lba(bio);
|
||||
unsigned int nr_secs = pblk_get_secs(bio);
|
||||
struct pblk_g_ctx *r_ctx;
|
||||
|
@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
|
||||
|
||||
bitmap_zero(&read_bitmap, nr_secs);
|
||||
|
||||
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
|
||||
|
@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
rqd->end_io = pblk_end_io_read;
|
||||
|
||||
r_ctx = nvm_rq_to_pdu(rqd);
|
||||
r_ctx->start_time = jiffies;
|
||||
r_ctx->lba = blba;
|
||||
|
||||
/* Save the index for this bio's start. This is needed in case
|
||||
|
@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
|
||||
if (!int_bio) {
|
||||
pr_err("pblk: could not clone read bio\n");
|
||||
return NVM_IO_ERR;
|
||||
goto fail_end_io;
|
||||
}
|
||||
|
||||
rqd->bio = int_bio;
|
||||
|
@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
pr_err("pblk: read IO submission failed\n");
|
||||
if (int_bio)
|
||||
bio_put(int_bio);
|
||||
return ret;
|
||||
goto fail_end_io;
|
||||
}
|
||||
|
||||
return NVM_IO_OK;
|
||||
|
@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
/* The read bio request could be partially filled by the write buffer,
|
||||
* but there are some holes that need to be read from the drive.
|
||||
*/
|
||||
ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
|
||||
if (ret) {
|
||||
pr_err("pblk: failed to perform partial read\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
return NVM_IO_OK;
|
||||
return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
|
||||
|
||||
fail_rqd_free:
|
||||
pblk_free_rqd(pblk, rqd, PBLK_READ);
|
||||
return ret;
|
||||
fail_end_io:
|
||||
__pblk_end_io_read(pblk, rqd, false);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
|
|
|
@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
|
|||
return 0;
|
||||
}
|
||||
|
||||
__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf)
|
||||
int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
|
||||
{
|
||||
u32 crc;
|
||||
|
||||
crc = pblk_calc_emeta_crc(pblk, emeta_buf);
|
||||
if (le32_to_cpu(emeta_buf->crc) != crc)
|
||||
return NULL;
|
||||
return 1;
|
||||
|
||||
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
|
||||
return NULL;
|
||||
return 1;
|
||||
|
||||
return emeta_to_lbas(pblk, emeta_buf);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
|
||||
|
@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
|
|||
u64 nr_valid_lbas, nr_lbas = 0;
|
||||
u64 i;
|
||||
|
||||
lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
|
||||
lba_list = emeta_to_lbas(pblk, emeta_buf);
|
||||
if (!lba_list)
|
||||
return 1;
|
||||
|
||||
|
@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
|
|||
struct ppa_addr ppa;
|
||||
int pos;
|
||||
|
||||
ppa = addr_to_pblk_ppa(pblk, i, line->id);
|
||||
ppa = addr_to_gen_ppa(pblk, i, line->id);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
|
||||
/* Do not update bad blocks */
|
||||
|
@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
|
|||
int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
|
||||
|
||||
return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
|
||||
nr_bb * geo->sec_per_blk;
|
||||
nr_bb * geo->sec_per_chk;
|
||||
}
|
||||
|
||||
struct pblk_recov_alloc {
|
||||
|
@ -263,12 +263,12 @@ next_read_rq:
|
|||
int pos;
|
||||
|
||||
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
|
||||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
r_ptr_int += pblk->min_write_pgs;
|
||||
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
}
|
||||
|
||||
for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
|
||||
|
@ -288,7 +288,7 @@ next_read_rq:
|
|||
/* At this point, the read should not fail. If it does, it is a problem
|
||||
* we cannot recover from here. Need FTL log.
|
||||
*/
|
||||
if (rqd->error) {
|
||||
if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
|
||||
pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
|
||||
return -EINTR;
|
||||
}
|
||||
|
@ -411,12 +411,12 @@ next_pad_rq:
|
|||
int pos;
|
||||
|
||||
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
|
||||
ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
|
||||
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
|
||||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
w_ptr += pblk->min_write_pgs;
|
||||
ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
|
||||
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
}
|
||||
|
||||
|
@ -541,12 +541,12 @@ next_rq:
|
|||
|
||||
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
|
||||
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
|
||||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
w_ptr += pblk->min_write_pgs;
|
||||
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
}
|
||||
|
||||
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
|
||||
|
@ -672,12 +672,12 @@ next_rq:
|
|||
|
||||
paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
|
||||
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
|
||||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
paddr += pblk->min_write_pgs;
|
||||
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
|
||||
pos = pblk_dev_ppa_to_pos(geo, ppa);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
}
|
||||
|
||||
for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
|
||||
|
@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
|
|||
|
||||
while (emeta_secs) {
|
||||
emeta_start--;
|
||||
ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id);
|
||||
ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
|
||||
pos = pblk_ppa_to_pos(geo, ppa);
|
||||
if (!test_bit(pos, line->blk_bitmap))
|
||||
emeta_secs--;
|
||||
|
@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
|||
goto next;
|
||||
}
|
||||
|
||||
if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
|
||||
pblk_recov_l2p_from_oob(pblk, line);
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (pblk_recov_l2p_from_emeta(pblk, line))
|
||||
pblk_recov_l2p_from_oob(pblk, line);
|
||||
|
||||
|
@ -984,10 +989,8 @@ next:
|
|||
}
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
if (is_next) {
|
||||
if (is_next)
|
||||
pblk_line_erase(pblk, l_mg->data_next);
|
||||
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
|
||||
}
|
||||
|
||||
out:
|
||||
if (found_lines != recovered_lines)
|
||||
|
|
|
@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
|
|||
return atomic_read(&rl->free_blocks);
|
||||
}
|
||||
|
||||
/*
|
||||
* We check for (i) the number of free blocks in the current LUN and (ii) the
|
||||
* total number of free blocks in the pblk instance. This is to even out the
|
||||
* number of free blocks on each LUN when GC kicks in.
|
||||
*
|
||||
* Only the total number of free blocks is used to configure the rate limiter.
|
||||
*/
|
||||
void pblk_rl_update_rates(struct pblk_rl *rl)
|
||||
unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
|
||||
{
|
||||
return atomic_read(&rl->free_user_blocks);
|
||||
}
|
||||
|
||||
static void __pblk_rl_update_rates(struct pblk_rl *rl,
|
||||
unsigned long free_blocks)
|
||||
{
|
||||
struct pblk *pblk = container_of(rl, struct pblk, rl);
|
||||
unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
|
||||
int max = rl->rb_budget;
|
||||
|
||||
if (free_blocks >= rl->high) {
|
||||
|
@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl)
|
|||
pblk_gc_should_stop(pblk);
|
||||
}
|
||||
|
||||
void pblk_rl_update_rates(struct pblk_rl *rl)
|
||||
{
|
||||
__pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
|
||||
}
|
||||
|
||||
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
|
||||
{
|
||||
int blk_in_line = atomic_read(&line->blk_in_line);
|
||||
int free_blocks;
|
||||
|
||||
atomic_add(blk_in_line, &rl->free_blocks);
|
||||
pblk_rl_update_rates(rl);
|
||||
free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
|
||||
|
||||
__pblk_rl_update_rates(rl, free_blocks);
|
||||
}
|
||||
|
||||
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
|
||||
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
|
||||
bool used)
|
||||
{
|
||||
int blk_in_line = atomic_read(&line->blk_in_line);
|
||||
int free_blocks;
|
||||
|
||||
atomic_sub(blk_in_line, &rl->free_blocks);
|
||||
pblk_rl_update_rates(rl);
|
||||
|
||||
if (used)
|
||||
free_blocks = atomic_sub_return(blk_in_line,
|
||||
&rl->free_user_blocks);
|
||||
else
|
||||
free_blocks = atomic_read(&rl->free_user_blocks);
|
||||
|
||||
__pblk_rl_update_rates(rl, free_blocks);
|
||||
}
|
||||
|
||||
int pblk_rl_high_thrs(struct pblk_rl *rl)
|
||||
|
@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl)
|
|||
void pblk_rl_init(struct pblk_rl *rl, int budget)
|
||||
{
|
||||
struct pblk *pblk = container_of(rl, struct pblk, rl);
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
|
||||
int sec_meta, blk_meta;
|
||||
|
||||
unsigned int rb_windows;
|
||||
|
||||
rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
|
||||
rl->high_pw = get_count_order(rl->high);
|
||||
/* Consider sectors used for metadata */
|
||||
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
|
||||
blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
|
||||
|
||||
rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
|
||||
if (rl->low < min_blocks)
|
||||
rl->low = min_blocks;
|
||||
rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
|
||||
rl->high_pw = get_count_order(rl->high);
|
||||
|
||||
rl->rsv_blocks = min_blocks;
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
|
|||
ssize_t sz = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < geo->nr_luns; i++) {
|
||||
for (i = 0; i < geo->all_luns; i++) {
|
||||
int active = 1;
|
||||
|
||||
rlun = &pblk->luns[i];
|
||||
|
@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
|
|||
|
||||
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
|
||||
{
|
||||
int free_blocks, total_blocks;
|
||||
int free_blocks, free_user_blocks, total_blocks;
|
||||
int rb_user_max, rb_user_cnt;
|
||||
int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
|
||||
|
||||
free_blocks = atomic_read(&pblk->rl.free_blocks);
|
||||
free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
|
||||
free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
|
||||
rb_user_max = pblk->rl.rb_user_max;
|
||||
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
|
||||
rb_gc_max = pblk->rl.rb_gc_max;
|
||||
|
@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
|
|||
total_blocks = pblk->rl.total_blocks;
|
||||
|
||||
return snprintf(page, PAGE_SIZE,
|
||||
"u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
|
||||
"u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
|
||||
rb_user_cnt,
|
||||
rb_user_max,
|
||||
rb_gc_cnt,
|
||||
rb_gc_max,
|
||||
rb_state,
|
||||
rb_budget,
|
||||
pblk->rl.low,
|
||||
pblk->rl.high,
|
||||
free_blocks,
|
||||
free_user_blocks,
|
||||
total_blocks,
|
||||
READ_ONCE(pblk->rl.rb_user_active));
|
||||
}
|
||||
|
@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
|
|||
|
||||
sz = snprintf(page, PAGE_SIZE - sz,
|
||||
"line: nluns:%d, nblks:%d, nsecs:%d\n",
|
||||
geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
|
||||
geo->all_luns, lm->blk_per_line, lm->sec_per_line);
|
||||
|
||||
sz += snprintf(page + sz, PAGE_SIZE - sz,
|
||||
"lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
|
||||
|
@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
|
|||
"blk_line:%d, sec_line:%d, sec_blk:%d\n",
|
||||
lm->blk_per_line,
|
||||
lm->sec_per_line,
|
||||
geo->sec_per_blk);
|
||||
geo->sec_per_chk);
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
|
|
@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
|||
struct pblk_c_ctx *c_ctx)
|
||||
{
|
||||
struct bio *original_bio;
|
||||
struct pblk_rb *rwb = &pblk->rwb;
|
||||
unsigned long ret;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < c_ctx->nr_valid; i++) {
|
||||
struct pblk_w_ctx *w_ctx;
|
||||
int pos = c_ctx->sentry + i;
|
||||
int flags;
|
||||
|
||||
w_ctx = pblk_rb_w_ctx(rwb, pos);
|
||||
flags = READ_ONCE(w_ctx->flags);
|
||||
|
||||
if (flags & PBLK_FLUSH_ENTRY) {
|
||||
flags &= ~PBLK_FLUSH_ENTRY;
|
||||
/* Release flags on context. Protect from writes */
|
||||
smp_store_release(&w_ctx->flags, flags);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_dec(&rwb->inflight_flush_point);
|
||||
#endif
|
||||
}
|
||||
|
||||
w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
|
||||
while ((original_bio = bio_list_pop(&w_ctx->bios)))
|
||||
bio_endio(original_bio);
|
||||
}
|
||||
|
@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
struct pblk_line *meta_line;
|
||||
int err;
|
||||
|
||||
ppa_set_empty(&erase_ppa);
|
||||
pblk_ppa_set_empty(&erase_ppa);
|
||||
|
||||
/* Assign lbas to ppas and populate request structure */
|
||||
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
|
||||
|
@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
if (!ppa_empty(erase_ppa)) {
|
||||
if (!pblk_ppa_empty(erase_ppa)) {
|
||||
/* Submit erase for next data line */
|
||||
if (pblk_blk_erase_async(pblk, erase_ppa)) {
|
||||
struct pblk_line *e_line = pblk_line_get_erase(pblk);
|
||||
|
@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk)
|
|||
if (!secs_avail)
|
||||
return 1;
|
||||
|
||||
secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
|
||||
secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
|
||||
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
|
||||
return 1;
|
||||
|
||||
|
|
|
@ -51,17 +51,16 @@
|
|||
|
||||
#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
|
||||
|
||||
#define pblk_for_each_lun(pblk, rlun, i) \
|
||||
for ((i) = 0, rlun = &(pblk)->luns[0]; \
|
||||
(i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
|
||||
|
||||
/* Static pool sizes */
|
||||
#define PBLK_GEN_WS_POOL_SIZE (2)
|
||||
|
||||
#define PBLK_DEFAULT_OP (11)
|
||||
|
||||
enum {
|
||||
PBLK_READ = READ,
|
||||
PBLK_WRITE = WRITE,/* Write from write buffer */
|
||||
PBLK_WRITE_INT, /* Internal write - no write buffer */
|
||||
PBLK_READ_RECOV, /* Recovery read - errors allowed */
|
||||
PBLK_ERASE,
|
||||
};
|
||||
|
||||
|
@ -114,6 +113,7 @@ struct pblk_c_ctx {
|
|||
/* read context */
|
||||
struct pblk_g_ctx {
|
||||
void *private;
|
||||
unsigned long start_time;
|
||||
u64 lba;
|
||||
};
|
||||
|
||||
|
@ -170,7 +170,7 @@ struct pblk_rb {
|
|||
* the last submitted entry that has
|
||||
* been successfully persisted to media
|
||||
*/
|
||||
unsigned int sync_point; /* Sync point - last entry that must be
|
||||
unsigned int flush_point; /* Flush point - last entry that must be
|
||||
* flushed to the media. Used with
|
||||
* REQ_FLUSH and REQ_FUA
|
||||
*/
|
||||
|
@ -193,7 +193,7 @@ struct pblk_rb {
|
|||
spinlock_t s_lock; /* Sync lock */
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
|
||||
atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -256,9 +256,6 @@ struct pblk_rl {
|
|||
unsigned int high; /* Upper threshold for rate limiter (free run -
|
||||
* user I/O rate limiter
|
||||
*/
|
||||
unsigned int low; /* Lower threshold for rate limiter (user I/O
|
||||
* rate limiter - stall)
|
||||
*/
|
||||
unsigned int high_pw; /* High rounded up as a power of 2 */
|
||||
|
||||
#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
|
||||
|
@ -292,7 +289,9 @@ struct pblk_rl {
|
|||
|
||||
unsigned long long nr_secs;
|
||||
unsigned long total_blocks;
|
||||
atomic_t free_blocks;
|
||||
|
||||
atomic_t free_blocks; /* Total number of free blocks (+ OP) */
|
||||
atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
|
||||
};
|
||||
|
||||
#define PBLK_LINE_EMPTY (~0U)
|
||||
|
@ -583,7 +582,9 @@ struct pblk {
|
|||
*/
|
||||
|
||||
sector_t capacity; /* Device capacity when bad blocks are subtracted */
|
||||
int over_pct; /* Percentage of device used for over-provisioning */
|
||||
|
||||
int op; /* Percentage of device used for over-provisioning */
|
||||
int op_blks; /* Number of blocks used for over-provisioning */
|
||||
|
||||
/* pblk provisioning values. Used by rate limiter */
|
||||
struct pblk_rl rl;
|
||||
|
@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
|
|||
struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
|
||||
struct ppa_addr *ppa);
|
||||
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
|
||||
unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
|
||||
unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
|
||||
|
||||
unsigned int pblk_rb_read_count(struct pblk_rb *rb);
|
||||
unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
|
||||
|
@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
|
|||
void pblk_submit_rec(struct work_struct *work);
|
||||
struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
|
||||
int pblk_recov_pad(struct pblk *pblk);
|
||||
__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
|
||||
int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
|
||||
int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
|
||||
struct pblk_rec_ctx *recovery, u64 *comp_bits,
|
||||
unsigned int comp);
|
||||
|
@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl);
|
|||
void pblk_rl_update_rates(struct pblk_rl *rl);
|
||||
int pblk_rl_high_thrs(struct pblk_rl *rl);
|
||||
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
|
||||
unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
|
||||
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
|
||||
void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
|
||||
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
|
||||
|
@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
|
|||
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
|
||||
int pblk_rl_max_io(struct pblk_rl *rl);
|
||||
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
|
||||
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
|
||||
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
|
||||
bool used);
|
||||
int pblk_rl_is_limit(struct pblk_rl *rl);
|
||||
|
||||
/*
|
||||
|
@ -907,15 +910,10 @@ static inline int pblk_pad_distance(struct pblk *pblk)
|
|||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
|
||||
return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
|
||||
}
|
||||
|
||||
/* Return the line id carried in the block field of the generic address @p. */
static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
{
	int line_id = p.g.blk;

	return line_id;
}
|
||||
|
||||
static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
|
||||
static inline int pblk_ppa_to_line(struct ppa_addr p)
|
||||
{
|
||||
return p.g.blk;
|
||||
}
|
||||
|
@ -925,10 +923,34 @@ static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
|
|||
return p.g.lun * geo->nr_chnls + p.g.ch;
|
||||
}
|
||||
|
||||
/* A block within a line corresponds to the lun */
|
||||
static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
|
||||
static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
|
||||
u64 line_id)
|
||||
{
|
||||
return p.g.lun * geo->nr_chnls + p.g.ch;
|
||||
struct ppa_addr ppa;
|
||||
|
||||
ppa.ppa = 0;
|
||||
ppa.g.blk = line_id;
|
||||
ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
|
||||
ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
|
||||
ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
|
||||
ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
|
||||
ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
|
||||
|
||||
return ppa;
|
||||
}
|
||||
|
||||
static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
|
||||
struct ppa_addr p)
|
||||
{
|
||||
u64 paddr;
|
||||
|
||||
paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
|
||||
paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
|
||||
paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
|
||||
paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
|
||||
paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
|
||||
|
||||
return paddr;
|
||||
}
|
||||
|
||||
static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
|
||||
|
@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
|
|||
return ppa64;
|
||||
}
|
||||
|
||||
static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
|
||||
sector_t lba)
|
||||
{
|
||||
struct ppa_addr ppa;
|
||||
|
||||
if (pblk->ppaf_bitsize < 32) {
|
||||
u32 *map = (u32 *)pblk->trans_map;
|
||||
|
||||
ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
|
||||
} else {
|
||||
struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
|
||||
|
||||
ppa = map[lba];
|
||||
}
|
||||
|
||||
return ppa;
|
||||
}
|
||||
|
||||
static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
|
||||
{
|
||||
u32 ppa32 = 0;
|
||||
|
@ -999,6 +1003,24 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
|
|||
return ppa32;
|
||||
}
|
||||
|
||||
static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
|
||||
sector_t lba)
|
||||
{
|
||||
struct ppa_addr ppa;
|
||||
|
||||
if (pblk->ppaf_bitsize < 32) {
|
||||
u32 *map = (u32 *)pblk->trans_map;
|
||||
|
||||
ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
|
||||
} else {
|
||||
struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
|
||||
|
||||
ppa = map[lba];
|
||||
}
|
||||
|
||||
return ppa;
|
||||
}
|
||||
|
||||
static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
|
||||
struct ppa_addr ppa)
|
||||
{
|
||||
|
@ -1013,21 +1035,6 @@ static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
|
|||
}
|
||||
}
|
||||
|
||||
static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
|
||||
struct ppa_addr p)
|
||||
{
|
||||
u64 paddr;
|
||||
|
||||
paddr = 0;
|
||||
paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
|
||||
paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
|
||||
paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
|
||||
paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
|
||||
paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
|
||||
|
||||
return paddr;
|
||||
}
|
||||
|
||||
static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
|
||||
{
|
||||
return (ppa_addr.ppa == ADDR_EMPTY);
|
||||
|
@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
|
|||
|
||||
static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
|
||||
{
|
||||
if (lppa.ppa == rppa.ppa)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return (lppa.ppa == rppa.ppa);
|
||||
}
|
||||
|
||||
static inline int pblk_addr_in_cache(struct ppa_addr ppa)
|
||||
|
@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
|
|||
return p;
|
||||
}
|
||||
|
||||
/*
 * Build a generic address from an in-line address and a line id.
 *
 * @paddr is split into page, lun, channel, plane and sector using the masks
 * and offsets in pblk->ppaf; @line_id is stored in the block field.
 */
static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
					      u64 line_id)
{
	struct ppa_addr gen;

	/* start from an all-zero address so untouched bits stay clear */
	gen.ppa = 0;

	gen.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
	gen.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
	gen.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
	gen.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
	gen.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
	gen.g.blk = line_id;

	return gen;
}
|
||||
|
||||
/* Thin wrapper: forwards to addr_to_gen_ppa() with the same arguments. */
static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
					       u64 line_id)
{
	return addr_to_gen_ppa(pblk, paddr, line_id);
}
|
||||
|
||||
static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
|
||||
struct line_header *header)
|
||||
{
|
||||
|
@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
|
|||
|
||||
if (!ppa->c.is_cached &&
|
||||
ppa->g.ch < geo->nr_chnls &&
|
||||
ppa->g.lun < geo->luns_per_chnl &&
|
||||
ppa->g.lun < geo->nr_luns &&
|
||||
ppa->g.pl < geo->nr_planes &&
|
||||
ppa->g.blk < geo->blks_per_lun &&
|
||||
ppa->g.pg < geo->pgs_per_blk &&
|
||||
ppa->g.blk < geo->nr_chks &&
|
||||
ppa->g.pg < geo->ws_per_chk &&
|
||||
ppa->g.sec < geo->sec_per_pg)
|
||||
continue;
|
||||
|
||||
|
@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
|
||||
for (i = 0; i < rqd->nr_ppas; i++) {
|
||||
ppa = ppa_list[i];
|
||||
line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
|
||||
line = &pblk->lines[pblk_ppa_to_line(ppa)];
|
||||
|
||||
spin_lock(&line->lock);
|
||||
if (line->state != PBLK_LINESTATE_OPEN) {
|
||||
|
@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
|
|||
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
|
||||
}
|
||||
|
||||
/* Scale @lba by NR_PHY_IN_LOG. */
static inline sector_t pblk_get_sector(sector_t lba)
{
	sector_t sector = lba;

	sector *= NR_PHY_IN_LOG;

	return sector;
}
|
||||
|
||||
static inline void pblk_setup_uuid(struct pblk *pblk)
|
||||
{
|
||||
uuid_le uuid;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,290 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2015 IT University of Copenhagen
|
||||
* Initial release: Matias Bjorling <m@bjorling.me>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License version
|
||||
* 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
|
||||
*/
|
||||
|
||||
#ifndef RRPC_H_
|
||||
#define RRPC_H_
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <linux/lightnvm.h>
|
||||
|
||||
/* Run only GC if less than 1/X blocks are free */
|
||||
#define GC_LIMIT_INVERSE 10
|
||||
#define GC_TIME_SECS 100
|
||||
|
||||
#define RRPC_SECTOR (512)
|
||||
#define RRPC_EXPOSED_PAGE_SIZE (4096)
|
||||
|
||||
#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR)
|
||||
|
||||
struct rrpc_inflight {
|
||||
struct list_head reqs;
|
||||
spinlock_t lock;
|
||||
};
|
||||
|
||||
struct rrpc_inflight_rq {
|
||||
struct list_head list;
|
||||
sector_t l_start;
|
||||
sector_t l_end;
|
||||
};
|
||||
|
||||
struct rrpc_rq {
|
||||
struct rrpc_inflight_rq inflight_rq;
|
||||
unsigned long flags;
|
||||
};
|
||||
|
||||
struct rrpc_block {
|
||||
int id; /* id inside of LUN */
|
||||
struct rrpc_lun *rlun;
|
||||
|
||||
struct list_head prio; /* LUN CG list */
|
||||
struct list_head list; /* LUN free, used, bb list */
|
||||
|
||||
#define MAX_INVALID_PAGES_STORAGE 8
|
||||
/* Bitmap for invalid page entries */
|
||||
unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
|
||||
/* points to the next writable page within a block */
|
||||
unsigned int next_page;
|
||||
/* number of pages that are invalid, wrt host page size */
|
||||
unsigned int nr_invalid_pages;
|
||||
|
||||
int state;
|
||||
|
||||
spinlock_t lock;
|
||||
atomic_t data_cmnt_size; /* data pages committed to stable storage */
|
||||
};
|
||||
|
||||
struct rrpc_lun {
|
||||
struct rrpc *rrpc;
|
||||
|
||||
int id;
|
||||
struct ppa_addr bppa;
|
||||
|
||||
struct rrpc_block *cur, *gc_cur;
|
||||
struct rrpc_block *blocks; /* Reference to block allocation */
|
||||
|
||||
struct list_head prio_list; /* Blocks that may be GC'ed */
|
||||
struct list_head wblk_list; /* Queued blocks to be written to */
|
||||
|
||||
/* lun block lists */
|
||||
struct list_head used_list; /* In-use blocks */
|
||||
struct list_head free_list; /* Not used blocks i.e. released
|
||||
* and ready for use
|
||||
*/
|
||||
struct list_head bb_list; /* Bad blocks. Mutually exclusive with
|
||||
* free_list and used_list
|
||||
*/
|
||||
unsigned int nr_free_blocks; /* Number of unused blocks */
|
||||
|
||||
struct work_struct ws_gc;
|
||||
|
||||
int reserved_blocks;
|
||||
|
||||
spinlock_t lock;
|
||||
};
|
||||
|
||||
struct rrpc {
|
||||
struct nvm_tgt_dev *dev;
|
||||
struct gendisk *disk;
|
||||
|
||||
sector_t soffset; /* logical sector offset */
|
||||
|
||||
int nr_luns;
|
||||
struct rrpc_lun *luns;
|
||||
|
||||
/* calculated values */
|
||||
unsigned long long nr_sects;
|
||||
|
||||
/* Write strategy variables. Move these into a separate structure for each
|
||||
* strategy
|
||||
*/
|
||||
atomic_t next_lun; /* Whenever a page is written, this is updated
|
||||
* to point to the next write lun
|
||||
*/
|
||||
|
||||
spinlock_t bio_lock;
|
||||
struct bio_list requeue_bios;
|
||||
struct work_struct ws_requeue;
|
||||
|
||||
/* Simple translation map of logical addresses to physical addresses.
|
||||
* The logical addresses are known by the host system, while the physical
|
||||
* addresses are used when writing to the disk block device.
|
||||
*/
|
||||
struct rrpc_addr *trans_map;
|
||||
/* also store a reverse map for garbage collection */
|
||||
struct rrpc_rev_addr *rev_trans_map;
|
||||
spinlock_t rev_lock;
|
||||
|
||||
struct rrpc_inflight inflights;
|
||||
|
||||
mempool_t *addr_pool;
|
||||
mempool_t *page_pool;
|
||||
mempool_t *gcb_pool;
|
||||
mempool_t *rq_pool;
|
||||
|
||||
struct timer_list gc_timer;
|
||||
struct workqueue_struct *krqd_wq;
|
||||
struct workqueue_struct *kgc_wq;
|
||||
};
|
||||
|
||||
struct rrpc_block_gc {
|
||||
struct rrpc *rrpc;
|
||||
struct rrpc_block *rblk;
|
||||
struct work_struct ws_gc;
|
||||
};
|
||||
|
||||
/* Logical to physical mapping */
|
||||
struct rrpc_addr {
|
||||
u64 addr;
|
||||
struct rrpc_block *rblk;
|
||||
};
|
||||
|
||||
/* Physical to logical mapping */
|
||||
struct rrpc_rev_addr {
|
||||
u64 addr;
|
||||
};
|
||||
|
||||
static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
|
||||
struct ppa_addr r)
|
||||
{
|
||||
struct ppa_addr l;
|
||||
int secs, pgs;
|
||||
sector_t ppa = r.ppa;
|
||||
|
||||
l.ppa = 0;
|
||||
|
||||
div_u64_rem(ppa, geo->sec_per_pg, &secs);
|
||||
l.g.sec = secs;
|
||||
|
||||
sector_div(ppa, geo->sec_per_pg);
|
||||
div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
|
||||
l.g.pg = pgs;
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
/* Convert @pba with linear_to_generic_addr() using the geometry of @dev. */
static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
{
	struct nvm_geo *geo = &dev->geo;

	return linear_to_generic_addr(geo, pba);
}
|
||||
|
||||
static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = rrpc->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct rrpc_lun *rlun = rblk->rlun;
|
||||
|
||||
return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
|
||||
}
|
||||
|
||||
static inline sector_t rrpc_get_laddr(struct bio *bio)
|
||||
{
|
||||
return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
|
||||
}
|
||||
|
||||
static inline unsigned int rrpc_get_pages(struct bio *bio)
|
||||
{
|
||||
return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
|
||||
}
|
||||
|
||||
/* Scale @laddr by NR_PHY_IN_LOG. */
static inline sector_t rrpc_get_sector(sector_t laddr)
{
	sector_t sector = laddr;

	sector *= NR_PHY_IN_LOG;

	return sector;
}
|
||||
|
||||
/*
 * Return 1 when the inclusive range [laddr_start, laddr_end] overlaps the
 * inclusive range [r->l_start, r->l_end], 0 otherwise.
 */
static inline int request_intersects(struct rrpc_inflight_rq *r,
				sector_t laddr_start, sector_t laddr_end)
{
	/* candidate range ends before the in-flight one begins */
	if (laddr_end < r->l_start)
		return 0;

	/* candidate range begins after the in-flight one ends */
	if (laddr_start > r->l_end)
		return 0;

	return 1;
}
|
||||
|
||||
/*
 * Try to claim the logical range [laddr, laddr + pages - 1] for @r.
 *
 * Walks the in-flight list under inflights.lock. If any queued request
 * intersects the range nothing is changed and 1 is returned; otherwise the
 * range is recorded in @r, @r is appended to the list and 0 is returned.
 */
static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
			     unsigned int pages, struct rrpc_inflight_rq *r)
{
	struct rrpc_inflight *inflight = &rrpc->inflights;
	sector_t last = laddr + pages - 1;
	struct rrpc_inflight_rq *cur;
	int busy = 0;

	WARN_ON(irqs_disabled());

	spin_lock_irq(&inflight->lock);

	list_for_each_entry(cur, &inflight->reqs, list) {
		if (unlikely(request_intersects(cur, laddr, last))) {
			/* existing, overlapping request, come back later */
			busy = 1;
			break;
		}
	}

	if (!busy) {
		r->l_start = laddr;
		r->l_end = last;
		list_add_tail(&r->list, &inflight->reqs);
	}

	spin_unlock_irq(&inflight->lock);

	return busy;
}
|
||||
|
||||
/*
 * Range-checked front end for __rrpc_lock_laddr(): BUGs if the requested
 * range runs past rrpc->nr_sects, then forwards the call unchanged.
 */
static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
				  unsigned int pages,
				  struct rrpc_inflight_rq *r)
{
	sector_t end = laddr + pages;

	BUG_ON(end > rrpc->nr_sects);

	return __rrpc_lock_laddr(rrpc, laddr, pages, r);
}
|
||||
|
||||
static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd)
|
||||
{
|
||||
struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
|
||||
|
||||
return &rrqd->inflight_rq;
|
||||
}
|
||||
|
||||
/*
 * Lock the logical range covered by @bio, using the in-flight tracker that
 * lives inside @rqd. Returns the result of rrpc_lock_laddr().
 */
static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
			       struct nvm_rq *rqd)
{
	return rrpc_lock_laddr(rrpc, rrpc_get_laddr(bio), rrpc_get_pages(bio),
			       rrpc_get_inflight_rq(rqd));
}
|
||||
|
||||
static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
|
||||
struct rrpc_inflight_rq *r)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&rrpc->inflights.lock, flags);
|
||||
list_del_init(&r->list);
|
||||
spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
|
||||
}
|
||||
|
||||
static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
|
||||
{
|
||||
struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
|
||||
uint8_t pages = rqd->nr_ppas;
|
||||
|
||||
BUG_ON((r->l_start + pages) > rrpc->nr_sects);
|
||||
|
||||
rrpc_unlock_laddr(rrpc, r);
|
||||
}
|
||||
|
||||
#endif /* RRPC_H_ */
|
|
@ -525,15 +525,21 @@ struct open_bucket {
|
|||
|
||||
/*
|
||||
* We keep multiple buckets open for writes, and try to segregate different
|
||||
* write streams for better cache utilization: first we look for a bucket where
|
||||
* the last write to it was sequential with the current write, and failing that
|
||||
* we look for a bucket that was last used by the same task.
|
||||
* write streams for better cache utilization: first we try to segregate flash
|
||||
* only volume write streams from cached devices, secondly we look for a bucket
|
||||
* where the last write to it was sequential with the current write, and
|
||||
* failing that we look for a bucket that was last used by the same task.
|
||||
*
|
||||
* The idea is that if you've got multiple tasks pulling data into the cache at the
|
||||
* same time, you'll get better cache utilization if you try to segregate their
|
||||
* data and preserve locality.
|
||||
*
|
||||
* For example, say you've starting Firefox at the same time you're copying a
|
||||
* For example, dirty sectors of a flash only volume are not reclaimable. If
|
||||
* they are mixed with dirty sectors of a cached device, such buckets will
|
||||
* be marked as dirty and won't be reclaimed, even though the dirty data of the
|
||||
* cached device has been written back to the backing device.
|
||||
*
|
||||
* And say you're starting Firefox at the same time you're copying a
|
||||
* bunch of files. Firefox will likely end up being fairly hot and stay in the
|
||||
* cache awhile, but the data you copied might not be; if you wrote all that
|
||||
* data to the same buckets it'd get invalidated at the same time.
|
||||
|
@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
|
|||
struct open_bucket *ret, *ret_task = NULL;
|
||||
|
||||
list_for_each_entry_reverse(ret, &c->data_buckets, list)
|
||||
if (!bkey_cmp(&ret->key, search))
|
||||
if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
|
||||
UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
|
||||
continue;
|
||||
else if (!bkey_cmp(&ret->key, search))
|
||||
goto found;
|
||||
else if (ret->last_write_point == write_point)
|
||||
ret_task = ret;
|
||||
|
|
|
@ -320,15 +320,16 @@ struct cached_dev {
|
|||
*/
|
||||
atomic_t has_dirty;
|
||||
|
||||
/*
|
||||
* Set to zero by things that touch the backing volume-- except
|
||||
* writeback. Incremented by writeback. Used to determine when to
|
||||
* accelerate idle writeback.
|
||||
*/
|
||||
atomic_t backing_idle;
|
||||
|
||||
struct bch_ratelimit writeback_rate;
|
||||
struct delayed_work writeback_rate_update;
|
||||
|
||||
/*
|
||||
* Internal to the writeback code, so read_dirty() can keep track of
|
||||
* where it's at.
|
||||
*/
|
||||
sector_t last_read;
|
||||
|
||||
/* Limit number of writeback bios in flight */
|
||||
struct semaphore in_flight;
|
||||
struct task_struct *writeback_thread;
|
||||
|
@ -336,6 +337,14 @@ struct cached_dev {
|
|||
|
||||
struct keybuf writeback_keys;
|
||||
|
||||
/*
|
||||
* Order the write-half of writeback operations strongly in dispatch
|
||||
* order. (Maintain LBA order; don't allow reads completing out of
|
||||
* order to re-order the writes...)
|
||||
*/
|
||||
struct closure_waitlist writeback_ordering_wait;
|
||||
atomic_t writeback_sequence_next;
|
||||
|
||||
/* For tracking sequential IO */
|
||||
#define RECENT_IO_BITS 7
|
||||
#define RECENT_IO (1 << RECENT_IO_BITS)
|
||||
|
@ -488,6 +497,7 @@ struct cache_set {
|
|||
int caches_loaded;
|
||||
|
||||
struct bcache_device **devices;
|
||||
unsigned devices_max_used;
|
||||
struct list_head cached_devs;
|
||||
uint64_t cached_dev_sectors;
|
||||
struct closure caching;
|
||||
|
@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
|
|||
|
||||
/* Forward declarations */
|
||||
|
||||
void bch_count_io_errors(struct cache *, blk_status_t, const char *);
|
||||
void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
|
||||
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
|
||||
blk_status_t, const char *);
|
||||
void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
|
||||
|
|
|
@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
|
|||
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
|
||||
bset_sector_offset(&b->keys, i));
|
||||
|
||||
if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
|
||||
if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
|
||||
int j;
|
||||
struct bio_vec *bv;
|
||||
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
|
||||
|
@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
|
|||
|
||||
continue_at(cl, btree_node_write_done, NULL);
|
||||
} else {
|
||||
/* No problem for multipage bvec since the bio is just allocated */
|
||||
b->bio->bi_vcnt = 0;
|
||||
bch_bio_map(b->bio, i);
|
||||
|
||||
|
@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
|
|||
|
||||
/* don't reclaim buckets to which writeback keys point */
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < c->nr_uuids; i++) {
|
||||
for (i = 0; i < c->devices_max_used; i++) {
|
||||
struct bcache_device *d = c->devices[i];
|
||||
struct cached_dev *dc;
|
||||
struct keybuf_key *w, *n;
|
||||
|
@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
|
|||
int bch_gc_thread_start(struct cache_set *c)
|
||||
{
|
||||
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
|
||||
if (IS_ERR(c->gc_thread))
|
||||
return PTR_ERR(c->gc_thread);
|
||||
|
||||
return 0;
|
||||
return PTR_ERR_OR_ZERO(c->gc_thread);
|
||||
}
|
||||
|
||||
/* Initial partial gc */
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <linux/debugfs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/sched/debug.h>
|
||||
|
||||
#include "closure.h"
|
||||
|
||||
|
@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
|
|||
BUG_ON(flags & CLOSURE_GUARD_MASK);
|
||||
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
|
||||
|
||||
/* Must deliver precisely one wakeup */
|
||||
if (r == 1 && (flags & CLOSURE_SLEEPING))
|
||||
wake_up_process(cl->task);
|
||||
|
||||
if (!r) {
|
||||
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
|
||||
atomic_set(&cl->remaining,
|
||||
|
@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
|
|||
}
|
||||
EXPORT_SYMBOL(closure_wait);
|
||||
|
||||
/**
|
||||
* closure_sync - sleep until a closure has nothing left to wait on
|
||||
*
|
||||
* Sleeps until the refcount hits 1 - the thread that's running the closure owns
|
||||
* the last refcount.
|
||||
*/
|
||||
void closure_sync(struct closure *cl)
|
||||
struct closure_syncer {
|
||||
struct task_struct *task;
|
||||
int done;
|
||||
};
|
||||
|
||||
static void closure_sync_fn(struct closure *cl)
|
||||
{
|
||||
cl->s->done = 1;
|
||||
wake_up_process(cl->s->task);
|
||||
}
|
||||
|
||||
void __sched __closure_sync(struct closure *cl)
|
||||
{
|
||||
struct closure_syncer s = { .task = current };
|
||||
|
||||
cl->s = &s;
|
||||
continue_at(cl, closure_sync_fn, NULL);
|
||||
|
||||
while (1) {
|
||||
__closure_start_sleep(cl);
|
||||
closure_set_ret_ip(cl);
|
||||
|
||||
if ((atomic_read(&cl->remaining) &
|
||||
CLOSURE_REMAINING_MASK) == 1)
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (s.done)
|
||||
break;
|
||||
|
||||
schedule();
|
||||
}
|
||||
|
||||
__closure_end_sleep(cl);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
}
|
||||
EXPORT_SYMBOL(closure_sync);
|
||||
EXPORT_SYMBOL(__closure_sync);
|
||||
|
||||
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
||||
|
||||
|
@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
|
|||
cl, (void *) cl->ip, cl->fn, cl->parent,
|
||||
r & CLOSURE_REMAINING_MASK);
|
||||
|
||||
seq_printf(f, "%s%s%s%s\n",
|
||||
seq_printf(f, "%s%s\n",
|
||||
test_bit(WORK_STRUCT_PENDING_BIT,
|
||||
work_data_bits(&cl->work)) ? "Q" : "",
|
||||
r & CLOSURE_RUNNING ? "R" : "",
|
||||
r & CLOSURE_STACK ? "S" : "",
|
||||
r & CLOSURE_SLEEPING ? "Sl" : "");
|
||||
r & CLOSURE_RUNNING ? "R" : "");
|
||||
|
||||
if (r & CLOSURE_WAITING)
|
||||
seq_printf(f, " W %pF\n",
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
*/
|
||||
|
||||
struct closure;
|
||||
struct closure_syncer;
|
||||
typedef void (closure_fn) (struct closure *);
|
||||
|
||||
struct closure_waitlist {
|
||||
|
@ -115,10 +116,6 @@ enum closure_state {
|
|||
* the thread that owns the closure, and cleared by the thread that's
|
||||
* waking up the closure.
|
||||
*
|
||||
* CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
|
||||
* - indicates that cl->task is valid and closure_put() may wake it up.
|
||||
* Only set or cleared by the thread that owns the closure.
|
||||
*
|
||||
* The rest are for debugging and don't affect behaviour:
|
||||
*
|
||||
* CLOSURE_RUNNING: Set when a closure is running (i.e. by
|
||||
|
@ -128,22 +125,16 @@ enum closure_state {
|
|||
* continue_at() and closure_return() clear it for you, if you're doing
|
||||
* something unusual you can use closure_set_dead() which also helps
|
||||
* annotate where references are being transferred.
|
||||
*
|
||||
* CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
|
||||
* closure with this flag set
|
||||
*/
|
||||
|
||||
CLOSURE_BITS_START = (1 << 23),
|
||||
CLOSURE_DESTRUCTOR = (1 << 23),
|
||||
CLOSURE_WAITING = (1 << 25),
|
||||
CLOSURE_SLEEPING = (1 << 27),
|
||||
CLOSURE_RUNNING = (1 << 29),
|
||||
CLOSURE_STACK = (1 << 31),
|
||||
CLOSURE_BITS_START = (1U << 26),
|
||||
CLOSURE_DESTRUCTOR = (1U << 26),
|
||||
CLOSURE_WAITING = (1U << 28),
|
||||
CLOSURE_RUNNING = (1U << 30),
|
||||
};
|
||||
|
||||
#define CLOSURE_GUARD_MASK \
|
||||
((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
|
||||
CLOSURE_RUNNING|CLOSURE_STACK) << 1)
|
||||
((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
|
||||
|
||||
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
|
||||
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
|
||||
|
@ -152,7 +143,7 @@ struct closure {
|
|||
union {
|
||||
struct {
|
||||
struct workqueue_struct *wq;
|
||||
struct task_struct *task;
|
||||
struct closure_syncer *s;
|
||||
struct llist_node list;
|
||||
closure_fn *fn;
|
||||
};
|
||||
|
@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
|
|||
void closure_put(struct closure *cl);
|
||||
void __closure_wake_up(struct closure_waitlist *list);
|
||||
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
|
||||
void closure_sync(struct closure *cl);
|
||||
void __closure_sync(struct closure *cl);
|
||||
|
||||
/**
|
||||
* closure_sync - sleep until a closure has nothing left to wait on
|
||||
*
|
||||
* Sleeps until the refcount hits 1 - the thread that's running the closure owns
|
||||
* the last refcount.
|
||||
*/
|
||||
static inline void closure_sync(struct closure *cl)
|
||||
{
|
||||
if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
|
||||
__closure_sync(cl);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
||||
|
||||
|
@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
|
|||
#endif
|
||||
}
|
||||
|
||||
static inline void __closure_end_sleep(struct closure *cl)
|
||||
{
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
|
||||
atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
|
||||
}
|
||||
|
||||
static inline void __closure_start_sleep(struct closure *cl)
|
||||
{
|
||||
closure_set_ip(cl);
|
||||
cl->task = current;
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
|
||||
if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
|
||||
atomic_add(CLOSURE_SLEEPING, &cl->remaining);
|
||||
}
|
||||
|
||||
static inline void closure_set_stopped(struct closure *cl)
|
||||
{
|
||||
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
|
||||
|
@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
|
|||
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
|
||||
struct workqueue_struct *wq)
|
||||
{
|
||||
BUG_ON(object_is_on_stack(cl));
|
||||
closure_set_ip(cl);
|
||||
cl->fn = fn;
|
||||
cl->wq = wq;
|
||||
|
@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
|
|||
static inline void closure_init_stack(struct closure *cl)
|
||||
{
|
||||
memset(cl, 0, sizeof(struct closure));
|
||||
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
|
||||
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
|
|||
* This is because after calling continue_at() you no longer have a ref on @cl,
|
||||
* and whatever @cl owns may be freed out from under you - a running closure fn
|
||||
* has a ref on its own closure which continue_at() drops.
|
||||
*
|
||||
* Note you are expected to immediately return after using this macro.
|
||||
*/
|
||||
#define continue_at(_cl, _fn, _wq) \
|
||||
do { \
|
||||
|
|
|
@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
|
|||
return;
|
||||
check->bi_opf = REQ_OP_READ;
|
||||
|
||||
if (bio_alloc_pages(check, GFP_NOIO))
|
||||
if (bch_bio_alloc_pages(check, GFP_NOIO))
|
||||
goto out_put;
|
||||
|
||||
submit_bio_wait(check);
|
||||
|
@ -251,8 +251,7 @@ void bch_debug_exit(void)
|
|||
|
||||
int __init bch_debug_init(struct kobject *kobj)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
debug = debugfs_create_dir("bcache", NULL);
|
||||
return ret;
|
||||
|
||||
return IS_ERR_OR_NULL(debug);
|
||||
}
|
||||
|
|
|
@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
|
|||
|
||||
/* IO errors */
|
||||
|
||||
void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
|
||||
void bch_count_io_errors(struct cache *ca,
|
||||
blk_status_t error,
|
||||
int is_read,
|
||||
const char *m)
|
||||
{
|
||||
/*
|
||||
* The halflife of an error is:
|
||||
|
@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
|
|||
errors >>= IO_ERROR_SHIFT;
|
||||
|
||||
if (errors < ca->set->error_limit)
|
||||
pr_err("%s: IO error on %s, recovering",
|
||||
bdevname(ca->bdev, buf), m);
|
||||
pr_err("%s: IO error on %s%s",
|
||||
bdevname(ca->bdev, buf), m,
|
||||
is_read ? ", recovering." : ".");
|
||||
else
|
||||
bch_cache_set_error(ca->set,
|
||||
"%s: too many IO errors %s",
|
||||
|
@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
|
|||
{
|
||||
struct bbio *b = container_of(bio, struct bbio, bio);
|
||||
struct cache *ca = PTR_CACHE(c, &b->key, 0);
|
||||
int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
|
||||
|
||||
unsigned threshold = op_is_write(bio_op(bio))
|
||||
? c->congested_write_threshold_us
|
||||
|
@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
|
|||
atomic_inc(&c->congested);
|
||||
}
|
||||
|
||||
bch_count_io_errors(ca, error, m);
|
||||
bch_count_io_errors(ca, error, is_read, m);
|
||||
}
|
||||
|
||||
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
|
||||
|
|
|
@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
|
|||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio->bi_end_io = read_moving_endio;
|
||||
|
||||
if (bio_alloc_pages(bio, GFP_KERNEL))
|
||||
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
|
||||
goto err;
|
||||
|
||||
trace_bcache_gc_copy(&w->key);
|
||||
|
|
|
@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
|
|||
{
|
||||
struct search *s = container_of(cl, struct search, iop.cl);
|
||||
struct bio *bio = &s->bio.bio;
|
||||
struct cached_dev *dc;
|
||||
int ret;
|
||||
|
||||
bch_btree_op_init(&s->op, -1);
|
||||
|
@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
|
|||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We might hit an error when searching the btree. If that happens, we will
|
||||
* get a negative ret; in this scenario we should not recover data from the
|
||||
* backing device (when the cache device is dirty) because we don't know
|
||||
* whether the bkeys covered by the read request are all clean.
|
||||
*
|
||||
* And after that happened, s->iop.status is still its initial value
|
||||
* before we submit s->bio.bio
|
||||
*/
|
||||
if (ret < 0) {
|
||||
BUG_ON(ret == -EINTR);
|
||||
if (s->d && s->d->c &&
|
||||
!UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
|
||||
dc = container_of(s->d, struct cached_dev, disk);
|
||||
if (dc && atomic_read(&dc->has_dirty))
|
||||
s->recoverable = false;
|
||||
}
|
||||
if (!s->iop.status)
|
||||
s->iop.status = BLK_STS_IOERR;
|
||||
}
|
||||
|
||||
closure_return(cl);
|
||||
}
|
||||
|
||||
|
@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
|
|||
static void bio_complete(struct search *s)
|
||||
{
|
||||
if (s->orig_bio) {
|
||||
struct request_queue *q = s->orig_bio->bi_disk->queue;
|
||||
generic_end_io_acct(q, bio_data_dir(s->orig_bio),
|
||||
generic_end_io_acct(s->d->disk->queue,
|
||||
bio_data_dir(s->orig_bio),
|
||||
&s->d->disk->part0, s->start_time);
|
||||
|
||||
trace_bcache_request_end(s->d, s->orig_bio);
|
||||
|
@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
|
|||
cache_bio->bi_private = &s->cl;
|
||||
|
||||
bch_bio_map(cache_bio, NULL);
|
||||
if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
|
||||
if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
|
||||
goto out_put;
|
||||
|
||||
if (reada)
|
||||
|
@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
|
|||
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
|
||||
int rw = bio_data_dir(bio);
|
||||
|
||||
atomic_set(&dc->backing_idle, 0);
|
||||
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
|
||||
|
||||
bio_set_dev(bio, dc->bdev);
|
||||
|
|
|
@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
|
|||
|
||||
static void __write_super(struct cache_sb *sb, struct bio *bio)
|
||||
{
|
||||
struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
|
||||
struct cache_sb *out = page_address(bio_first_page_all(bio));
|
||||
unsigned i;
|
||||
|
||||
bio->bi_iter.bi_sector = SB_SECTOR;
|
||||
|
@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
|
|||
{
|
||||
struct cache *ca = bio->bi_private;
|
||||
|
||||
bch_count_io_errors(ca, bio->bi_status, "writing superblock");
|
||||
/* is_read = 0 */
|
||||
bch_count_io_errors(ca, bio->bi_status, 0,
|
||||
"writing superblock");
|
||||
closure_put(&ca->set->sb_write);
|
||||
}
|
||||
|
||||
|
@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
|
|||
d->c = c;
|
||||
c->devices[id] = d;
|
||||
|
||||
if (id >= c->devices_max_used)
|
||||
c->devices_max_used = id + 1;
|
||||
|
||||
closure_get(&c->caching);
|
||||
}
|
||||
|
||||
|
@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
|
|||
|
||||
mutex_lock(&bch_register_lock);
|
||||
|
||||
cancel_delayed_work_sync(&dc->writeback_rate_update);
|
||||
if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
|
||||
kthread_stop(dc->writeback_thread);
|
||||
dc->writeback_thread = NULL;
|
||||
}
|
||||
|
||||
memset(&dc->sb.set_uuid, 0, 16);
|
||||
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
|
||||
|
||||
|
@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
|
|||
dc->bdev->bd_holder = dc;
|
||||
|
||||
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
|
||||
dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
|
||||
bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
|
||||
get_page(sb_page);
|
||||
|
||||
if (cached_dev_init(dc, sb->block_size << 9))
|
||||
|
@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
|
|||
struct uuid_entry *u;
|
||||
|
||||
for (u = c->uuids;
|
||||
u < c->uuids + c->nr_uuids && !ret;
|
||||
u < c->uuids + c->devices_max_used && !ret;
|
||||
u++)
|
||||
if (UUID_FLASH_ONLY(u))
|
||||
ret = flash_dev_run(c, u);
|
||||
|
@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
|
|||
|
||||
mutex_lock(&bch_register_lock);
|
||||
|
||||
for (i = 0; i < c->nr_uuids; i++)
|
||||
for (i = 0; i < c->devices_max_used; i++)
|
||||
if (c->devices[i]) {
|
||||
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
|
||||
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
|
||||
|
@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
|||
c->bucket_bits = ilog2(sb->bucket_size);
|
||||
c->block_bits = ilog2(sb->block_size);
|
||||
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
|
||||
|
||||
c->devices_max_used = 0;
|
||||
c->btree_pages = bucket_pages(c);
|
||||
if (c->btree_pages > BTREE_MAX_PAGES)
|
||||
c->btree_pages = max_t(int, c->btree_pages / 4,
|
||||
|
@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
|
|||
free_fifo(&ca->free[i]);
|
||||
|
||||
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
|
||||
put_page(ca->sb_bio.bi_io_vec[0].bv_page);
|
||||
put_page(bio_first_page_all(&ca->sb_bio));
|
||||
|
||||
if (!IS_ERR_OR_NULL(ca->bdev))
|
||||
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
||||
|
@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
|
|||
ca->bdev->bd_holder = ca;
|
||||
|
||||
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
|
||||
ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
|
||||
bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
|
||||
get_page(sb_page);
|
||||
|
||||
if (blk_queue_discard(bdev_get_queue(ca->bdev)))
|
||||
|
|
|
@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
|
|||
: 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
|
||||
* the preferred way is bio_add_page, but in this case, bch_bio_map()
|
||||
* supposes that the bvec table is empty, so it is safe to access
|
||||
* .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
|
||||
* supported.
|
||||
*/
|
||||
void bch_bio_map(struct bio *bio, void *base)
|
||||
{
|
||||
size_t size = bio->bi_iter.bi_size;
|
||||
|
@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* bch_bio_alloc_pages - allocates a single page for each bvec in a bio
|
||||
* @bio: bio to allocate pages for
|
||||
* @gfp_mask: flags for allocation
|
||||
*
|
||||
* Allocates pages up to @bio->bi_vcnt.
|
||||
*
|
||||
* Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
|
||||
* freed.
|
||||
*/
|
||||
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
|
||||
{
|
||||
int i;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bio_for_each_segment_all(bv, bio, i) {
|
||||
bv->bv_page = alloc_page(gfp_mask);
|
||||
if (!bv->bv_page) {
|
||||
while (--bv >= bio->bi_io_vec)
|
||||
__free_page(bv->bv_page);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
|
||||
* use permitted, subject to terms of PostgreSQL license; see.)
|
||||
|
|
|
@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
|
|||
}
|
||||
|
||||
void bch_bio_map(struct bio *bio, void *base);
|
||||
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
|
||||
|
||||
static inline sector_t bdev_sectors(struct block_device *bdev)
|
||||
{
|
||||
|
|
|
@ -18,17 +18,39 @@
|
|||
#include <trace/events/bcache.h>
|
||||
|
||||
/* Rate limiting */
|
||||
static uint64_t __calc_target_rate(struct cached_dev *dc)
|
||||
{
|
||||
struct cache_set *c = dc->disk.c;
|
||||
|
||||
/*
|
||||
* This is the size of the cache, minus the amount used for
|
||||
* flash-only devices
|
||||
*/
|
||||
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
|
||||
bcache_flash_devs_sectors_dirty(c);
|
||||
|
||||
/*
|
||||
* Unfortunately there is no control of global dirty data. If the
|
||||
* user states that they want 10% dirty data in the cache, and has,
|
||||
* e.g., 5 backing volumes of equal size, we try and ensure each
|
||||
* backing volume uses about 2% of the cache for dirty data.
|
||||
*/
|
||||
uint32_t bdev_share =
|
||||
div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
|
||||
c->cached_dev_sectors);
|
||||
|
||||
uint64_t cache_dirty_target =
|
||||
div_u64(cache_sectors * dc->writeback_percent, 100);
|
||||
|
||||
/* Ensure each backing dev gets at least one dirty share */
|
||||
if (bdev_share < 1)
|
||||
bdev_share = 1;
|
||||
|
||||
return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
|
||||
}
|
||||
|
||||
static void __update_writeback_rate(struct cached_dev *dc)
|
||||
{
|
||||
struct cache_set *c = dc->disk.c;
|
||||
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
|
||||
bcache_flash_devs_sectors_dirty(c);
|
||||
uint64_t cache_dirty_target =
|
||||
div_u64(cache_sectors * dc->writeback_percent, 100);
|
||||
int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
|
||||
c->cached_dev_sectors);
|
||||
|
||||
/*
|
||||
* PI controller:
|
||||
* Figures out the amount that should be written per second.
|
||||
|
@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
|
|||
* This acts as a slow, long-term average that is not subject to
|
||||
* variations in usage like the p term.
|
||||
*/
|
||||
int64_t target = __calc_target_rate(dc);
|
||||
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
|
||||
int64_t error = dirty - target;
|
||||
int64_t proportional_scaled =
|
||||
|
@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
|
|||
struct dirty_io {
|
||||
struct closure cl;
|
||||
struct cached_dev *dc;
|
||||
uint16_t sequence;
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
|
@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
|
|||
{
|
||||
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
|
||||
struct keybuf_key *w = io->bio.bi_private;
|
||||
struct cached_dev *dc = io->dc;
|
||||
|
||||
uint16_t next_sequence;
|
||||
|
||||
if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
|
||||
/* Not our turn to write; wait for a write to complete */
|
||||
closure_wait(&dc->writeback_ordering_wait, cl);
|
||||
|
||||
if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
|
||||
/*
|
||||
* Edge case -- the sequence reached our turn in indeterminate
|
||||
* order relative to when we were added to the wait list.
|
||||
*/
|
||||
closure_wake_up(&dc->writeback_ordering_wait);
|
||||
}
|
||||
|
||||
continue_at(cl, write_dirty, io->dc->writeback_write_wq);
|
||||
return;
|
||||
}
|
||||
|
||||
next_sequence = io->sequence + 1;
|
||||
|
||||
/*
|
||||
* IO errors are signalled using the dirty bit on the key.
|
||||
|
@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
|
|||
closure_bio_submit(&io->bio, cl);
|
||||
}
|
||||
|
||||
atomic_set(&dc->writeback_sequence_next, next_sequence);
|
||||
closure_wake_up(&dc->writeback_ordering_wait);
|
||||
|
||||
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
|
||||
}
|
||||
|
||||
|
@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
|
|||
struct keybuf_key *w = bio->bi_private;
|
||||
struct dirty_io *io = w->private;
|
||||
|
||||
/* is_read = 1 */
|
||||
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
|
||||
bio->bi_status, "reading dirty data from cache");
|
||||
bio->bi_status, 1,
|
||||
"reading dirty data from cache");
|
||||
|
||||
dirty_endio(bio);
|
||||
}
|
||||
|
@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
|
|||
static void read_dirty(struct cached_dev *dc)
|
||||
{
|
||||
unsigned delay = 0;
|
||||
struct keybuf_key *w;
|
||||
struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
|
||||
size_t size;
|
||||
int nk, i;
|
||||
struct dirty_io *io;
|
||||
struct closure cl;
|
||||
uint16_t sequence = 0;
|
||||
|
||||
BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
|
||||
atomic_set(&dc->writeback_sequence_next, sequence);
|
||||
closure_init_stack(&cl);
|
||||
|
||||
/*
|
||||
|
@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
|
|||
* mempools.
|
||||
*/
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
next = bch_keybuf_next(&dc->writeback_keys);
|
||||
|
||||
w = bch_keybuf_next(&dc->writeback_keys);
|
||||
if (!w)
|
||||
break;
|
||||
while (!kthread_should_stop() && next) {
|
||||
size = 0;
|
||||
nk = 0;
|
||||
|
||||
BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
|
||||
do {
|
||||
BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
|
||||
|
||||
if (KEY_START(&w->key) != dc->last_read ||
|
||||
jiffies_to_msecs(delay) > 50)
|
||||
while (!kthread_should_stop() && delay)
|
||||
delay = schedule_timeout_interruptible(delay);
|
||||
/*
|
||||
* Don't combine too many operations, even if they
|
||||
* are all small.
|
||||
*/
|
||||
if (nk >= MAX_WRITEBACKS_IN_PASS)
|
||||
break;
|
||||
|
||||
dc->last_read = KEY_OFFSET(&w->key);
|
||||
/*
|
||||
* If the current operation is very large, don't
|
||||
* further combine operations.
|
||||
*/
|
||||
if (size >= MAX_WRITESIZE_IN_PASS)
|
||||
break;
|
||||
|
||||
io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
|
||||
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
|
||||
GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
/*
|
||||
* Operations are only eligible to be combined
|
||||
* if they are contiguous.
|
||||
*
|
||||
* TODO: add a heuristic willing to fire a
|
||||
* certain amount of non-contiguous IO per pass,
|
||||
* so that we can benefit from backing device
|
||||
* command queueing.
|
||||
*/
|
||||
if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
|
||||
&START_KEY(&next->key)))
|
||||
break;
|
||||
|
||||
w->private = io;
|
||||
io->dc = dc;
|
||||
size += KEY_SIZE(&next->key);
|
||||
keys[nk++] = next;
|
||||
} while ((next = bch_keybuf_next(&dc->writeback_keys)));
|
||||
|
||||
dirty_init(w);
|
||||
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
|
||||
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
|
||||
bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
|
||||
io->bio.bi_end_io = read_dirty_endio;
|
||||
/* Now we have gathered a set of 1..MAX_WRITEBACKS_IN_PASS keys to write back. */
|
||||
for (i = 0; i < nk; i++) {
|
||||
w = keys[i];
|
||||
|
||||
if (bio_alloc_pages(&io->bio, GFP_KERNEL))
|
||||
goto err_free;
|
||||
io = kzalloc(sizeof(struct dirty_io) +
|
||||
sizeof(struct bio_vec) *
|
||||
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
|
||||
GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
|
||||
trace_bcache_writeback(&w->key);
|
||||
w->private = io;
|
||||
io->dc = dc;
|
||||
io->sequence = sequence++;
|
||||
|
||||
down(&dc->in_flight);
|
||||
closure_call(&io->cl, read_dirty_submit, NULL, &cl);
|
||||
dirty_init(w);
|
||||
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
|
||||
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
|
||||
bio_set_dev(&io->bio,
|
||||
PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
|
||||
io->bio.bi_end_io = read_dirty_endio;
|
||||
|
||||
delay = writeback_delay(dc, KEY_SIZE(&w->key));
|
||||
if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
|
||||
goto err_free;
|
||||
|
||||
trace_bcache_writeback(&w->key);
|
||||
|
||||
down(&dc->in_flight);
|
||||
|
||||
/* We've acquired a semaphore for the maximum
|
||||
* simultaneous number of writebacks; from here
|
||||
* everything happens asynchronously.
|
||||
*/
|
||||
closure_call(&io->cl, read_dirty_submit, NULL, &cl);
|
||||
}
|
||||
|
||||
delay = writeback_delay(dc, size);
|
||||
|
||||
/* If the control system would wait for at least half a
|
||||
* second, and there's been no reqs hitting the backing disk
|
||||
* for a while: use an alternate mode where we have at most
|
||||
* one contiguous set of writebacks in flight at a time. If
|
||||
* someone wants to do IO it will be quick, as it will only
|
||||
* have to contend with one operation in flight, and we'll
|
||||
* be round-tripping data to the backing disk as quickly as
|
||||
* it can accept it.
|
||||
*/
|
||||
if (delay >= HZ / 2) {
|
||||
/* 3 means at least 1.5 seconds, up to 7.5 if we
|
||||
* have slowed way down.
|
||||
*/
|
||||
if (atomic_inc_return(&dc->backing_idle) >= 3) {
|
||||
/* Wait for current I/Os to finish */
|
||||
closure_sync(&cl);
|
||||
/* And immediately launch a new set. */
|
||||
delay = 0;
|
||||
}
|
||||
}
|
||||
|
||||
while (!kthread_should_stop() && delay) {
|
||||
schedule_timeout_interruptible(delay);
|
||||
delay = writeback_delay(dc, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (0) {
|
||||
|
|
|
@ -5,6 +5,16 @@
|
|||
#define CUTOFF_WRITEBACK 40
|
||||
#define CUTOFF_WRITEBACK_SYNC 70
|
||||
|
||||
#define MAX_WRITEBACKS_IN_PASS 5
|
||||
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
|
||||
|
||||
/*
|
||||
* A shift of 14 (shares expressed in 16384ths) is chosen so that each
|
||||
* backing device gets a reasonable fraction of the share, and so that
|
||||
* things do not blow up until individual backing devices are a petabyte.
|
||||
*/
|
||||
#define WRITEBACK_SHARE_SHIFT 14
|
||||
|
||||
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
|
||||
{
|
||||
uint64_t i, ret = 0;
|
||||
|
@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
|
|||
|
||||
mutex_lock(&bch_register_lock);
|
||||
|
||||
for (i = 0; i < c->nr_uuids; i++) {
|
||||
for (i = 0; i < c->devices_max_used; i++) {
|
||||
struct bcache_device *d = c->devices[i];
|
||||
|
||||
if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
|
||||
|
|
|
@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
|
|||
bio_for_each_segment_all(bv, clone, i) {
|
||||
BUG_ON(!bv->bv_page);
|
||||
mempool_free(bv->bv_page, cc->page_pool);
|
||||
bv->bv_page = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work)
|
|||
activate_or_offline_path(pgpath);
|
||||
}
|
||||
|
||||
static int noretry_error(blk_status_t error)
|
||||
{
|
||||
switch (error) {
|
||||
case BLK_STS_NOTSUPP:
|
||||
case BLK_STS_NOSPC:
|
||||
case BLK_STS_TARGET:
|
||||
case BLK_STS_NEXUS:
|
||||
case BLK_STS_MEDIUM:
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Anything else could be a path failure, so should be retried */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int multipath_end_io(struct dm_target *ti, struct request *clone,
|
||||
blk_status_t error, union map_info *map_context)
|
||||
{
|
||||
|
@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
|
|||
* request into dm core, which will remake a clone request and
|
||||
* clone bios for it and resubmit it later.
|
||||
*/
|
||||
if (error && !noretry_error(error)) {
|
||||
if (error && blk_path_error(error)) {
|
||||
struct multipath *m = ti->private;
|
||||
|
||||
r = DM_ENDIO_REQUEUE;
|
||||
|
@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
|
|||
unsigned long flags;
|
||||
int r = DM_ENDIO_DONE;
|
||||
|
||||
if (!*error || noretry_error(*error))
|
||||
if (!*error || !blk_path_error(*error))
|
||||
goto done;
|
||||
|
||||
if (pgpath)
|
||||
|
|
|
@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
|
|||
dm_complete_request(tio->orig, error);
|
||||
}
|
||||
|
||||
static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
|
||||
static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
|
||||
{
|
||||
blk_status_t r;
|
||||
|
||||
|
@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
|
|||
|
||||
clone->start_time = jiffies;
|
||||
r = blk_insert_cloned_request(clone->q, clone);
|
||||
if (r)
|
||||
if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
|
||||
/* must complete clone in terms of original request */
|
||||
dm_complete_request(rq, r);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
|
||||
|
@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
|
|||
struct mapped_device *md = tio->md;
|
||||
struct request *rq = tio->orig;
|
||||
struct request *clone = NULL;
|
||||
blk_status_t ret;
|
||||
|
||||
r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
|
||||
check_again:
|
||||
switch (r) {
|
||||
case DM_MAPIO_SUBMITTED:
|
||||
/* The target has taken the I/O to submit by itself later */
|
||||
|
@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
|
|||
/* The target has remapped the I/O so dispatch it */
|
||||
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
|
||||
blk_rq_pos(rq));
|
||||
dm_dispatch_clone_request(clone, rq);
|
||||
ret = dm_dispatch_clone_request(clone, rq);
|
||||
if (ret == BLK_STS_RESOURCE) {
|
||||
blk_rq_unprep_clone(clone);
|
||||
tio->ti->type->release_clone_rq(clone);
|
||||
tio->clone = NULL;
|
||||
if (!rq->q->mq_ops)
|
||||
r = DM_MAPIO_DELAY_REQUEUE;
|
||||
else
|
||||
r = DM_MAPIO_REQUEUE;
|
||||
goto check_again;
|
||||
}
|
||||
break;
|
||||
case DM_MAPIO_REQUEUE:
|
||||
/* The target wants to requeue the I/O */
|
||||
|
@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
|
|||
return error;
|
||||
}
|
||||
|
||||
elv_register_queue(md->queue);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
|
|||
}
|
||||
dm_init_md_queue(md);
|
||||
|
||||
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
|
||||
err = blk_mq_register_dev(disk_to_dev(md->disk), q);
|
||||
if (err)
|
||||
goto out_cleanup_queue;
|
||||
|
||||
return 0;
|
||||
|
||||
out_cleanup_queue:
|
||||
blk_cleanup_queue(q);
|
||||
out_tag_set:
|
||||
blk_mq_free_tag_set(md->tag_set);
|
||||
out_kfree_tag_set:
|
||||
|
|
|
@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
ti->max_io_len = (uint32_t) len;
|
||||
/*
|
||||
* BIO based queue uses its own splitting. When multipage bvecs
|
||||
* are switched on, the size of the incoming bio may be too big to
|
||||
* be handled in some targets, such as crypt.
|
||||
*
|
||||
* When these targets are ready for the big bio, we can remove
|
||||
* the limit.
|
||||
*/
|
||||
ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
|
|||
goto bad;
|
||||
md->dax_dev = dax_dev;
|
||||
|
||||
add_disk(md->disk);
|
||||
add_disk_no_queue_reg(md->disk);
|
||||
format_dev_t(md->name, MKDEV(_major, minor));
|
||||
|
||||
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
|
||||
|
@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
|
|||
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
|
||||
{
|
||||
int r;
|
||||
struct queue_limits limits;
|
||||
enum dm_queue_mode type = dm_get_md_type(md);
|
||||
|
||||
switch (type) {
|
||||
|
@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
|
|||
break;
|
||||
}
|
||||
|
||||
r = dm_calculate_queue_limits(t, &limits);
|
||||
if (r) {
|
||||
DMERR("Cannot calculate initial queue limits");
|
||||
return r;
|
||||
}
|
||||
dm_table_set_restrictions(t, md->queue, &limits);
|
||||
blk_register_queue(md->disk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
ccflags-y += -I$(src)
|
||||
|
||||
obj-$(CONFIG_NVME_CORE) += nvme-core.o
|
||||
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
|
||||
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
|
||||
|
@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
|
|||
obj-$(CONFIG_NVME_FC) += nvme-fc.o
|
||||
|
||||
nvme-core-y := core.o
|
||||
nvme-core-$(CONFIG_TRACING) += trace.o
|
||||
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
|
||||
nvme-core-$(CONFIG_NVM) += lightnvm.o
|
||||
|
||||
|
|
|
@ -29,6 +29,9 @@
|
|||
#include <linux/pm_qos.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include "trace.h"
|
||||
|
||||
#include "nvme.h"
|
||||
#include "fabrics.h"
|
||||
|
||||
|
@ -65,9 +68,26 @@ static bool streams;
|
|||
module_param(streams, bool, 0644);
|
||||
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
|
||||
|
||||
/*
|
||||
* nvme_wq - hosts nvme related works that are not reset or delete
|
||||
* nvme_reset_wq - hosts nvme reset works
|
||||
* nvme_delete_wq - hosts nvme delete works
|
||||
*
|
||||
* nvme_wq will host works such as scan, aen handling, fw activation,
|
||||
* keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
|
||||
* runs reset works which also flush works hosted on nvme_wq for
|
||||
* serialization purposes. nvme_delete_wq hosts controller deletion
|
||||
* works which flush reset works for serialization.
|
||||
*/
|
||||
struct workqueue_struct *nvme_wq;
|
||||
EXPORT_SYMBOL_GPL(nvme_wq);
|
||||
|
||||
struct workqueue_struct *nvme_reset_wq;
|
||||
EXPORT_SYMBOL_GPL(nvme_reset_wq);
|
||||
|
||||
struct workqueue_struct *nvme_delete_wq;
|
||||
EXPORT_SYMBOL_GPL(nvme_delete_wq);
|
||||
|
||||
static DEFINE_IDA(nvme_subsystems_ida);
|
||||
static LIST_HEAD(nvme_subsystems);
|
||||
static DEFINE_MUTEX(nvme_subsystems_lock);
|
||||
|
@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
|
|||
{
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
|
||||
return -EBUSY;
|
||||
if (!queue_work(nvme_wq, &ctrl->reset_work))
|
||||
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
|
||||
return -EBUSY;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
|
||||
|
||||
static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
int ret;
|
||||
|
||||
|
@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
|
|||
flush_work(&ctrl->reset_work);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
|
||||
|
||||
static void nvme_delete_ctrl_work(struct work_struct *work)
|
||||
{
|
||||
|
@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
|
|||
{
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
|
||||
return -EBUSY;
|
||||
if (!queue_work(nvme_wq, &ctrl->delete_work))
|
||||
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
|
||||
return -EBUSY;
|
||||
return 0;
|
||||
}
|
||||
|
@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req)
|
|||
return BLK_STS_OK;
|
||||
case NVME_SC_CAP_EXCEEDED:
|
||||
return BLK_STS_NOSPC;
|
||||
case NVME_SC_LBA_RANGE:
|
||||
return BLK_STS_TARGET;
|
||||
case NVME_SC_BAD_ATTRIBUTES:
|
||||
case NVME_SC_ONCS_NOT_SUPPORTED:
|
||||
case NVME_SC_INVALID_OPCODE:
|
||||
case NVME_SC_INVALID_FIELD:
|
||||
case NVME_SC_INVALID_NS:
|
||||
return BLK_STS_NOTSUPP;
|
||||
case NVME_SC_WRITE_FAULT:
|
||||
case NVME_SC_READ_ERROR:
|
||||
case NVME_SC_UNWRITTEN_BLOCK:
|
||||
case NVME_SC_ACCESS_DENIED:
|
||||
case NVME_SC_READ_ONLY:
|
||||
case NVME_SC_COMPARE_FAILED:
|
||||
return BLK_STS_MEDIUM;
|
||||
case NVME_SC_GUARD_CHECK:
|
||||
case NVME_SC_APPTAG_CHECK:
|
||||
|
@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req)
|
|||
|
||||
void nvme_complete_rq(struct request *req)
|
||||
{
|
||||
if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
|
||||
if (nvme_req_needs_failover(req)) {
|
||||
blk_status_t status = nvme_error_status(req);
|
||||
|
||||
trace_nvme_complete_rq(req);
|
||||
|
||||
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
|
||||
if (nvme_req_needs_failover(req, status)) {
|
||||
nvme_failover_req(req);
|
||||
return;
|
||||
}
|
||||
|
@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req)
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
blk_mq_end_request(req, nvme_error_status(req));
|
||||
blk_mq_end_request(req, status);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_complete_rq);
|
||||
|
||||
|
@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
|||
|
||||
old_state = ctrl->state;
|
||||
switch (new_state) {
|
||||
case NVME_CTRL_ADMIN_ONLY:
|
||||
switch (old_state) {
|
||||
case NVME_CTRL_RECONNECTING:
|
||||
changed = true;
|
||||
/* FALLTHRU */
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case NVME_CTRL_LIVE:
|
||||
switch (old_state) {
|
||||
case NVME_CTRL_NEW:
|
||||
|
@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
|||
switch (old_state) {
|
||||
case NVME_CTRL_NEW:
|
||||
case NVME_CTRL_LIVE:
|
||||
case NVME_CTRL_ADMIN_ONLY:
|
||||
changed = true;
|
||||
/* FALLTHRU */
|
||||
default:
|
||||
|
@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
|||
case NVME_CTRL_DELETING:
|
||||
switch (old_state) {
|
||||
case NVME_CTRL_LIVE:
|
||||
case NVME_CTRL_ADMIN_ONLY:
|
||||
case NVME_CTRL_RESETTING:
|
||||
case NVME_CTRL_RECONNECTING:
|
||||
changed = true;
|
||||
|
@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
|
|||
}
|
||||
|
||||
cmd->common.command_id = req->tag;
|
||||
if (ns)
|
||||
trace_nvme_setup_nvm_cmd(req->q->id, cmd);
|
||||
else
|
||||
trace_nvme_setup_admin_cmd(cmd);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
|
||||
|
@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
|
|||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
/* should never be called due to GENHD_FL_HIDDEN */
|
||||
if (WARN_ON_ONCE(ns->head->disk))
|
||||
return -ENXIO;
|
||||
goto fail;
|
||||
#endif
|
||||
if (!kref_get_unless_zero(&ns->kref))
|
||||
return -ENXIO;
|
||||
goto fail;
|
||||
if (!try_module_get(ns->ctrl->ops->module))
|
||||
goto fail_put_ns;
|
||||
|
||||
return 0;
|
||||
|
||||
fail_put_ns:
|
||||
nvme_put_ns(ns);
|
||||
fail:
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
static void nvme_release(struct gendisk *disk, fmode_t mode)
|
||||
{
|
||||
nvme_put_ns(disk->private_data);
|
||||
struct nvme_ns *ns = disk->private_data;
|
||||
|
||||
module_put(ns->ctrl->ops->module);
|
||||
nvme_put_ns(ns);
|
||||
}
|
||||
|
||||
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
||||
|
@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
|
|||
NULL,
|
||||
};
|
||||
|
||||
static int nvme_active_ctrls(struct nvme_subsystem *subsys)
|
||||
{
|
||||
int count = 0;
|
||||
struct nvme_ctrl *ctrl;
|
||||
|
||||
mutex_lock(&subsys->lock);
|
||||
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
|
||||
if (ctrl->state != NVME_CTRL_DELETING &&
|
||||
ctrl->state != NVME_CTRL_DEAD)
|
||||
count++;
|
||||
}
|
||||
mutex_unlock(&subsys->lock);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
||||
{
|
||||
struct nvme_subsystem *subsys, *found;
|
||||
|
@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
* Verify that the subsystem actually supports multiple
|
||||
* controllers, else bail out.
|
||||
*/
|
||||
if (!(id->cmic & (1 << 1))) {
|
||||
if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
|
||||
dev_err(ctrl->device,
|
||||
"ignoring ctrl due to duplicate subnqn (%s).\n",
|
||||
found->subnqn);
|
||||
|
@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
|||
shutdown_timeout, 60);
|
||||
|
||||
if (ctrl->shutdown_timeout != shutdown_timeout)
|
||||
dev_warn(ctrl->device,
|
||||
dev_info(ctrl->device,
|
||||
"Shutdown timeout set to %u seconds\n",
|
||||
ctrl->shutdown_timeout);
|
||||
} else
|
||||
|
@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
|
|||
struct nvme_ctrl *ctrl =
|
||||
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
|
||||
|
||||
if (ctrl->state != NVME_CTRL_LIVE)
|
||||
switch (ctrl->state) {
|
||||
case NVME_CTRL_LIVE:
|
||||
case NVME_CTRL_ADMIN_ONLY:
|
||||
break;
|
||||
default:
|
||||
return -EWOULDBLOCK;
|
||||
}
|
||||
|
||||
file->private_data = ctrl;
|
||||
return 0;
|
||||
}
|
||||
|
@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
|
|||
static const char *const state_name[] = {
|
||||
[NVME_CTRL_NEW] = "new",
|
||||
[NVME_CTRL_LIVE] = "live",
|
||||
[NVME_CTRL_ADMIN_ONLY] = "only-admin",
|
||||
[NVME_CTRL_RESETTING] = "resetting",
|
||||
[NVME_CTRL_RECONNECTING]= "reconnecting",
|
||||
[NVME_CTRL_DELETING] = "deleting",
|
||||
|
@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work)
|
|||
if (ctrl->state != NVME_CTRL_LIVE)
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(!ctrl->tagset);
|
||||
|
||||
if (nvme_identify_ctrl(ctrl, &id))
|
||||
return;
|
||||
|
||||
|
@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work)
|
|||
void nvme_queue_scan(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
/*
|
||||
* Do not queue new scan work when a controller is reset during
|
||||
* removal.
|
||||
* Only new queue scan work when admin and IO queues are both alive
|
||||
*/
|
||||
if (ctrl->state == NVME_CTRL_LIVE)
|
||||
queue_work(nvme_wq, &ctrl->scan_work);
|
||||
|
@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
|
|||
|
||||
int __init nvme_core_init(void)
|
||||
{
|
||||
int result;
|
||||
int result = -ENOMEM;
|
||||
|
||||
nvme_wq = alloc_workqueue("nvme-wq",
|
||||
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
|
||||
if (!nvme_wq)
|
||||
return -ENOMEM;
|
||||
goto out;
|
||||
|
||||
nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
|
||||
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
|
||||
if (!nvme_reset_wq)
|
||||
goto destroy_wq;
|
||||
|
||||
nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
|
||||
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
|
||||
if (!nvme_delete_wq)
|
||||
goto destroy_reset_wq;
|
||||
|
||||
result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
|
||||
if (result < 0)
|
||||
goto destroy_wq;
|
||||
goto destroy_delete_wq;
|
||||
|
||||
nvme_class = class_create(THIS_MODULE, "nvme");
|
||||
if (IS_ERR(nvme_class)) {
|
||||
|
@ -3505,8 +3596,13 @@ destroy_class:
|
|||
class_destroy(nvme_class);
|
||||
unregister_chrdev:
|
||||
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
|
||||
destroy_delete_wq:
|
||||
destroy_workqueue(nvme_delete_wq);
|
||||
destroy_reset_wq:
|
||||
destroy_workqueue(nvme_reset_wq);
|
||||
destroy_wq:
|
||||
destroy_workqueue(nvme_wq);
|
||||
out:
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -3516,6 +3612,8 @@ void nvme_core_exit(void)
|
|||
class_destroy(nvme_subsys_class);
|
||||
class_destroy(nvme_class);
|
||||
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
|
||||
destroy_workqueue(nvme_delete_wq);
|
||||
destroy_workqueue(nvme_reset_wq);
|
||||
destroy_workqueue(nvme_wq);
|
||||
}
|
||||
|
||||
|
|
|
@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
|
|||
*/
|
||||
int nvmf_register_transport(struct nvmf_transport_ops *ops)
|
||||
{
|
||||
if (!ops->create_ctrl)
|
||||
if (!ops->create_ctrl || !ops->module)
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&nvmf_transports_rwsem);
|
||||
|
@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
|
|||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
if (uuid_parse(p, &hostid)) {
|
||||
ret = uuid_parse(p, &hostid);
|
||||
if (ret) {
|
||||
pr_err("Invalid hostid %s\n", p);
|
||||
ret = -EINVAL;
|
||||
kfree(p);
|
||||
goto out;
|
||||
}
|
||||
kfree(p);
|
||||
break;
|
||||
case NVMF_OPT_DUP_CONNECT:
|
||||
opts->duplicate_connect = true;
|
||||
|
@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
|
|||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (!try_module_get(ops->module)) {
|
||||
ret = -EBUSY;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = nvmf_check_required_opts(opts, ops->required_opts);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
goto out_module_put;
|
||||
ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
|
||||
ops->allowed_opts | ops->required_opts);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
goto out_module_put;
|
||||
|
||||
ctrl = ops->create_ctrl(dev, opts);
|
||||
if (IS_ERR(ctrl)) {
|
||||
ret = PTR_ERR(ctrl);
|
||||
goto out_unlock;
|
||||
goto out_module_put;
|
||||
}
|
||||
|
||||
if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
|
||||
dev_warn(ctrl->device,
|
||||
"controller returned incorrect NQN: \"%s\".\n",
|
||||
ctrl->subsys->subnqn);
|
||||
module_put(ops->module);
|
||||
up_read(&nvmf_transports_rwsem);
|
||||
nvme_delete_ctrl_sync(ctrl);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
module_put(ops->module);
|
||||
up_read(&nvmf_transports_rwsem);
|
||||
return ctrl;
|
||||
|
||||
out_module_put:
|
||||
module_put(ops->module);
|
||||
out_unlock:
|
||||
up_read(&nvmf_transports_rwsem);
|
||||
out_free_opts:
|
||||
|
|
|
@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
|
|||
* fabric implementation of NVMe fabrics.
|
||||
* @entry: Used by the fabrics library to add the new
|
||||
* registration entry to its linked-list internal tree.
|
||||
* @module: Transport module reference
|
||||
* @name: Name of the NVMe fabric driver implementation.
|
||||
* @required_opts: sysfs command-line options that must be specified
|
||||
* when adding a new NVMe controller.
|
||||
|
@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
|
|||
*/
|
||||
struct nvmf_transport_ops {
|
||||
struct list_head entry;
|
||||
struct module *module;
|
||||
const char *name;
|
||||
int required_opts;
|
||||
int allowed_opts;
|
||||
|
|
|
@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
|
|||
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
|
||||
nvme_fc_free_queue(&ctrl->queues[0]);
|
||||
|
||||
/* re-enable the admin_q so anything new can fast fail */
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
nvme_fc_ctlr_inactive_on_rport(ctrl);
|
||||
}
|
||||
|
||||
|
@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
|
|||
* waiting for io to terminate
|
||||
*/
|
||||
nvme_fc_delete_association(ctrl);
|
||||
|
||||
/* resume the io queues so that things will fast fail */
|
||||
nvme_start_queues(nctrl);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
|
|||
|
||||
static struct nvmf_transport_ops nvme_fc_transport = {
|
||||
.name = "fc",
|
||||
.module = THIS_MODULE,
|
||||
.required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
|
||||
.allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
|
||||
.create_ctrl = nvme_fc_create_ctrl,
|
||||
|
|
|
@ -31,27 +31,10 @@
|
|||
|
||||
enum nvme_nvm_admin_opcode {
|
||||
nvme_nvm_admin_identity = 0xe2,
|
||||
nvme_nvm_admin_get_l2p_tbl = 0xea,
|
||||
nvme_nvm_admin_get_bb_tbl = 0xf2,
|
||||
nvme_nvm_admin_set_bb_tbl = 0xf1,
|
||||
};
|
||||
|
||||
struct nvme_nvm_hb_rw {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
__u16 command_id;
|
||||
__le32 nsid;
|
||||
__u64 rsvd2;
|
||||
__le64 metadata;
|
||||
__le64 prp1;
|
||||
__le64 prp2;
|
||||
__le64 spba;
|
||||
__le16 length;
|
||||
__le16 control;
|
||||
__le32 dsmgmt;
|
||||
__le64 slba;
|
||||
};
|
||||
|
||||
struct nvme_nvm_ph_rw {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
|
@ -80,19 +63,6 @@ struct nvme_nvm_identity {
|
|||
__u32 rsvd11[5];
|
||||
};
|
||||
|
||||
struct nvme_nvm_l2ptbl {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
__u16 command_id;
|
||||
__le32 nsid;
|
||||
__le32 cdw2[4];
|
||||
__le64 prp1;
|
||||
__le64 prp2;
|
||||
__le64 slba;
|
||||
__le32 nlb;
|
||||
__le16 cdw14[6];
|
||||
};
|
||||
|
||||
struct nvme_nvm_getbbtbl {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
|
@ -139,9 +109,7 @@ struct nvme_nvm_command {
|
|||
union {
|
||||
struct nvme_common_command common;
|
||||
struct nvme_nvm_identity identity;
|
||||
struct nvme_nvm_hb_rw hb_rw;
|
||||
struct nvme_nvm_ph_rw ph_rw;
|
||||
struct nvme_nvm_l2ptbl l2p;
|
||||
struct nvme_nvm_getbbtbl get_bb;
|
||||
struct nvme_nvm_setbbtbl set_bb;
|
||||
struct nvme_nvm_erase_blk erase;
|
||||
|
@ -167,7 +135,7 @@ struct nvme_nvm_id_group {
|
|||
__u8 num_lun;
|
||||
__u8 num_pln;
|
||||
__u8 rsvd1;
|
||||
__le16 num_blk;
|
||||
__le16 num_chk;
|
||||
__le16 num_pg;
|
||||
__le16 fpg_sz;
|
||||
__le16 csecs;
|
||||
|
@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl {
|
|||
static inline void _nvme_nvm_check_size(void)
|
||||
{
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
|
||||
|
@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void)
|
|||
static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
|
||||
{
|
||||
struct nvme_nvm_id_group *src;
|
||||
struct nvm_id_group *dst;
|
||||
struct nvm_id_group *grp;
|
||||
int sec_per_pg, sec_per_pl, pg_per_blk;
|
||||
|
||||
if (nvme_nvm_id->cgrps != 1)
|
||||
return -EINVAL;
|
||||
|
||||
src = &nvme_nvm_id->groups[0];
|
||||
dst = &nvm_id->grp;
|
||||
grp = &nvm_id->grp;
|
||||
|
||||
dst->mtype = src->mtype;
|
||||
dst->fmtype = src->fmtype;
|
||||
dst->num_ch = src->num_ch;
|
||||
dst->num_lun = src->num_lun;
|
||||
dst->num_pln = src->num_pln;
|
||||
grp->mtype = src->mtype;
|
||||
grp->fmtype = src->fmtype;
|
||||
|
||||
dst->num_pg = le16_to_cpu(src->num_pg);
|
||||
dst->num_blk = le16_to_cpu(src->num_blk);
|
||||
dst->fpg_sz = le16_to_cpu(src->fpg_sz);
|
||||
dst->csecs = le16_to_cpu(src->csecs);
|
||||
dst->sos = le16_to_cpu(src->sos);
|
||||
grp->num_ch = src->num_ch;
|
||||
grp->num_lun = src->num_lun;
|
||||
|
||||
dst->trdt = le32_to_cpu(src->trdt);
|
||||
dst->trdm = le32_to_cpu(src->trdm);
|
||||
dst->tprt = le32_to_cpu(src->tprt);
|
||||
dst->tprm = le32_to_cpu(src->tprm);
|
||||
dst->tbet = le32_to_cpu(src->tbet);
|
||||
dst->tbem = le32_to_cpu(src->tbem);
|
||||
dst->mpos = le32_to_cpu(src->mpos);
|
||||
dst->mccap = le32_to_cpu(src->mccap);
|
||||
grp->num_chk = le16_to_cpu(src->num_chk);
|
||||
grp->csecs = le16_to_cpu(src->csecs);
|
||||
grp->sos = le16_to_cpu(src->sos);
|
||||
|
||||
dst->cpar = le16_to_cpu(src->cpar);
|
||||
pg_per_blk = le16_to_cpu(src->num_pg);
|
||||
sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
|
||||
sec_per_pl = sec_per_pg * src->num_pln;
|
||||
grp->clba = sec_per_pl * pg_per_blk;
|
||||
grp->ws_per_chk = pg_per_blk;
|
||||
|
||||
if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
|
||||
memcpy(dst->lptbl.id, src->lptbl.id, 8);
|
||||
dst->lptbl.mlc.num_pairs =
|
||||
le16_to_cpu(src->lptbl.mlc.num_pairs);
|
||||
grp->mpos = le32_to_cpu(src->mpos);
|
||||
grp->cpar = le16_to_cpu(src->cpar);
|
||||
grp->mccap = le32_to_cpu(src->mccap);
|
||||
|
||||
if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
|
||||
pr_err("nvm: number of MLC pairs not supported\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
grp->ws_opt = grp->ws_min = sec_per_pg;
|
||||
grp->ws_seq = NVM_IO_SNGL_ACCESS;
|
||||
|
||||
memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
|
||||
dst->lptbl.mlc.num_pairs);
|
||||
if (grp->mpos & 0x020202) {
|
||||
grp->ws_seq = NVM_IO_DUAL_ACCESS;
|
||||
grp->ws_opt <<= 1;
|
||||
} else if (grp->mpos & 0x040404) {
|
||||
grp->ws_seq = NVM_IO_QUAD_ACCESS;
|
||||
grp->ws_opt <<= 2;
|
||||
}
|
||||
|
||||
grp->trdt = le32_to_cpu(src->trdt);
|
||||
grp->trdm = le32_to_cpu(src->trdm);
|
||||
grp->tprt = le32_to_cpu(src->tprt);
|
||||
grp->tprm = le32_to_cpu(src->tprm);
|
||||
grp->tbet = le32_to_cpu(src->tbet);
|
||||
grp->tbem = le32_to_cpu(src->tbem);
|
||||
|
||||
/* 1.2 compatibility */
|
||||
grp->num_pln = src->num_pln;
|
||||
grp->num_pg = le16_to_cpu(src->num_pg);
|
||||
grp->fpg_sz = le16_to_cpu(src->fpg_sz);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -332,62 +305,6 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
|
||||
nvm_l2p_update_fn *update_l2p, void *priv)
|
||||
{
|
||||
struct nvme_ns *ns = nvmdev->q->queuedata;
|
||||
struct nvme_nvm_command c = {};
|
||||
u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
|
||||
u32 nlb_pr_rq = len / sizeof(u64);
|
||||
u64 cmd_slba = slba;
|
||||
void *entries;
|
||||
int ret = 0;
|
||||
|
||||
c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
|
||||
c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
entries = kmalloc(len, GFP_KERNEL);
|
||||
if (!entries)
|
||||
return -ENOMEM;
|
||||
|
||||
while (nlb) {
|
||||
u32 cmd_nlb = min(nlb_pr_rq, nlb);
|
||||
u64 elba = slba + cmd_nlb;
|
||||
|
||||
c.l2p.slba = cpu_to_le64(cmd_slba);
|
||||
c.l2p.nlb = cpu_to_le32(cmd_nlb);
|
||||
|
||||
ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
|
||||
(struct nvme_command *)&c, entries, len);
|
||||
if (ret) {
|
||||
dev_err(ns->ctrl->device,
|
||||
"L2P table transfer failed (%d)\n", ret);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (unlikely(elba > nvmdev->total_secs)) {
|
||||
pr_err("nvm: L2P data from device is out of bounds!\n");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Transform physical address to target address space */
|
||||
nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
|
||||
|
||||
if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
|
||||
ret = -EINTR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
cmd_slba += cmd_nlb;
|
||||
nlb -= cmd_nlb;
|
||||
}
|
||||
|
||||
out:
|
||||
kfree(entries);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
|
||||
u8 *blks)
|
||||
{
|
||||
|
@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
|
|||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
struct nvme_nvm_command c = {};
|
||||
struct nvme_nvm_bb_tbl *bb_tbl;
|
||||
int nr_blks = geo->blks_per_lun * geo->plane_mode;
|
||||
int nr_blks = geo->nr_chks * geo->plane_mode;
|
||||
int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
|
||||
int ret = 0;
|
||||
|
||||
|
@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
|
|||
goto out;
|
||||
}
|
||||
|
||||
memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode);
|
||||
memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
|
||||
out:
|
||||
kfree(bb_tbl);
|
||||
return ret;
|
||||
|
@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
|
|||
c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
|
||||
c->ph_rw.control = cpu_to_le16(rqd->flags);
|
||||
c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
|
||||
|
||||
if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
|
||||
c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
|
||||
rqd->bio->bi_iter.bi_sector));
|
||||
}
|
||||
|
||||
static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
|
||||
|
@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr,
|
|||
static struct nvm_dev_ops nvme_nvm_dev_ops = {
|
||||
.identity = nvme_nvm_identity,
|
||||
|
||||
.get_l2p_tbl = nvme_nvm_get_l2p_tbl,
|
||||
|
||||
.get_bb_tbl = nvme_nvm_get_bb_tbl,
|
||||
.set_bb_tbl = nvme_nvm_set_bb_tbl,
|
||||
|
||||
|
@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
|
|||
} else if (strcmp(attr->name, "num_planes") == 0) {
|
||||
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
|
||||
} else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
|
||||
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
|
||||
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
|
||||
} else if (strcmp(attr->name, "num_pages") == 0) {
|
||||
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
|
||||
} else if (strcmp(attr->name, "page_size") == 0) {
|
||||
|
|
|
@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
|
|||
kblockd_schedule_work(&ns->head->requeue_work);
|
||||
}
|
||||
|
||||
bool nvme_req_needs_failover(struct request *req)
|
||||
bool nvme_req_needs_failover(struct request *req, blk_status_t error)
|
||||
{
|
||||
if (!(req->cmd_flags & REQ_NVME_MPATH))
|
||||
return false;
|
||||
|
||||
switch (nvme_req(req)->status & 0x7ff) {
|
||||
/*
|
||||
* Generic command status:
|
||||
*/
|
||||
case NVME_SC_INVALID_OPCODE:
|
||||
case NVME_SC_INVALID_FIELD:
|
||||
case NVME_SC_INVALID_NS:
|
||||
case NVME_SC_LBA_RANGE:
|
||||
case NVME_SC_CAP_EXCEEDED:
|
||||
case NVME_SC_RESERVATION_CONFLICT:
|
||||
return false;
|
||||
|
||||
/*
|
||||
* I/O command set specific error. Unfortunately these values are
|
||||
* reused for fabrics commands, but those should never get here.
|
||||
*/
|
||||
case NVME_SC_BAD_ATTRIBUTES:
|
||||
case NVME_SC_INVALID_PI:
|
||||
case NVME_SC_READ_ONLY:
|
||||
case NVME_SC_ONCS_NOT_SUPPORTED:
|
||||
WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
|
||||
nvme_fabrics_command);
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Media and Data Integrity Errors:
|
||||
*/
|
||||
case NVME_SC_WRITE_FAULT:
|
||||
case NVME_SC_READ_ERROR:
|
||||
case NVME_SC_GUARD_CHECK:
|
||||
case NVME_SC_APPTAG_CHECK:
|
||||
case NVME_SC_REFTAG_CHECK:
|
||||
case NVME_SC_COMPARE_FAILED:
|
||||
case NVME_SC_ACCESS_DENIED:
|
||||
case NVME_SC_UNWRITTEN_BLOCK:
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Everything else could be a path failure, so should be retried */
|
||||
return true;
|
||||
return blk_path_error(error);
|
||||
}
|
||||
|
||||
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
|
||||
|
|
|
@ -32,6 +32,8 @@ extern unsigned int admin_timeout;
|
|||
#define NVME_KATO_GRACE 10
|
||||
|
||||
extern struct workqueue_struct *nvme_wq;
|
||||
extern struct workqueue_struct *nvme_reset_wq;
|
||||
extern struct workqueue_struct *nvme_delete_wq;
|
||||
|
||||
enum {
|
||||
NVME_NS_LBA = 0,
|
||||
|
@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
|
|||
enum nvme_ctrl_state {
|
||||
NVME_CTRL_NEW,
|
||||
NVME_CTRL_LIVE,
|
||||
NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
|
||||
NVME_CTRL_RESETTING,
|
||||
NVME_CTRL_RECONNECTING,
|
||||
NVME_CTRL_DELETING,
|
||||
|
@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
|
|||
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
|
||||
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
|
||||
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
|
||||
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
|
||||
|
||||
|
@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
|
|||
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
void nvme_failover_req(struct request *req);
|
||||
bool nvme_req_needs_failover(struct request *req);
|
||||
bool nvme_req_needs_failover(struct request *req, blk_status_t error);
|
||||
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
|
||||
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
|
||||
void nvme_mpath_add_disk(struct nvme_ns_head *head);
|
||||
|
@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
|
|||
static inline void nvme_failover_req(struct request *req)
|
||||
{
|
||||
}
|
||||
static inline bool nvme_req_needs_failover(struct request *req)
|
||||
static inline bool nvme_req_needs_failover(struct request *req,
|
||||
blk_status_t error)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
|
|||
* Represents an NVM Express device. Each nvme_dev is a PCI function.
|
||||
*/
|
||||
struct nvme_dev {
|
||||
struct nvme_queue **queues;
|
||||
struct nvme_queue *queues;
|
||||
struct blk_mq_tag_set tagset;
|
||||
struct blk_mq_tag_set admin_tagset;
|
||||
u32 __iomem *dbs;
|
||||
|
@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
|||
unsigned int hctx_idx)
|
||||
{
|
||||
struct nvme_dev *dev = data;
|
||||
struct nvme_queue *nvmeq = dev->queues[0];
|
||||
struct nvme_queue *nvmeq = &dev->queues[0];
|
||||
|
||||
WARN_ON(hctx_idx != 0);
|
||||
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
|
||||
|
@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
|||
unsigned int hctx_idx)
|
||||
{
|
||||
struct nvme_dev *dev = data;
|
||||
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
|
||||
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
|
||||
|
||||
if (!nvmeq->tags)
|
||||
nvmeq->tags = &dev->tagset.tags[hctx_idx];
|
||||
|
@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
|
|||
struct nvme_dev *dev = set->driver_data;
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
|
||||
struct nvme_queue *nvmeq = dev->queues[queue_idx];
|
||||
struct nvme_queue *nvmeq = &dev->queues[queue_idx];
|
||||
|
||||
BUG_ON(!nvmeq);
|
||||
iod->nvmeq = nvmeq;
|
||||
|
@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
|
|||
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_dev *dev = to_nvme_dev(ctrl);
|
||||
struct nvme_queue *nvmeq = dev->queues[0];
|
||||
struct nvme_queue *nvmeq = &dev->queues[0];
|
||||
struct nvme_command c;
|
||||
|
||||
memset(&c, 0, sizeof(c));
|
||||
|
@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
|
|||
*/
|
||||
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
|
||||
|
||||
/* If there is a reset ongoing, we shouldn't reset again. */
|
||||
if (dev->ctrl.state == NVME_CTRL_RESETTING)
|
||||
/* If there is a reset/reinit ongoing, we shouldn't reset again. */
|
||||
switch (dev->ctrl.state) {
|
||||
case NVME_CTRL_RESETTING:
|
||||
case NVME_CTRL_RECONNECTING:
|
||||
return false;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* We shouldn't reset unless the controller is on fatal error state
|
||||
* _or_ if we lost the communication with it.
|
||||
|
@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
|
|||
if (nvmeq->sq_cmds)
|
||||
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
|
||||
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
|
||||
kfree(nvmeq);
|
||||
}
|
||||
|
||||
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
|
||||
|
@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
|
|||
int i;
|
||||
|
||||
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
|
||||
struct nvme_queue *nvmeq = dev->queues[i];
|
||||
dev->ctrl.queue_count--;
|
||||
dev->queues[i] = NULL;
|
||||
nvme_free_queue(nvmeq);
|
||||
nvme_free_queue(&dev->queues[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
|
|||
|
||||
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
|
||||
{
|
||||
struct nvme_queue *nvmeq = dev->queues[0];
|
||||
|
||||
if (!nvmeq)
|
||||
return;
|
||||
if (nvme_suspend_queue(nvmeq))
|
||||
return;
|
||||
struct nvme_queue *nvmeq = &dev->queues[0];
|
||||
|
||||
if (shutdown)
|
||||
nvme_shutdown_ctrl(&dev->ctrl);
|
||||
|
@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
|
|||
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
|
||||
int qid, int depth)
|
||||
{
|
||||
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
|
||||
if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
|
||||
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
|
||||
dev->ctrl.page_size);
|
||||
nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
|
||||
|
@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
|
||||
int depth, int node)
|
||||
static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
|
||||
int depth, int node)
|
||||
{
|
||||
struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
|
||||
node);
|
||||
if (!nvmeq)
|
||||
return NULL;
|
||||
struct nvme_queue *nvmeq = &dev->queues[qid];
|
||||
|
||||
if (dev->ctrl.queue_count > qid)
|
||||
return 0;
|
||||
|
||||
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
|
||||
&nvmeq->cq_dma_addr, GFP_KERNEL);
|
||||
|
@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
|
|||
nvmeq->q_depth = depth;
|
||||
nvmeq->qid = qid;
|
||||
nvmeq->cq_vector = -1;
|
||||
dev->queues[qid] = nvmeq;
|
||||
dev->ctrl.queue_count++;
|
||||
|
||||
return nvmeq;
|
||||
return 0;
|
||||
|
||||
free_cqdma:
|
||||
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
|
||||
nvmeq->cq_dma_addr);
|
||||
free_nvmeq:
|
||||
kfree(nvmeq);
|
||||
return NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int queue_request_irq(struct nvme_queue *nvmeq)
|
||||
|
@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
|
|||
if (result < 0)
|
||||
return result;
|
||||
|
||||
nvmeq = dev->queues[0];
|
||||
if (!nvmeq) {
|
||||
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
|
||||
dev_to_node(dev->dev));
|
||||
if (!nvmeq)
|
||||
return -ENOMEM;
|
||||
}
|
||||
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
|
||||
dev_to_node(dev->dev));
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
nvmeq = &dev->queues[0];
|
||||
aqa = nvmeq->q_depth - 1;
|
||||
aqa |= aqa << 16;
|
||||
|
||||
|
@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
|
|||
|
||||
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
|
||||
/* vector == qid - 1, match nvme_create_queue */
|
||||
if (!nvme_alloc_queue(dev, i, dev->q_depth,
|
||||
if (nvme_alloc_queue(dev, i, dev->q_depth,
|
||||
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
|
@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
|
|||
|
||||
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
|
||||
for (i = dev->online_queues; i <= max; i++) {
|
||||
ret = nvme_create_queue(dev->queues[i], i);
|
||||
ret = nvme_create_queue(&dev->queues[i], i);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ignore failing Create SQ/CQ commands, we can continue with less
|
||||
* than the desired aount of queues, and even a controller without
|
||||
* I/O queues an still be used to issue admin commands. This might
|
||||
* than the desired amount of queues, and even a controller without
|
||||
* I/O queues can still be used to issue admin commands. This might
|
||||
* be useful to upgrade a buggy firmware for example.
|
||||
*/
|
||||
return ret >= 0 ? 0 : ret;
|
||||
|
@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev,
|
|||
}
|
||||
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
|
||||
|
||||
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
|
||||
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
|
||||
{
|
||||
u64 szu, size, offset;
|
||||
u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
|
||||
|
||||
return 1ULL << (12 + 4 * szu);
|
||||
}
|
||||
|
||||
static u32 nvme_cmb_size(struct nvme_dev *dev)
|
||||
{
|
||||
return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
|
||||
}
|
||||
|
||||
static void nvme_map_cmb(struct nvme_dev *dev)
|
||||
{
|
||||
u64 size, offset;
|
||||
resource_size_t bar_size;
|
||||
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
||||
void __iomem *cmb;
|
||||
int bar;
|
||||
|
||||
dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
|
||||
if (!(NVME_CMB_SZ(dev->cmbsz)))
|
||||
return NULL;
|
||||
if (!dev->cmbsz)
|
||||
return;
|
||||
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
|
||||
|
||||
if (!use_cmb_sqes)
|
||||
return NULL;
|
||||
return;
|
||||
|
||||
szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
|
||||
size = szu * NVME_CMB_SZ(dev->cmbsz);
|
||||
offset = szu * NVME_CMB_OFST(dev->cmbloc);
|
||||
size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
|
||||
offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
|
||||
bar = NVME_CMB_BIR(dev->cmbloc);
|
||||
bar_size = pci_resource_len(pdev, bar);
|
||||
|
||||
if (offset > bar_size)
|
||||
return NULL;
|
||||
return;
|
||||
|
||||
/*
|
||||
* Controllers may support a CMB size larger than their BAR,
|
||||
|
@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
|
|||
if (size > bar_size - offset)
|
||||
size = bar_size - offset;
|
||||
|
||||
cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
|
||||
if (!cmb)
|
||||
return NULL;
|
||||
|
||||
dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
|
||||
if (!dev->cmb)
|
||||
return;
|
||||
dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
|
||||
dev->cmb_size = size;
|
||||
return cmb;
|
||||
|
||||
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
|
||||
&dev_attr_cmb.attr, NULL))
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to add sysfs attribute for CMB\n");
|
||||
}
|
||||
|
||||
static inline void nvme_release_cmb(struct nvme_dev *dev)
|
||||
|
@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
|
|||
dma_addr_t descs_dma;
|
||||
int i = 0;
|
||||
void **bufs;
|
||||
u64 size = 0, tmp;
|
||||
u64 size, tmp;
|
||||
|
||||
tmp = (preferred + chunk_size - 1);
|
||||
do_div(tmp, chunk_size);
|
||||
|
@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
|
|||
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
|
||||
u64 min = (u64)dev->ctrl.hmmin * 4096;
|
||||
u32 enable_bits = NVME_HOST_MEM_ENABLE;
|
||||
int ret = 0;
|
||||
int ret;
|
||||
|
||||
preferred = min(preferred, max);
|
||||
if (min > max) {
|
||||
|
@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
|
|||
|
||||
static int nvme_setup_io_queues(struct nvme_dev *dev)
|
||||
{
|
||||
struct nvme_queue *adminq = dev->queues[0];
|
||||
struct nvme_queue *adminq = &dev->queues[0];
|
||||
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
||||
int result, nr_io_queues;
|
||||
unsigned long size;
|
||||
|
@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
|
|||
if (nr_io_queues == 0)
|
||||
return 0;
|
||||
|
||||
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
|
||||
if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
|
||||
result = nvme_cmb_qdepth(dev, nr_io_queues,
|
||||
sizeof(struct nvme_command));
|
||||
if (result > 0)
|
||||
|
@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
|
||||
static void nvme_disable_io_queues(struct nvme_dev *dev)
|
||||
{
|
||||
int pass;
|
||||
int pass, queues = dev->online_queues - 1;
|
||||
unsigned long timeout;
|
||||
u8 opcode = nvme_admin_delete_sq;
|
||||
|
||||
|
@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
|
|||
retry:
|
||||
timeout = ADMIN_TIMEOUT;
|
||||
for (; i > 0; i--, sent++)
|
||||
if (nvme_delete_queue(dev->queues[i], opcode))
|
||||
if (nvme_delete_queue(&dev->queues[i], opcode))
|
||||
break;
|
||||
|
||||
while (sent--) {
|
||||
|
@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
|
|||
}
|
||||
|
||||
/*
|
||||
* Return: error value if an error occurred setting up the queues or calling
|
||||
* Identify Device. 0 if these succeeded, even if adding some of the
|
||||
* namespaces failed. At the moment, these failures are silent. TBD which
|
||||
* failures should be reported.
|
||||
* return error value only when tagset allocation failed
|
||||
*/
|
||||
static int nvme_dev_add(struct nvme_dev *dev)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!dev->ctrl.tagset) {
|
||||
dev->tagset.ops = &nvme_mq_ops;
|
||||
dev->tagset.nr_hw_queues = dev->online_queues - 1;
|
||||
|
@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
|
|||
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
dev->tagset.driver_data = dev;
|
||||
|
||||
if (blk_mq_alloc_tag_set(&dev->tagset))
|
||||
return 0;
|
||||
ret = blk_mq_alloc_tag_set(&dev->tagset);
|
||||
if (ret) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"IO queues tagset allocation failed %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
dev->ctrl.tagset = &dev->tagset;
|
||||
|
||||
nvme_dbbuf_set(dev);
|
||||
|
@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
|
|||
"set queue depth=%u\n", dev->q_depth);
|
||||
}
|
||||
|
||||
/*
|
||||
* CMBs can currently only exist on >=1.2 PCIe devices. We only
|
||||
* populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
|
||||
* has no name we can pass NULL as final argument to
|
||||
* sysfs_add_file_to_group.
|
||||
*/
|
||||
|
||||
if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
|
||||
dev->cmb = nvme_map_cmb(dev);
|
||||
if (dev->cmb) {
|
||||
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
|
||||
&dev_attr_cmb.attr, NULL))
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to add sysfs attribute for CMB\n");
|
||||
}
|
||||
}
|
||||
nvme_map_cmb(dev);
|
||||
|
||||
pci_enable_pcie_error_reporting(pdev);
|
||||
pci_save_state(pdev);
|
||||
|
@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
|
|||
|
||||
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
|
||||
{
|
||||
int i, queues;
|
||||
int i;
|
||||
bool dead = true;
|
||||
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
||||
|
||||
|
@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
|
|||
}
|
||||
nvme_stop_queues(&dev->ctrl);
|
||||
|
||||
queues = dev->online_queues - 1;
|
||||
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
|
||||
nvme_suspend_queue(dev->queues[i]);
|
||||
|
||||
if (dead) {
|
||||
/* A device might become IO incapable very soon during
|
||||
* probe, before the admin queue is configured. Thus,
|
||||
* queue_count can be 0 here.
|
||||
*/
|
||||
if (dev->ctrl.queue_count)
|
||||
nvme_suspend_queue(dev->queues[0]);
|
||||
} else {
|
||||
nvme_disable_io_queues(dev, queues);
|
||||
if (!dead) {
|
||||
nvme_disable_io_queues(dev);
|
||||
nvme_disable_admin_queue(dev, shutdown);
|
||||
}
|
||||
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
|
||||
nvme_suspend_queue(&dev->queues[i]);
|
||||
|
||||
nvme_pci_disable(dev);
|
||||
|
||||
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
|
||||
|
@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work)
|
|||
container_of(work, struct nvme_dev, ctrl.reset_work);
|
||||
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
|
||||
int result = -ENODEV;
|
||||
enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
|
||||
|
||||
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
|
||||
goto out;
|
||||
|
@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work)
|
|||
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
|
||||
nvme_dev_disable(dev, false);
|
||||
|
||||
/*
|
||||
* Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
|
||||
* initializing procedure here.
|
||||
*/
|
||||
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to mark controller RECONNECTING\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
result = nvme_pci_enable(dev);
|
||||
if (result)
|
||||
goto out;
|
||||
|
@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work)
|
|||
dev_warn(dev->ctrl.device, "IO queues not created\n");
|
||||
nvme_kill_queues(&dev->ctrl);
|
||||
nvme_remove_namespaces(&dev->ctrl);
|
||||
new_state = NVME_CTRL_ADMIN_ONLY;
|
||||
} else {
|
||||
nvme_start_queues(&dev->ctrl);
|
||||
nvme_wait_freeze(&dev->ctrl);
|
||||
nvme_dev_add(dev);
|
||||
/* hit this only when allocate tagset fails */
|
||||
if (nvme_dev_add(dev))
|
||||
new_state = NVME_CTRL_ADMIN_ONLY;
|
||||
nvme_unfreeze(&dev->ctrl);
|
||||
}
|
||||
|
||||
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
|
||||
dev_warn(dev->ctrl.device, "failed to mark controller live\n");
|
||||
/*
|
||||
* If only admin queue live, keep it to do further investigation or
|
||||
* recovery.
|
||||
*/
|
||||
if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to mark controller state %d\n", new_state);
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
|||
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
|
||||
if (!dev)
|
||||
return -ENOMEM;
|
||||
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
|
||||
GFP_KERNEL, node);
|
||||
|
||||
dev->queues = kcalloc_node(num_possible_cpus() + 1,
|
||||
sizeof(struct nvme_queue), GFP_KERNEL, node);
|
||||
if (!dev->queues)
|
||||
goto free;
|
||||
|
||||
|
@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
|||
if (result)
|
||||
goto release_pools;
|
||||
|
||||
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
|
||||
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
|
||||
|
||||
queue_work(nvme_wq, &dev->ctrl.reset_work);
|
||||
nvme_reset_ctrl(&dev->ctrl);
|
||||
|
||||
return 0;
|
||||
|
||||
release_pools:
|
||||
|
@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev)
|
|||
static void nvme_reset_done(struct pci_dev *pdev)
|
||||
{
|
||||
struct nvme_dev *dev = pci_get_drvdata(pdev);
|
||||
nvme_reset_ctrl(&dev->ctrl);
|
||||
nvme_reset_ctrl_sync(&dev->ctrl);
|
||||
}
|
||||
|
||||
static void nvme_shutdown(struct pci_dev *pdev)
|
||||
|
|
|
@ -66,7 +66,6 @@ struct nvme_rdma_request {
|
|||
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
|
||||
u32 num_sge;
|
||||
int nents;
|
||||
bool inline_data;
|
||||
struct ib_reg_wr reg_wr;
|
||||
struct ib_cqe reg_cqe;
|
||||
struct nvme_rdma_queue *queue;
|
||||
|
@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
|
|||
sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
|
||||
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
|
||||
|
||||
req->inline_data = true;
|
||||
req->num_sge++;
|
||||
return 0;
|
||||
}
|
||||
|
@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
|
|||
int count, ret;
|
||||
|
||||
req->num_sge = 1;
|
||||
req->inline_data = false;
|
||||
refcount_set(&req->ref, 2); /* send and recv completions */
|
||||
|
||||
c->common.flags |= NVME_CMD_SGL_METABUF;
|
||||
|
@ -2018,6 +2015,7 @@ out_free_ctrl:
|
|||
|
||||
static struct nvmf_transport_ops nvme_rdma_transport = {
|
||||
.name = "rdma",
|
||||
.module = THIS_MODULE,
|
||||
.required_opts = NVMF_OPT_TRADDR,
|
||||
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
|
||||
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
|
||||
|
@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
|
|||
}
|
||||
mutex_unlock(&nvme_rdma_ctrl_mutex);
|
||||
|
||||
flush_workqueue(nvme_wq);
|
||||
flush_workqueue(nvme_delete_wq);
|
||||
}
|
||||
|
||||
static struct ib_client nvme_rdma_ib_client = {
|
||||
|
|
|
@ -0,0 +1,130 @@
|
|||
/*
|
||||
* NVM Express device driver tracepoints
|
||||
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
#include "trace.h"
|
||||
|
||||
static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u16 sqid = get_unaligned_le16(cdw10);
|
||||
u16 qsize = get_unaligned_le16(cdw10 + 2);
|
||||
u16 sq_flags = get_unaligned_le16(cdw10 + 4);
|
||||
u16 cqid = get_unaligned_le16(cdw10 + 6);
|
||||
|
||||
|
||||
trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u",
|
||||
sqid, qsize, sq_flags, cqid);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u16 cqid = get_unaligned_le16(cdw10);
|
||||
u16 qsize = get_unaligned_le16(cdw10 + 2);
|
||||
u16 cq_flags = get_unaligned_le16(cdw10 + 4);
|
||||
u16 irq_vector = get_unaligned_le16(cdw10 + 6);
|
||||
|
||||
trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u",
|
||||
cqid, qsize, cq_flags, irq_vector);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u8 cns = cdw10[0];
|
||||
u16 ctrlid = get_unaligned_le16(cdw10 + 2);
|
||||
|
||||
trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u64 slba = get_unaligned_le64(cdw10);
|
||||
u16 length = get_unaligned_le16(cdw10 + 8);
|
||||
u16 control = get_unaligned_le16(cdw10 + 10);
|
||||
u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
|
||||
u32 reftag = get_unaligned_le32(cdw10 + 16);
|
||||
|
||||
trace_seq_printf(p,
|
||||
"slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
|
||||
slba, length, control, dsmgmt, reftag);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
|
||||
trace_seq_printf(p, "nr=%u, attributes=%u",
|
||||
get_unaligned_le32(cdw10),
|
||||
get_unaligned_le32(cdw10 + 4));
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
|
||||
trace_seq_printf(p, "cdw10=%*ph", 24, cdw10);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
|
||||
u8 opcode, u8 *cdw10)
|
||||
{
|
||||
switch (opcode) {
|
||||
case nvme_admin_create_sq:
|
||||
return nvme_trace_create_sq(p, cdw10);
|
||||
case nvme_admin_create_cq:
|
||||
return nvme_trace_create_cq(p, cdw10);
|
||||
case nvme_admin_identify:
|
||||
return nvme_trace_admin_identify(p, cdw10);
|
||||
default:
|
||||
return nvme_trace_common(p, cdw10);
|
||||
}
|
||||
}
|
||||
|
||||
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
|
||||
u8 opcode, u8 *cdw10)
|
||||
{
|
||||
switch (opcode) {
|
||||
case nvme_cmd_read:
|
||||
case nvme_cmd_write:
|
||||
case nvme_cmd_write_zeroes:
|
||||
return nvme_trace_read_write(p, cdw10);
|
||||
case nvme_cmd_dsm:
|
||||
return nvme_trace_dsm(p, cdw10);
|
||||
default:
|
||||
return nvme_trace_common(p, cdw10);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
/*
|
||||
* NVM Express device driver tracepoints
|
||||
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM nvme
|
||||
|
||||
#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _TRACE_NVME_H
|
||||
|
||||
#include <linux/nvme.h>
|
||||
#include <linux/tracepoint.h>
|
||||
#include <linux/trace_seq.h>
|
||||
|
||||
#include "nvme.h"
|
||||
|
||||
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
|
||||
#define show_admin_opcode_name(val) \
|
||||
__print_symbolic(val, \
|
||||
nvme_admin_opcode_name(nvme_admin_delete_sq), \
|
||||
nvme_admin_opcode_name(nvme_admin_create_sq), \
|
||||
nvme_admin_opcode_name(nvme_admin_get_log_page), \
|
||||
nvme_admin_opcode_name(nvme_admin_delete_cq), \
|
||||
nvme_admin_opcode_name(nvme_admin_create_cq), \
|
||||
nvme_admin_opcode_name(nvme_admin_identify), \
|
||||
nvme_admin_opcode_name(nvme_admin_abort_cmd), \
|
||||
nvme_admin_opcode_name(nvme_admin_set_features), \
|
||||
nvme_admin_opcode_name(nvme_admin_get_features), \
|
||||
nvme_admin_opcode_name(nvme_admin_async_event), \
|
||||
nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
|
||||
nvme_admin_opcode_name(nvme_admin_activate_fw), \
|
||||
nvme_admin_opcode_name(nvme_admin_download_fw), \
|
||||
nvme_admin_opcode_name(nvme_admin_ns_attach), \
|
||||
nvme_admin_opcode_name(nvme_admin_keep_alive), \
|
||||
nvme_admin_opcode_name(nvme_admin_directive_send), \
|
||||
nvme_admin_opcode_name(nvme_admin_directive_recv), \
|
||||
nvme_admin_opcode_name(nvme_admin_dbbuf), \
|
||||
nvme_admin_opcode_name(nvme_admin_format_nvm), \
|
||||
nvme_admin_opcode_name(nvme_admin_security_send), \
|
||||
nvme_admin_opcode_name(nvme_admin_security_recv), \
|
||||
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
|
||||
|
||||
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
|
||||
u8 *cdw10);
|
||||
#define __parse_nvme_admin_cmd(opcode, cdw10) \
|
||||
nvme_trace_parse_admin_cmd(p, opcode, cdw10)
|
||||
|
||||
#define nvme_opcode_name(opcode) { opcode, #opcode }
|
||||
#define show_opcode_name(val) \
|
||||
__print_symbolic(val, \
|
||||
nvme_opcode_name(nvme_cmd_flush), \
|
||||
nvme_opcode_name(nvme_cmd_write), \
|
||||
nvme_opcode_name(nvme_cmd_read), \
|
||||
nvme_opcode_name(nvme_cmd_write_uncor), \
|
||||
nvme_opcode_name(nvme_cmd_compare), \
|
||||
nvme_opcode_name(nvme_cmd_write_zeroes), \
|
||||
nvme_opcode_name(nvme_cmd_dsm), \
|
||||
nvme_opcode_name(nvme_cmd_resv_register), \
|
||||
nvme_opcode_name(nvme_cmd_resv_report), \
|
||||
nvme_opcode_name(nvme_cmd_resv_acquire), \
|
||||
nvme_opcode_name(nvme_cmd_resv_release))
|
||||
|
||||
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
|
||||
u8 *cdw10);
|
||||
#define __parse_nvme_cmd(opcode, cdw10) \
|
||||
nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
|
||||
|
||||
TRACE_EVENT(nvme_setup_admin_cmd,
|
||||
TP_PROTO(struct nvme_command *cmd),
|
||||
TP_ARGS(cmd),
|
||||
TP_STRUCT__entry(
|
||||
__field(u8, opcode)
|
||||
__field(u8, flags)
|
||||
__field(u16, cid)
|
||||
__field(u64, metadata)
|
||||
__array(u8, cdw10, 24)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->opcode = cmd->common.opcode;
|
||||
__entry->flags = cmd->common.flags;
|
||||
__entry->cid = cmd->common.command_id;
|
||||
__entry->metadata = le64_to_cpu(cmd->common.metadata);
|
||||
memcpy(__entry->cdw10, cmd->common.cdw10,
|
||||
sizeof(__entry->cdw10));
|
||||
),
|
||||
TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
|
||||
__entry->cid, __entry->flags, __entry->metadata,
|
||||
show_admin_opcode_name(__entry->opcode),
|
||||
__parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
|
||||
);
|
||||
|
||||
|
||||
TRACE_EVENT(nvme_setup_nvm_cmd,
|
||||
TP_PROTO(int qid, struct nvme_command *cmd),
|
||||
TP_ARGS(qid, cmd),
|
||||
TP_STRUCT__entry(
|
||||
__field(int, qid)
|
||||
__field(u8, opcode)
|
||||
__field(u8, flags)
|
||||
__field(u16, cid)
|
||||
__field(u32, nsid)
|
||||
__field(u64, metadata)
|
||||
__array(u8, cdw10, 24)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->qid = qid;
|
||||
__entry->opcode = cmd->common.opcode;
|
||||
__entry->flags = cmd->common.flags;
|
||||
__entry->cid = cmd->common.command_id;
|
||||
__entry->nsid = le32_to_cpu(cmd->common.nsid);
|
||||
__entry->metadata = le64_to_cpu(cmd->common.metadata);
|
||||
memcpy(__entry->cdw10, cmd->common.cdw10,
|
||||
sizeof(__entry->cdw10));
|
||||
),
|
||||
TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
|
||||
__entry->qid, __entry->nsid, __entry->cid,
|
||||
__entry->flags, __entry->metadata,
|
||||
show_opcode_name(__entry->opcode),
|
||||
__parse_nvme_cmd(__entry->opcode, __entry->cdw10))
|
||||
);
|
||||
|
||||
TRACE_EVENT(nvme_complete_rq,
|
||||
TP_PROTO(struct request *req),
|
||||
TP_ARGS(req),
|
||||
TP_STRUCT__entry(
|
||||
__field(int, qid)
|
||||
__field(int, cid)
|
||||
__field(u64, result)
|
||||
__field(u8, retries)
|
||||
__field(u8, flags)
|
||||
__field(u16, status)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->qid = req->q->id;
|
||||
__entry->cid = req->tag;
|
||||
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
__entry->retries = nvme_req(req)->retries;
|
||||
__entry->flags = nvme_req(req)->flags;
|
||||
__entry->status = nvme_req(req)->status;
|
||||
),
|
||||
TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
|
||||
__entry->cid, __entry->qid, __entry->result,
|
||||
__entry->retries, __entry->flags, __entry->status)
|
||||
|
||||
);
|
||||
|
||||
#endif /* _TRACE_NVME_H */
|
||||
|
||||
#undef TRACE_INCLUDE_PATH
|
||||
#define TRACE_INCLUDE_PATH .
|
||||
#undef TRACE_INCLUDE_FILE
|
||||
#define TRACE_INCLUDE_FILE trace
|
||||
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
|
@ -29,6 +29,7 @@ config NVME_TARGET_RDMA
|
|||
tristate "NVMe over Fabrics RDMA target support"
|
||||
depends on INFINIBAND
|
||||
depends on NVME_TARGET
|
||||
select SGL_ALLOC
|
||||
help
|
||||
This enables the NVMe RDMA target support, which allows exporting NVMe
|
||||
devices over RDMA.
|
||||
|
@ -39,6 +40,7 @@ config NVME_TARGET_FC
|
|||
tristate "NVMe over Fabrics FC target driver"
|
||||
depends on NVME_TARGET
|
||||
depends on HAS_DMA
|
||||
select SGL_ALLOC
|
||||
help
|
||||
This enables the NVMe FC target support, which allows exporting NVMe
|
||||
devices over FC.
|
||||
|
|
|
@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
|
|||
req->sg_cnt = 0;
|
||||
req->transfer_len = 0;
|
||||
req->rsp->status = 0;
|
||||
req->ns = NULL;
|
||||
|
||||
/* no support for fused commands yet */
|
||||
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
|
||||
|
@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
|
|||
void nvmet_req_uninit(struct nvmet_req *req)
|
||||
{
|
||||
percpu_ref_put(&req->sq->ref);
|
||||
if (req->ns)
|
||||
nvmet_put_namespace(req->ns);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_req_uninit);
|
||||
|
||||
|
@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
|
|||
/* Don't accept keep-alive timeout for discovery controllers */
|
||||
if (kato) {
|
||||
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
|
||||
goto out_free_sqs;
|
||||
goto out_remove_ida;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
|
|||
*ctrlp = ctrl;
|
||||
return 0;
|
||||
|
||||
out_remove_ida:
|
||||
ida_simple_remove(&cntlid_ida, ctrl->cntlid);
|
||||
out_free_sqs:
|
||||
kfree(ctrl->sqs);
|
||||
out_free_cqs:
|
||||
|
@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref)
|
|||
struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
|
||||
struct nvmet_subsys *subsys = ctrl->subsys;
|
||||
|
||||
nvmet_stop_keep_alive_timer(ctrl);
|
||||
|
||||
mutex_lock(&subsys->lock);
|
||||
list_del(&ctrl->subsys_entry);
|
||||
mutex_unlock(&subsys->lock);
|
||||
|
||||
nvmet_stop_keep_alive_timer(ctrl);
|
||||
|
||||
flush_work(&ctrl->async_event_work);
|
||||
cancel_work_sync(&ctrl->fatal_err_work);
|
||||
|
||||
ida_simple_remove(&cntlid_ida, ctrl->cntlid);
|
||||
nvmet_subsys_put(subsys);
|
||||
|
||||
kfree(ctrl->sqs);
|
||||
kfree(ctrl->cqs);
|
||||
kfree(ctrl);
|
||||
|
||||
nvmet_subsys_put(subsys);
|
||||
}
|
||||
|
||||
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
|
||||
|
|
|
@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
|
|||
goto out_ctrl_put;
|
||||
}
|
||||
|
||||
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
|
||||
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
|
||||
|
||||
out:
|
||||
kfree(d);
|
||||
|
|
|
@ -1697,31 +1697,12 @@ static int
|
|||
nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
struct page *page;
|
||||
unsigned int nent;
|
||||
u32 page_len, length;
|
||||
int i = 0;
|
||||
|
||||
length = fod->req.transfer_len;
|
||||
nent = DIV_ROUND_UP(length, PAGE_SIZE);
|
||||
sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
|
||||
sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent);
|
||||
if (!sg)
|
||||
goto out;
|
||||
|
||||
sg_init_table(sg, nent);
|
||||
|
||||
while (length) {
|
||||
page_len = min_t(u32, length, PAGE_SIZE);
|
||||
|
||||
page = alloc_page(GFP_KERNEL);
|
||||
if (!page)
|
||||
goto out_free_pages;
|
||||
|
||||
sg_set_page(&sg[i], page, page_len, 0);
|
||||
length -= page_len;
|
||||
i++;
|
||||
}
|
||||
|
||||
fod->data_sg = sg;
|
||||
fod->data_sg_cnt = nent;
|
||||
fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
|
||||
|
@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
|
|||
|
||||
return 0;
|
||||
|
||||
out_free_pages:
|
||||
while (i > 0) {
|
||||
i--;
|
||||
__free_page(sg_page(&sg[i]));
|
||||
}
|
||||
kfree(sg);
|
||||
fod->data_sg = NULL;
|
||||
fod->data_sg_cnt = 0;
|
||||
out:
|
||||
return NVME_SC_INTERNAL;
|
||||
}
|
||||
|
@ -1746,18 +1719,13 @@ out:
|
|||
static void
|
||||
nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
int count;
|
||||
|
||||
if (!fod->data_sg || !fod->data_sg_cnt)
|
||||
return;
|
||||
|
||||
fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
|
||||
((fod->io_dir == NVMET_FCP_WRITE) ?
|
||||
DMA_FROM_DEVICE : DMA_TO_DEVICE));
|
||||
for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
|
||||
__free_page(sg_page(sg));
|
||||
kfree(fod->data_sg);
|
||||
sgl_free(fod->data_sg);
|
||||
fod->data_sg = NULL;
|
||||
fod->data_sg_cnt = 0;
|
||||
}
|
||||
|
@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
|
|||
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
|
||||
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
|
||||
(tgtport->fc_target_port.port_name == traddr.pn)) {
|
||||
/* a FC port can only be 1 nvmet port id */
|
||||
if (!tgtport->port) {
|
||||
tgtport->port = port;
|
||||
port->priv = tgtport;
|
||||
nvmet_fc_tgtport_get(tgtport);
|
||||
ret = 0;
|
||||
} else
|
||||
ret = -EALREADY;
|
||||
tgtport->port = port;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
|
|||
static void
|
||||
nvmet_fc_remove_port(struct nvmet_port *port)
|
||||
{
|
||||
struct nvmet_fc_tgtport *tgtport = port->priv;
|
||||
unsigned long flags;
|
||||
bool matched = false;
|
||||
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
if (tgtport->port == port) {
|
||||
matched = true;
|
||||
tgtport->port = NULL;
|
||||
}
|
||||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
|
||||
if (matched)
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
/* nothing to do */
|
||||
}
|
||||
|
||||
static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
|
||||
|
|
|
@ -204,6 +204,10 @@ struct fcloop_lport {
|
|||
struct completion unreg_done;
|
||||
};
|
||||
|
||||
struct fcloop_lport_priv {
|
||||
struct fcloop_lport *lport;
|
||||
};
|
||||
|
||||
struct fcloop_rport {
|
||||
struct nvme_fc_remote_port *remoteport;
|
||||
struct nvmet_fc_target_port *targetport;
|
||||
|
@ -238,21 +242,32 @@ struct fcloop_lsreq {
|
|||
int status;
|
||||
};
|
||||
|
||||
enum {
|
||||
INI_IO_START = 0,
|
||||
INI_IO_ACTIVE = 1,
|
||||
INI_IO_ABORTED = 2,
|
||||
INI_IO_COMPLETED = 3,
|
||||
};
|
||||
|
||||
struct fcloop_fcpreq {
|
||||
struct fcloop_tport *tport;
|
||||
struct nvmefc_fcp_req *fcpreq;
|
||||
spinlock_t reqlock;
|
||||
u16 status;
|
||||
u32 inistate;
|
||||
bool active;
|
||||
bool aborted;
|
||||
struct work_struct work;
|
||||
struct kref ref;
|
||||
struct work_struct fcp_rcv_work;
|
||||
struct work_struct abort_rcv_work;
|
||||
struct work_struct tio_done_work;
|
||||
struct nvmefc_tgt_fcp_req tgt_fcp_req;
|
||||
};
|
||||
|
||||
struct fcloop_ini_fcpreq {
|
||||
struct nvmefc_fcp_req *fcpreq;
|
||||
struct fcloop_fcpreq *tfcp_req;
|
||||
struct work_struct iniwork;
|
||||
spinlock_t inilock;
|
||||
};
|
||||
|
||||
static inline struct fcloop_lsreq *
|
||||
|
@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* FCP IO operation done by initiator abort.
|
||||
* call back up initiator "done" flows.
|
||||
*/
|
||||
static void
|
||||
fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
|
||||
fcloop_tfcp_req_free(struct kref *ref)
|
||||
{
|
||||
struct fcloop_ini_fcpreq *inireq =
|
||||
container_of(work, struct fcloop_ini_fcpreq, iniwork);
|
||||
struct fcloop_fcpreq *tfcp_req =
|
||||
container_of(ref, struct fcloop_fcpreq, ref);
|
||||
|
||||
inireq->fcpreq->done(inireq->fcpreq);
|
||||
kfree(tfcp_req);
|
||||
}
|
||||
|
||||
static void
|
||||
fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
|
||||
{
|
||||
kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
|
||||
}
|
||||
|
||||
static int
|
||||
fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
|
||||
{
|
||||
return kref_get_unless_zero(&tfcp_req->ref);
|
||||
}
|
||||
|
||||
static void
|
||||
fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
|
||||
struct fcloop_fcpreq *tfcp_req, int status)
|
||||
{
|
||||
struct fcloop_ini_fcpreq *inireq = NULL;
|
||||
|
||||
if (fcpreq) {
|
||||
inireq = fcpreq->private;
|
||||
spin_lock(&inireq->inilock);
|
||||
inireq->tfcp_req = NULL;
|
||||
spin_unlock(&inireq->inilock);
|
||||
|
||||
fcpreq->status = status;
|
||||
fcpreq->done(fcpreq);
|
||||
}
|
||||
|
||||
/* release original io reference on tgt struct */
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
}
|
||||
|
||||
static void
|
||||
fcloop_fcp_recv_work(struct work_struct *work)
|
||||
{
|
||||
struct fcloop_fcpreq *tfcp_req =
|
||||
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
|
||||
struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
|
||||
int ret = 0;
|
||||
bool aborted = false;
|
||||
|
||||
spin_lock(&tfcp_req->reqlock);
|
||||
switch (tfcp_req->inistate) {
|
||||
case INI_IO_START:
|
||||
tfcp_req->inistate = INI_IO_ACTIVE;
|
||||
break;
|
||||
case INI_IO_ABORTED:
|
||||
aborted = true;
|
||||
break;
|
||||
default:
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
WARN_ON(1);
|
||||
return;
|
||||
}
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
|
||||
if (unlikely(aborted))
|
||||
ret = -ECANCELED;
|
||||
else
|
||||
ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
|
||||
&tfcp_req->tgt_fcp_req,
|
||||
fcpreq->cmdaddr, fcpreq->cmdlen);
|
||||
if (ret)
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, ret);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static void
|
||||
fcloop_fcp_abort_recv_work(struct work_struct *work)
|
||||
{
|
||||
struct fcloop_fcpreq *tfcp_req =
|
||||
container_of(work, struct fcloop_fcpreq, abort_rcv_work);
|
||||
struct nvmefc_fcp_req *fcpreq;
|
||||
bool completed = false;
|
||||
|
||||
spin_lock(&tfcp_req->reqlock);
|
||||
fcpreq = tfcp_req->fcpreq;
|
||||
switch (tfcp_req->inistate) {
|
||||
case INI_IO_ABORTED:
|
||||
break;
|
||||
case INI_IO_COMPLETED:
|
||||
completed = true;
|
||||
break;
|
||||
default:
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
WARN_ON(1);
|
||||
return;
|
||||
}
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
|
||||
if (unlikely(completed)) {
|
||||
/* remove reference taken in original abort downcall */
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
return;
|
||||
}
|
||||
|
||||
if (tfcp_req->tport->targetport)
|
||||
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
|
||||
&tfcp_req->tgt_fcp_req);
|
||||
|
||||
spin_lock(&tfcp_req->reqlock);
|
||||
tfcp_req->fcpreq = NULL;
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
|
||||
/* call_host_done releases reference for abort downcall */
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -364,20 +484,15 @@ static void
|
|||
fcloop_tgt_fcprqst_done_work(struct work_struct *work)
|
||||
{
|
||||
struct fcloop_fcpreq *tfcp_req =
|
||||
container_of(work, struct fcloop_fcpreq, work);
|
||||
struct fcloop_tport *tport = tfcp_req->tport;
|
||||
container_of(work, struct fcloop_fcpreq, tio_done_work);
|
||||
struct nvmefc_fcp_req *fcpreq;
|
||||
|
||||
spin_lock(&tfcp_req->reqlock);
|
||||
fcpreq = tfcp_req->fcpreq;
|
||||
tfcp_req->inistate = INI_IO_COMPLETED;
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
|
||||
if (tport->remoteport && fcpreq) {
|
||||
fcpreq->status = tfcp_req->status;
|
||||
fcpreq->done(fcpreq);
|
||||
}
|
||||
|
||||
kfree(tfcp_req);
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
|
||||
}
|
||||
|
||||
|
||||
|
@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
|
|||
struct fcloop_rport *rport = remoteport->private;
|
||||
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
|
||||
struct fcloop_fcpreq *tfcp_req;
|
||||
int ret = 0;
|
||||
|
||||
if (!rport->targetport)
|
||||
return -ECONNREFUSED;
|
||||
|
@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
|
|||
|
||||
inireq->fcpreq = fcpreq;
|
||||
inireq->tfcp_req = tfcp_req;
|
||||
INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
|
||||
spin_lock_init(&inireq->inilock);
|
||||
|
||||
tfcp_req->fcpreq = fcpreq;
|
||||
tfcp_req->tport = rport->targetport->private;
|
||||
tfcp_req->inistate = INI_IO_START;
|
||||
spin_lock_init(&tfcp_req->reqlock);
|
||||
INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
|
||||
INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
|
||||
INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
|
||||
INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
|
||||
kref_init(&tfcp_req->ref);
|
||||
|
||||
ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
|
||||
fcpreq->cmdaddr, fcpreq->cmdlen);
|
||||
schedule_work(&tfcp_req->fcp_rcv_work);
|
||||
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
|
|||
{
|
||||
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
|
||||
|
||||
schedule_work(&tfcp_req->work);
|
||||
schedule_work(&tfcp_req->tio_done_work);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
|
|||
void *hw_queue_handle,
|
||||
struct nvmefc_fcp_req *fcpreq)
|
||||
{
|
||||
struct fcloop_rport *rport = remoteport->private;
|
||||
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
|
||||
struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
|
||||
struct fcloop_fcpreq *tfcp_req;
|
||||
bool abortio = true;
|
||||
|
||||
spin_lock(&inireq->inilock);
|
||||
tfcp_req = inireq->tfcp_req;
|
||||
if (tfcp_req)
|
||||
fcloop_tfcp_req_get(tfcp_req);
|
||||
spin_unlock(&inireq->inilock);
|
||||
|
||||
if (!tfcp_req)
|
||||
/* abort has already been called */
|
||||
return;
|
||||
|
||||
if (rport->targetport)
|
||||
nvmet_fc_rcv_fcp_abort(rport->targetport,
|
||||
&tfcp_req->tgt_fcp_req);
|
||||
|
||||
/* break initiator/target relationship for io */
|
||||
spin_lock(&tfcp_req->reqlock);
|
||||
inireq->tfcp_req = NULL;
|
||||
tfcp_req->fcpreq = NULL;
|
||||
switch (tfcp_req->inistate) {
|
||||
case INI_IO_START:
|
||||
case INI_IO_ACTIVE:
|
||||
tfcp_req->inistate = INI_IO_ABORTED;
|
||||
break;
|
||||
case INI_IO_COMPLETED:
|
||||
abortio = false;
|
||||
break;
|
||||
default:
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
WARN_ON(1);
|
||||
return;
|
||||
}
|
||||
spin_unlock(&tfcp_req->reqlock);
|
||||
|
||||
/* post the aborted io completion */
|
||||
fcpreq->status = -ECANCELED;
|
||||
schedule_work(&inireq->iniwork);
|
||||
if (abortio)
|
||||
/* leave the reference while the work item is scheduled */
|
||||
WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
|
||||
else {
|
||||
/*
|
||||
* as the io has already had the done callback made,
|
||||
* nothing more to do. So release the reference taken above
|
||||
*/
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
|
|||
static void
|
||||
fcloop_localport_delete(struct nvme_fc_local_port *localport)
|
||||
{
|
||||
struct fcloop_lport *lport = localport->private;
|
||||
struct fcloop_lport_priv *lport_priv = localport->private;
|
||||
struct fcloop_lport *lport = lport_priv->lport;
|
||||
|
||||
/* release any threads waiting for the unreg to complete */
|
||||
complete(&lport->unreg_done);
|
||||
|
@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = {
|
|||
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
|
||||
.dma_boundary = FCLOOP_DMABOUND_4G,
|
||||
/* sizes of additional private data for data structures */
|
||||
.local_priv_sz = sizeof(struct fcloop_lport),
|
||||
.local_priv_sz = sizeof(struct fcloop_lport_priv),
|
||||
.remote_priv_sz = sizeof(struct fcloop_rport),
|
||||
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
|
||||
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
|
||||
|
@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = {
|
|||
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
|
||||
.dma_boundary = FCLOOP_DMABOUND_4G,
|
||||
/* optional features */
|
||||
.target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
|
||||
NVMET_FCTGTFEAT_OPDONE_IN_ISR,
|
||||
.target_features = 0,
|
||||
/* sizes of additional private data for data structures */
|
||||
.target_priv_sz = sizeof(struct fcloop_tport),
|
||||
};
|
||||
|
@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
|
|||
struct fcloop_ctrl_options *opts;
|
||||
struct nvme_fc_local_port *localport;
|
||||
struct fcloop_lport *lport;
|
||||
int ret;
|
||||
struct fcloop_lport_priv *lport_priv;
|
||||
unsigned long flags;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
lport = kzalloc(sizeof(*lport), GFP_KERNEL);
|
||||
if (!lport)
|
||||
return -ENOMEM;
|
||||
|
||||
opts = kzalloc(sizeof(*opts), GFP_KERNEL);
|
||||
if (!opts)
|
||||
return -ENOMEM;
|
||||
goto out_free_lport;
|
||||
|
||||
ret = fcloop_parse_options(opts, buf);
|
||||
if (ret)
|
||||
|
@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
|
|||
|
||||
ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
|
||||
if (!ret) {
|
||||
unsigned long flags;
|
||||
|
||||
/* success */
|
||||
lport = localport->private;
|
||||
lport_priv = localport->private;
|
||||
lport_priv->lport = lport;
|
||||
|
||||
lport->localport = localport;
|
||||
INIT_LIST_HEAD(&lport->lport_list);
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
list_add_tail(&lport->lport_list, &fcloop_lports);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
/* mark all of the input buffer consumed */
|
||||
ret = count;
|
||||
}
|
||||
|
||||
out_free_opts:
|
||||
kfree(opts);
|
||||
out_free_lport:
|
||||
/* free only if we're going to fail */
|
||||
if (ret)
|
||||
kfree(lport);
|
||||
|
||||
return ret ? ret : count;
|
||||
}
|
||||
|
||||
|
@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
|
|||
|
||||
wait_for_completion(&lport->unreg_done);
|
||||
|
||||
kfree(lport);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
|
|||
|
||||
static struct nvmf_transport_ops nvme_loop_transport = {
|
||||
.name = "loop",
|
||||
.module = THIS_MODULE,
|
||||
.create_ctrl = nvme_loop_create_ctrl,
|
||||
};
|
||||
|
||||
|
@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void)
|
|||
nvme_delete_ctrl(&ctrl->ctrl);
|
||||
mutex_unlock(&nvme_loop_ctrl_mutex);
|
||||
|
||||
flush_workqueue(nvme_wq);
|
||||
flush_workqueue(nvme_delete_wq);
|
||||
}
|
||||
|
||||
module_init(nvme_loop_init_module);
|
||||
|
|
|
@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
|
|||
spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
|
||||
}
|
||||
|
||||
static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
int count;
|
||||
|
||||
if (!sgl || !nents)
|
||||
return;
|
||||
|
||||
for_each_sg(sgl, sg, nents, count)
|
||||
__free_page(sg_page(sg));
|
||||
kfree(sgl);
|
||||
}
|
||||
|
||||
static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
|
||||
u32 length)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
struct page *page;
|
||||
unsigned int nent;
|
||||
int i = 0;
|
||||
|
||||
nent = DIV_ROUND_UP(length, PAGE_SIZE);
|
||||
sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
|
||||
if (!sg)
|
||||
goto out;
|
||||
|
||||
sg_init_table(sg, nent);
|
||||
|
||||
while (length) {
|
||||
u32 page_len = min_t(u32, length, PAGE_SIZE);
|
||||
|
||||
page = alloc_page(GFP_KERNEL);
|
||||
if (!page)
|
||||
goto out_free_pages;
|
||||
|
||||
sg_set_page(&sg[i], page, page_len, 0);
|
||||
length -= page_len;
|
||||
i++;
|
||||
}
|
||||
*sgl = sg;
|
||||
*nents = nent;
|
||||
return 0;
|
||||
|
||||
out_free_pages:
|
||||
while (i > 0) {
|
||||
i--;
|
||||
__free_page(sg_page(&sg[i]));
|
||||
}
|
||||
kfree(sg);
|
||||
out:
|
||||
return NVME_SC_INTERNAL;
|
||||
}
|
||||
|
||||
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
|
||||
struct nvmet_rdma_cmd *c, bool admin)
|
||||
{
|
||||
|
@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
|
|||
}
|
||||
|
||||
if (rsp->req.sg != &rsp->cmd->inline_sg)
|
||||
nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
|
||||
sgl_free(rsp->req.sg);
|
||||
|
||||
if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
|
||||
nvmet_rdma_process_wr_wait_list(queue);
|
||||
|
@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
|
|||
u32 len = get_unaligned_le24(sgl->length);
|
||||
u32 key = get_unaligned_le32(sgl->key);
|
||||
int ret;
|
||||
u16 status;
|
||||
|
||||
/* no data command? */
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
|
||||
len);
|
||||
if (status)
|
||||
return status;
|
||||
rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
|
||||
if (!rsp->req.sg)
|
||||
return NVME_SC_INTERNAL;
|
||||
|
||||
ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
|
||||
rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
|
||||
|
@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
|
|||
|
||||
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
|
||||
{
|
||||
pr_info("freeing queue %d\n", queue->idx);
|
||||
pr_debug("freeing queue %d\n", queue->idx);
|
||||
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
|
||||
|
@ -1558,25 +1503,9 @@ err_ib_client:
|
|||
|
||||
static void __exit nvmet_rdma_exit(void)
|
||||
{
|
||||
struct nvmet_rdma_queue *queue;
|
||||
|
||||
nvmet_unregister_transport(&nvmet_rdma_ops);
|
||||
|
||||
flush_scheduled_work();
|
||||
|
||||
mutex_lock(&nvmet_rdma_queue_mutex);
|
||||
while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
|
||||
struct nvmet_rdma_queue, queue_list))) {
|
||||
list_del_init(&queue->queue_list);
|
||||
|
||||
mutex_unlock(&nvmet_rdma_queue_mutex);
|
||||
__nvmet_rdma_queue_disconnect(queue);
|
||||
mutex_lock(&nvmet_rdma_queue_mutex);
|
||||
}
|
||||
mutex_unlock(&nvmet_rdma_queue_mutex);
|
||||
|
||||
flush_scheduled_work();
|
||||
ib_unregister_client(&nvmet_rdma_ib_client);
|
||||
WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
|
||||
ida_destroy(&nvmet_rdma_queue_ida);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ menuconfig TARGET_CORE
|
|||
select CONFIGFS_FS
|
||||
select CRC_T10DIF
|
||||
select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
|
||||
select SGL_ALLOC
|
||||
default n
|
||||
help
|
||||
Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled
|
||||
|
|
|
@ -2300,13 +2300,7 @@ queue_full:
|
|||
|
||||
void target_free_sgl(struct scatterlist *sgl, int nents)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
int count;
|
||||
|
||||
for_each_sg(sgl, sg, nents, count)
|
||||
__free_page(sg_page(sg));
|
||||
|
||||
kfree(sgl);
|
||||
sgl_free_n_order(sgl, nents, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(target_free_sgl);
|
||||
|
||||
|
@ -2414,42 +2408,10 @@ int
|
|||
target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
|
||||
bool zero_page, bool chainable)
|
||||
{
|
||||
struct scatterlist *sg;
|
||||
struct page *page;
|
||||
gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
|
||||
unsigned int nalloc, nent;
|
||||
int i = 0;
|
||||
gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0);
|
||||
|
||||
nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE);
|
||||
if (chainable)
|
||||
nalloc++;
|
||||
sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
|
||||
if (!sg)
|
||||
return -ENOMEM;
|
||||
|
||||
sg_init_table(sg, nalloc);
|
||||
|
||||
while (length) {
|
||||
u32 page_len = min_t(u32, length, PAGE_SIZE);
|
||||
page = alloc_page(GFP_KERNEL | zero_flag);
|
||||
if (!page)
|
||||
goto out;
|
||||
|
||||
sg_set_page(&sg[i], page, page_len, 0);
|
||||
length -= page_len;
|
||||
i++;
|
||||
}
|
||||
*sgl = sg;
|
||||
*nents = nent;
|
||||
return 0;
|
||||
|
||||
out:
|
||||
while (i > 0) {
|
||||
i--;
|
||||
__free_page(sg_page(&sg[i]));
|
||||
}
|
||||
kfree(sg);
|
||||
return -ENOMEM;
|
||||
*sgl = sgl_alloc_order(length, 0, chainable, gfp, nents);
|
||||
return *sgl ? 0 : -ENOMEM;
|
||||
}
|
||||
EXPORT_SYMBOL(target_alloc_sgl);
|
||||
|
||||
|
|
|
@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
|
|||
|
||||
static u64 bio_end_offset(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
struct bio_vec *last = bio_last_bvec_all(bio);
|
||||
|
||||
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
|
||||
}
|
||||
|
@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
|
|||
/* we need the actual starting offset of this extent in the file */
|
||||
read_lock(&em_tree->lock);
|
||||
em = lookup_extent_mapping(em_tree,
|
||||
page_offset(bio->bi_io_vec->bv_page),
|
||||
page_offset(bio_first_page_all(bio)),
|
||||
PAGE_SIZE);
|
||||
read_unlock(&em_tree->lock);
|
||||
if (!em)
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue