for-linus-20190726
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl07DGAQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgplf5EADOOvOdsz9N/Iw8ZHHHJXCqKR26zZv75G1z 0h1PGC7p0JZQbYFo0Zo7mjiRBGlg6tlXc2d4Gyl94XJKDwjeYTcFDvbvERdYa+MH d2RiFkAfR967Ri4fb+FP5L3mYOQdMJ/zk0xCDHLv/DcxeFLa5a9EJS1+vBSR+AcB 0JpJWuHypGqGmbTaL0z9q2pmx0mgA1ERlWQtkMLrsEr2Vqg/rrjGwe2bGFY00lXc vKtFkpfugKc4zVAPSzC1YZgojfDDpGNEA4QMtxMsEH4hqyMpHhrtUedNY5QrjC0B p9h6aPXXYr2KhGP0grrEytzaYUOzK2crK5h+q+1vu6nOgx2EgmnLM9tBu/LuRH1j uUzKJOa3/AE+bU7uZEsaUerTBsHrgEBa1x8G92obYRnjgW3aCD2CaSbjjBhNxTZ4 1dXyr0DTHFXZmfcfWja5tO26JTPzjwVOrwiRyU0S727UsdVJupoHiYLr5fwaDfgn /Du2I/XWvFtflm5i0ND0sdcX1yRlFiGZ9e45z1QFaFmcteKKWzRBDlC6mQzI/lw3 oc583mhDR3tRtJxow+wn6AuMUehFRh8wj0UhL/MEMjLW8GiqXU5aRtanT+22Xz4L saNDQieeEnV7raMYXMP0qIhkJtrNASmJQos+MOJAEGOWcS2ePIUUio2kSXie+071 BphJd2RamQ== =HIzH -----END PGP SIGNATURE----- Merge tag 'for-linus-20190726' of git://git.kernel.dk/linux-block Pull block fixes from Jens Axboe: - Several io_uring fixes/improvements: - Blocking fix for O_DIRECT (me) - Latter page slowness for registered buffers (me) - Fix poll hang under certain conditions (me) - Defer sequence check fix for wrapped rings (Zhengyuan) - Mismatch in async inc/dec accounting (Zhengyuan) - Memory ordering issue that could cause stall (Zhengyuan) - Track sequential defer in bytes, not pages (Zhengyuan) - NVMe pull request from Christoph - Set of hang fixes for wbt (Josef) - Redundant error message kill for libahci (Ding) - Remove unused blk_mq_sched_started_request() and related ops (Marcos) - drbd dynamic alloc shash descriptor to reduce stack use (Arnd) - blkcg ->pd_stat() non-debug print (Tejun) - bcache memory leak fix (Wei) - Comment fix (Akinobu) - BFQ perf regression fix (Paolo) * tag 'for-linus-20190726' of git://git.kernel.dk/linux-block: (24 commits) io_uring: ensure ->list is initialized for poll commands Revert "nvme-pci: don't create a read hctx mapping without read queues" nvme: fix multipath crash when ANA is deactivated 
nvme: fix memory leak caused by incorrect subsystem free nvme: ignore subnqn for ADATA SX6000LNP drbd: dynamically allocate shash descriptor block: blk-mq: Remove blk_mq_sched_started_request and started_request bcache: fix possible memory leak in bch_cached_dev_run() io_uring: track io length in async_list based on bytes io_uring: don't use iov_iter_advance() for fixed buffers block: properly handle IOCB_NOWAIT for async O_DIRECT IO blk-mq: allow REQ_NOWAIT to return an error inline io_uring: add a memory barrier before atomic_read rq-qos: use a mb for got_token rq-qos: set ourself TASK_UNINTERRUPTIBLE after we schedule rq-qos: don't reset has_sleepers on spurious wakeups rq-qos: fix missed wake-ups in rq_qos_throttle wait: add wq_has_single_sleeper helper block, bfq: check also in-flight I/O in dispatch plugging block: fix sysfs module parameters directory path in comment ...
This commit is contained in:
commit
0441281965
|
@ -3354,38 +3354,57 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
|
|||
* there is no active group, then the primary expectation for
|
||||
* this device is probably a high throughput.
|
||||
*
|
||||
* We are now left only with explaining the additional
|
||||
* compound condition that is checked below for deciding
|
||||
* whether the scenario is asymmetric. To explain this
|
||||
* compound condition, we need to add that the function
|
||||
* We are now left only with explaining the two sub-conditions in the
|
||||
* additional compound condition that is checked below for deciding
|
||||
* whether the scenario is asymmetric. To explain the first
|
||||
* sub-condition, we need to add that the function
|
||||
* bfq_asymmetric_scenario checks the weights of only
|
||||
* non-weight-raised queues, for efficiency reasons (see
|
||||
* comments on bfq_weights_tree_add()). Then the fact that
|
||||
* bfqq is weight-raised is checked explicitly here. More
|
||||
* precisely, the compound condition below takes into account
|
||||
* also the fact that, even if bfqq is being weight-raised,
|
||||
* the scenario is still symmetric if all queues with requests
|
||||
* waiting for completion happen to be
|
||||
* weight-raised. Actually, we should be even more precise
|
||||
* here, and differentiate between interactive weight raising
|
||||
* and soft real-time weight raising.
|
||||
* non-weight-raised queues, for efficiency reasons (see comments on
|
||||
* bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
|
||||
* is checked explicitly here. More precisely, the compound condition
|
||||
* below takes into account also the fact that, even if bfqq is being
|
||||
* weight-raised, the scenario is still symmetric if all queues with
|
||||
* requests waiting for completion happen to be
|
||||
* weight-raised. Actually, we should be even more precise here, and
|
||||
* differentiate between interactive weight raising and soft real-time
|
||||
* weight raising.
|
||||
*
|
||||
* The second sub-condition checked in the compound condition is
|
||||
* whether there is a fair amount of already in-flight I/O not
|
||||
* belonging to bfqq. If so, I/O dispatching is to be plugged, for the
|
||||
* following reason. The drive may decide to serve in-flight
|
||||
* non-bfqq's I/O requests before bfqq's ones, thereby delaying the
|
||||
* arrival of new I/O requests for bfqq (recall that bfqq is sync). If
|
||||
* I/O-dispatching is not plugged, then, while bfqq remains empty, a
|
||||
* basically uncontrolled amount of I/O from other queues may be
|
||||
* dispatched too, possibly causing the service of bfqq's I/O to be
|
||||
* delayed even longer in the drive. This problem gets more and more
|
||||
* serious as the speed and the queue depth of the drive grow,
|
||||
* because, as these two quantities grow, the probability of finding no
|
||||
* queue busy but many requests in flight grows too. By contrast,
|
||||
* plugging I/O dispatching minimizes the delay induced by already
|
||||
* in-flight I/O, and enables bfqq to recover the bandwidth it may
|
||||
* lose because of this delay.
|
||||
*
|
||||
* As a side note, it is worth considering that the above
|
||||
* device-idling countermeasures may however fail in the
|
||||
* following unlucky scenario: if idling is (correctly)
|
||||
* disabled in a time period during which all symmetry
|
||||
* sub-conditions hold, and hence the device is allowed to
|
||||
* enqueue many requests, but at some later point in time some
|
||||
* sub-condition stops holding, then it may become impossible
|
||||
* to let requests be served in the desired order until all
|
||||
* the requests already queued in the device have been served.
|
||||
* device-idling countermeasures may however fail in the following
|
||||
* unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
|
||||
* in a time period during which all symmetry sub-conditions hold, and
|
||||
* therefore the device is allowed to enqueue many requests, but at
|
||||
* some later point in time some sub-condition stops holding, then it
|
||||
* may become impossible to make requests be served in the desired
|
||||
* order until all the requests already queued in the device have been
|
||||
* served. The last sub-condition commented above somewhat mitigates
|
||||
* this problem for weight-raised queues.
|
||||
*/
|
||||
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq)
|
||||
{
|
||||
return (bfqq->wr_coeff > 1 &&
|
||||
bfqd->wr_busy_queues <
|
||||
bfq_tot_busy_queues(bfqd)) ||
|
||||
(bfqd->wr_busy_queues <
|
||||
bfq_tot_busy_queues(bfqd) ||
|
||||
bfqd->rq_in_driver >=
|
||||
bfqq->dispatched + 4)) ||
|
||||
bfq_asymmetric_scenario(bfqd, bfqq);
|
||||
}
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
|
|||
|
||||
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
|
||||
|
||||
static bool blkcg_debug_stats = false;
|
||||
bool blkcg_debug_stats = false;
|
||||
static struct workqueue_struct *blkcg_punt_bio_wq;
|
||||
|
||||
static bool blkcg_policy_enabled(struct request_queue *q,
|
||||
|
@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
|
|||
dbytes, dios);
|
||||
}
|
||||
|
||||
if (!blkcg_debug_stats)
|
||||
goto next;
|
||||
|
||||
if (atomic_read(&blkg->use_delay)) {
|
||||
if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
|
||||
has_stats = true;
|
||||
off += scnprintf(buf+off, size-off,
|
||||
" use_delay=%d delay_nsec=%llu",
|
||||
|
@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
|
|||
has_stats = true;
|
||||
off += written;
|
||||
}
|
||||
next:
|
||||
|
||||
if (has_stats) {
|
||||
if (off < size - 1) {
|
||||
off += scnprintf(buf+off, size-off, "\n");
|
||||
|
|
|
@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
|
|||
unsigned long long avg_lat;
|
||||
unsigned long long cur_win;
|
||||
|
||||
if (!blkcg_debug_stats)
|
||||
return 0;
|
||||
|
||||
if (iolat->ssd)
|
||||
return iolatency_ssd_stat(iolat, buf, size);
|
||||
|
||||
|
|
|
@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
|
|||
e->type->ops.completed_request(rq, now);
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_started_request(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
if (e && e->type->ops.started_request)
|
||||
e->type->ops.started_request(rq);
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_requeue_request(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
|
|
@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq)
|
|||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
||||
blk_mq_sched_started_request(rq);
|
||||
|
||||
trace_block_rq_issue(q, rq);
|
||||
|
||||
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
|
||||
|
@ -1960,9 +1958,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
|||
rq = blk_mq_get_request(q, bio, &data);
|
||||
if (unlikely(!rq)) {
|
||||
rq_qos_cleanup(q, bio);
|
||||
if (bio->bi_opf & REQ_NOWAIT)
|
||||
|
||||
cookie = BLK_QC_T_NONE;
|
||||
if (bio->bi_opf & REQ_NOWAIT_INLINE)
|
||||
cookie = BLK_QC_T_EAGAIN;
|
||||
else if (bio->bi_opf & REQ_NOWAIT)
|
||||
bio_wouldblock_error(bio);
|
||||
return BLK_QC_T_NONE;
|
||||
return cookie;
|
||||
}
|
||||
|
||||
trace_block_getrq(q, bio, bio->bi_opf);
|
||||
|
|
|
@ -202,6 +202,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
|
|||
return -1;
|
||||
|
||||
data->got_token = true;
|
||||
smp_wmb();
|
||||
list_del_init(&curr->entry);
|
||||
wake_up_process(data->task);
|
||||
return 1;
|
||||
|
@ -244,7 +245,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
|
|||
return;
|
||||
|
||||
prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
|
||||
has_sleeper = !wq_has_single_sleeper(&rqw->wait);
|
||||
do {
|
||||
/* The memory barrier in set_task_state saves us here. */
|
||||
if (data.got_token)
|
||||
break;
|
||||
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
|
||||
|
@ -255,12 +258,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
|
|||
* which means we now have two. Put our local token
|
||||
* and wake anyone else potentially waiting for one.
|
||||
*/
|
||||
smp_rmb();
|
||||
if (data.got_token)
|
||||
cleanup_cb(rqw, private_data);
|
||||
break;
|
||||
}
|
||||
io_schedule();
|
||||
has_sleeper = false;
|
||||
has_sleeper = true;
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
} while (1);
|
||||
finish_wait(&rqw->wait, &data.wq);
|
||||
}
|
||||
|
|
|
@ -1969,7 +1969,7 @@ static const struct attribute *disk_events_attrs[] = {
|
|||
* The default polling interval can be specified by the kernel
|
||||
* parameter block.events_dfl_poll_msecs which defaults to 0
|
||||
* (disable). This can also be modified runtime by writing to
|
||||
* /sys/module/block/events_dfl_poll_msecs.
|
||||
* /sys/module/block/parameters/events_dfl_poll_msecs.
|
||||
*/
|
||||
static int disk_events_set_dfl_poll_msecs(const char *val,
|
||||
const struct kernel_param *kp)
|
||||
|
|
|
@ -408,7 +408,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev,
|
|||
hpriv->mmio = devm_ioremap_resource(dev,
|
||||
platform_get_resource(pdev, IORESOURCE_MEM, 0));
|
||||
if (IS_ERR(hpriv->mmio)) {
|
||||
dev_err(dev, "no mmio space\n");
|
||||
rc = PTR_ERR(hpriv->mmio);
|
||||
goto err_out;
|
||||
}
|
||||
|
|
|
@ -5417,7 +5417,7 @@ static int drbd_do_auth(struct drbd_connection *connection)
|
|||
unsigned int key_len;
|
||||
char secret[SHARED_SECRET_MAX]; /* 64 byte */
|
||||
unsigned int resp_size;
|
||||
SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm);
|
||||
struct shash_desc *desc;
|
||||
struct packet_info pi;
|
||||
struct net_conf *nc;
|
||||
int err, rv;
|
||||
|
@ -5430,6 +5430,13 @@ static int drbd_do_auth(struct drbd_connection *connection)
|
|||
memcpy(secret, nc->shared_secret, key_len);
|
||||
rcu_read_unlock();
|
||||
|
||||
desc = kmalloc(sizeof(struct shash_desc) +
|
||||
crypto_shash_descsize(connection->cram_hmac_tfm),
|
||||
GFP_KERNEL);
|
||||
if (!desc) {
|
||||
rv = -1;
|
||||
goto fail;
|
||||
}
|
||||
desc->tfm = connection->cram_hmac_tfm;
|
||||
|
||||
rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
|
||||
|
@ -5571,7 +5578,10 @@ static int drbd_do_auth(struct drbd_connection *connection)
|
|||
kfree(peers_ch);
|
||||
kfree(response);
|
||||
kfree(right_response);
|
||||
shash_desc_zero(desc);
|
||||
if (desc) {
|
||||
shash_desc_zero(desc);
|
||||
kfree(desc);
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
|
|
@ -931,6 +931,9 @@ int bch_cached_dev_run(struct cached_dev *dc)
|
|||
if (dc->io_disable) {
|
||||
pr_err("I/O disabled on cached dev %s",
|
||||
dc->backing_dev_name);
|
||||
kfree(env[1]);
|
||||
kfree(env[2]);
|
||||
kfree(buf);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
|
|
|
@ -2311,15 +2311,13 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct
|
|||
memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
|
||||
}
|
||||
|
||||
static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
|
||||
{
|
||||
ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
|
||||
kfree(subsys);
|
||||
}
|
||||
|
||||
static void nvme_release_subsystem(struct device *dev)
|
||||
{
|
||||
__nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
|
||||
struct nvme_subsystem *subsys =
|
||||
container_of(dev, struct nvme_subsystem, dev);
|
||||
|
||||
ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
|
||||
kfree(subsys);
|
||||
}
|
||||
|
||||
static void nvme_destroy_subsystem(struct kref *ref)
|
||||
|
@ -2477,7 +2475,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
mutex_lock(&nvme_subsystems_lock);
|
||||
found = __nvme_find_get_subsystem(subsys->subnqn);
|
||||
if (found) {
|
||||
__nvme_release_subsystem(subsys);
|
||||
put_device(&subsys->dev);
|
||||
subsys = found;
|
||||
|
||||
if (!nvme_validate_cntlid(subsys, ctrl, id)) {
|
||||
|
|
|
@ -12,11 +12,6 @@ module_param(multipath, bool, 0444);
|
|||
MODULE_PARM_DESC(multipath,
|
||||
"turn on native support for multiple controllers per subsystem");
|
||||
|
||||
inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
|
||||
}
|
||||
|
||||
/*
|
||||
* If multipathing is enabled we need to always use the subsystem instance
|
||||
* number for numbering our devices to avoid conflicts between subsystems that
|
||||
|
@ -622,7 +617,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
{
|
||||
int error;
|
||||
|
||||
if (!nvme_ctrl_use_ana(ctrl))
|
||||
/* check if multipath is enabled and we have the capability */
|
||||
if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
|
||||
return 0;
|
||||
|
||||
ctrl->anacap = id->anacap;
|
||||
|
|
|
@ -485,7 +485,11 @@ extern const struct attribute_group *nvme_ns_id_attr_groups[];
|
|||
extern const struct block_device_operations nvme_ns_head_ops;
|
||||
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
|
||||
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
return ctrl->ana_log_buf != NULL;
|
||||
}
|
||||
|
||||
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
|
||||
struct nvme_ctrl *ctrl, int *flags);
|
||||
void nvme_failover_req(struct request *req);
|
||||
|
|
|
@ -2254,9 +2254,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
|
|||
if (!dev->ctrl.tagset) {
|
||||
dev->tagset.ops = &nvme_mq_ops;
|
||||
dev->tagset.nr_hw_queues = dev->online_queues - 1;
|
||||
dev->tagset.nr_maps = 1; /* default */
|
||||
if (dev->io_queues[HCTX_TYPE_READ])
|
||||
dev->tagset.nr_maps++;
|
||||
dev->tagset.nr_maps = 2; /* default + read */
|
||||
if (dev->io_queues[HCTX_TYPE_POLL])
|
||||
dev->tagset.nr_maps++;
|
||||
dev->tagset.timeout = NVME_IO_TIMEOUT;
|
||||
|
@ -3029,6 +3027,8 @@ static const struct pci_device_id nvme_id_table[] = {
|
|||
.driver_data = NVME_QUIRK_LIGHTNVM, },
|
||||
{ PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
|
||||
.driver_data = NVME_QUIRK_LIGHTNVM, },
|
||||
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
|
||||
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
|
||||
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
|
||||
|
|
|
@ -345,15 +345,24 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
struct bio *bio;
|
||||
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
|
||||
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
|
||||
bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0;
|
||||
loff_t pos = iocb->ki_pos;
|
||||
blk_qc_t qc = BLK_QC_T_NONE;
|
||||
int ret = 0;
|
||||
gfp_t gfp;
|
||||
ssize_t ret;
|
||||
|
||||
if ((pos | iov_iter_alignment(iter)) &
|
||||
(bdev_logical_block_size(bdev) - 1))
|
||||
return -EINVAL;
|
||||
|
||||
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
|
||||
if (nowait)
|
||||
gfp = GFP_NOWAIT;
|
||||
else
|
||||
gfp = GFP_KERNEL;
|
||||
|
||||
bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool);
|
||||
if (!bio)
|
||||
return -EAGAIN;
|
||||
|
||||
dio = container_of(bio, struct blkdev_dio, bio);
|
||||
dio->is_sync = is_sync = is_sync_kiocb(iocb);
|
||||
|
@ -375,7 +384,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
if (!is_poll)
|
||||
blk_start_plug(&plug);
|
||||
|
||||
ret = 0;
|
||||
for (;;) {
|
||||
int err;
|
||||
|
||||
bio_set_dev(bio, bdev);
|
||||
bio->bi_iter.bi_sector = pos >> 9;
|
||||
bio->bi_write_hint = iocb->ki_hint;
|
||||
|
@ -383,8 +395,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
bio->bi_end_io = blkdev_bio_end_io;
|
||||
bio->bi_ioprio = iocb->ki_ioprio;
|
||||
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (unlikely(ret)) {
|
||||
err = bio_iov_iter_get_pages(bio, iter);
|
||||
if (unlikely(err)) {
|
||||
if (!ret)
|
||||
ret = err;
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
bio_endio(bio);
|
||||
break;
|
||||
|
@ -399,6 +413,14 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Tell underlying layer to not block for resource shortage.
|
||||
* And if we would have blocked, return error inline instead
|
||||
* of through the bio->bi_end_io() callback.
|
||||
*/
|
||||
if (nowait)
|
||||
bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
|
||||
|
||||
dio->size += bio->bi_iter.bi_size;
|
||||
pos += bio->bi_iter.bi_size;
|
||||
|
||||
|
@ -412,6 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
}
|
||||
|
||||
qc = submit_bio(bio);
|
||||
if (qc == BLK_QC_T_EAGAIN) {
|
||||
if (!ret)
|
||||
ret = -EAGAIN;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (polled)
|
||||
WRITE_ONCE(iocb->ki_cookie, qc);
|
||||
|
@ -432,8 +459,20 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
atomic_inc(&dio->ref);
|
||||
}
|
||||
|
||||
submit_bio(bio);
|
||||
bio = bio_alloc(GFP_KERNEL, nr_pages);
|
||||
qc = submit_bio(bio);
|
||||
if (qc == BLK_QC_T_EAGAIN) {
|
||||
if (!ret)
|
||||
ret = -EAGAIN;
|
||||
goto error;
|
||||
}
|
||||
ret += bio->bi_iter.bi_size;
|
||||
|
||||
bio = bio_alloc(gfp, nr_pages);
|
||||
if (!bio) {
|
||||
if (!ret)
|
||||
ret = -EAGAIN;
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_poll)
|
||||
|
@ -453,13 +492,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
|
|||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
out:
|
||||
if (!ret)
|
||||
ret = blk_status_to_errno(dio->bio.bi_status);
|
||||
if (likely(!ret))
|
||||
ret = dio->size;
|
||||
|
||||
bio_put(&dio->bio);
|
||||
return ret;
|
||||
error:
|
||||
if (!is_poll)
|
||||
blk_finish_plug(&plug);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
|
|
|
@ -202,7 +202,7 @@ struct async_list {
|
|||
|
||||
struct file *file;
|
||||
off_t io_end;
|
||||
size_t io_pages;
|
||||
size_t io_len;
|
||||
};
|
||||
|
||||
struct io_ring_ctx {
|
||||
|
@ -333,7 +333,8 @@ struct io_kiocb {
|
|||
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
|
||||
#define REQ_F_IO_DRAINED 32 /* drain done */
|
||||
#define REQ_F_LINK 64 /* linked sqes */
|
||||
#define REQ_F_FAIL_LINK 128 /* fail rest of links */
|
||||
#define REQ_F_LINK_DONE 128 /* linked sqes done */
|
||||
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
|
||||
u64 user_data;
|
||||
u32 result;
|
||||
u32 sequence;
|
||||
|
@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
|
|||
if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
|
||||
return false;
|
||||
|
||||
return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
|
||||
return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
|
||||
}
|
||||
|
||||
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
|
||||
|
@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
|
|||
nxt->flags |= REQ_F_LINK;
|
||||
}
|
||||
|
||||
nxt->flags |= REQ_F_LINK_DONE;
|
||||
INIT_WORK(&nxt->work, io_sq_wq_submit_work);
|
||||
queue_work(req->ctx->sqo_wq, &nxt->work);
|
||||
}
|
||||
|
@ -1064,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
|
|||
*/
|
||||
offset = buf_addr - imu->ubuf;
|
||||
iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
|
||||
if (offset)
|
||||
iov_iter_advance(iter, offset);
|
||||
|
||||
if (offset) {
|
||||
/*
|
||||
* Don't use iov_iter_advance() here, as it's really slow for
|
||||
* using the latter parts of a big fixed buffer - it iterates
|
||||
* over each segment manually. We can cheat a bit here, because
|
||||
* we know that:
|
||||
*
|
||||
* 1) it's a BVEC iter, we set it up
|
||||
* 2) all bvecs are PAGE_SIZE in size, except potentially the
|
||||
* first and last bvec
|
||||
*
|
||||
* So just find our index, and adjust the iterator afterwards.
|
||||
* If the offset is within the first bvec (or the whole first
|
||||
* bvec), just use iov_iter_advance(). This makes it easier
|
||||
* since we can just skip the first segment, which may not
|
||||
* be PAGE_SIZE aligned.
|
||||
*/
|
||||
const struct bio_vec *bvec = imu->bvec;
|
||||
|
||||
if (offset <= bvec->bv_len) {
|
||||
iov_iter_advance(iter, offset);
|
||||
} else {
|
||||
unsigned long seg_skip;
|
||||
|
||||
/* skip first vec */
|
||||
offset -= bvec->bv_len;
|
||||
seg_skip = 1 + (offset >> PAGE_SHIFT);
|
||||
|
||||
iter->bvec = bvec + seg_skip;
|
||||
iter->nr_segs -= seg_skip;
|
||||
iter->count -= (seg_skip << PAGE_SHIFT);
|
||||
iter->iov_offset = offset & ~PAGE_MASK;
|
||||
if (iter->iov_offset)
|
||||
iter->count -= iter->iov_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1120,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
|
|||
off_t io_end = kiocb->ki_pos + len;
|
||||
|
||||
if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
|
||||
unsigned long max_pages;
|
||||
unsigned long max_bytes;
|
||||
|
||||
/* Use 8x RA size as a decent limiter for both reads/writes */
|
||||
max_pages = filp->f_ra.ra_pages;
|
||||
if (!max_pages)
|
||||
max_pages = VM_READAHEAD_PAGES;
|
||||
max_pages *= 8;
|
||||
max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
|
||||
if (!max_bytes)
|
||||
max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
|
||||
|
||||
/* If max pages are exceeded, reset the state */
|
||||
len >>= PAGE_SHIFT;
|
||||
if (async_list->io_pages + len <= max_pages) {
|
||||
/* If max len is exceeded, reset the state */
|
||||
if (async_list->io_len + len <= max_bytes) {
|
||||
req->flags |= REQ_F_SEQ_PREV;
|
||||
async_list->io_pages += len;
|
||||
async_list->io_len += len;
|
||||
} else {
|
||||
io_end = 0;
|
||||
async_list->io_pages = 0;
|
||||
async_list->io_len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* New file? Reset state. */
|
||||
if (async_list->file != filp) {
|
||||
async_list->io_pages = 0;
|
||||
async_list->io_len = 0;
|
||||
async_list->file = filp;
|
||||
}
|
||||
async_list->io_end = io_end;
|
||||
|
@ -1630,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|||
INIT_LIST_HEAD(&poll->wait.entry);
|
||||
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
|
||||
|
||||
INIT_LIST_HEAD(&req->list);
|
||||
|
||||
mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
|
||||
|
||||
spin_lock_irq(&ctx->completion_lock);
|
||||
|
@ -1844,6 +1882,10 @@ restart:
|
|||
/* async context always uses a copy of the sqe */
|
||||
kfree(sqe);
|
||||
|
||||
/* req from defer and link list needn't decrease async cnt */
|
||||
if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
|
||||
goto out;
|
||||
|
||||
if (!async_list)
|
||||
break;
|
||||
if (!list_empty(&req_list)) {
|
||||
|
@ -1891,6 +1933,7 @@ restart:
|
|||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (cur_mm) {
|
||||
set_fs(old_fs);
|
||||
unuse_mm(cur_mm);
|
||||
|
@ -1917,6 +1960,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
|
|||
ret = true;
|
||||
spin_lock(&list->lock);
|
||||
list_add_tail(&req->list, &list->list);
|
||||
/*
|
||||
* Ensure we see a simultaneous modification from io_sq_wq_submit_work()
|
||||
*/
|
||||
smp_mb();
|
||||
if (!atomic_read(&list->cnt)) {
|
||||
list_del_init(&req->list);
|
||||
ret = false;
|
||||
|
|
|
@ -181,6 +181,7 @@ struct blkcg_policy {
|
|||
|
||||
extern struct blkcg blkcg_root;
|
||||
extern struct cgroup_subsys_state * const blkcg_root_css;
|
||||
extern bool blkcg_debug_stats;
|
||||
|
||||
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
|
||||
struct request_queue *q, bool update_hint);
|
||||
|
|
|
@ -311,6 +311,7 @@ enum req_flag_bits {
|
|||
__REQ_RAHEAD, /* read ahead, can fail anytime */
|
||||
__REQ_BACKGROUND, /* background IO */
|
||||
__REQ_NOWAIT, /* Don't wait if request will block */
|
||||
__REQ_NOWAIT_INLINE, /* Return would-block error inline */
|
||||
/*
|
||||
* When a shared kthread needs to issue a bio for a cgroup, doing
|
||||
* so synchronously can lead to priority inversions as the kthread
|
||||
|
@ -345,6 +346,7 @@ enum req_flag_bits {
|
|||
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
|
||||
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
|
||||
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
|
||||
#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE)
|
||||
#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
|
||||
|
||||
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
|
||||
|
@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op)
|
|||
|
||||
typedef unsigned int blk_qc_t;
|
||||
#define BLK_QC_T_NONE -1U
|
||||
#define BLK_QC_T_EAGAIN -2U
|
||||
#define BLK_QC_T_SHIFT 16
|
||||
#define BLK_QC_T_INTERNAL (1U << 31)
|
||||
|
||||
static inline bool blk_qc_t_valid(blk_qc_t cookie)
|
||||
{
|
||||
return cookie != BLK_QC_T_NONE;
|
||||
return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
|
||||
}
|
||||
|
||||
static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
|
||||
|
|
|
@ -45,7 +45,6 @@ struct elevator_mq_ops {
|
|||
struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
|
||||
bool (*has_work)(struct blk_mq_hw_ctx *);
|
||||
void (*completed_request)(struct request *, u64);
|
||||
void (*started_request)(struct request *);
|
||||
void (*requeue_request)(struct request *);
|
||||
struct request *(*former_request)(struct request_queue *, struct request *);
|
||||
struct request *(*next_request)(struct request_queue *, struct request *);
|
||||
|
|
|
@ -126,6 +126,19 @@ static inline int waitqueue_active(struct wait_queue_head *wq_head)
|
|||
return !list_empty(&wq_head->head);
|
||||
}
|
||||
|
||||
/**
|
||||
* wq_has_single_sleeper - check if there is only one sleeper
|
||||
* @wq_head: wait queue head
|
||||
*
|
||||
* Returns true if wq_head has only one sleeper on the list.
|
||||
*
|
||||
* Please refer to the comment for waitqueue_active.
|
||||
*/
|
||||
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
|
||||
{
|
||||
return list_is_singular(&wq_head->head);
|
||||
}
|
||||
|
||||
/**
|
||||
* wq_has_sleeper - check if there are any waiting processes
|
||||
* @wq_head: wait queue head
|
||||
|
|
Loading…
Reference in New Issue