// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP host.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>

#include "nvme.h"
#include "fabrics.h"

struct nvme_tcp_queue;

/* Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value is sufficient to indicate general consideration of any
 * possible optimization.  Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
 *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
 * because dependencies are tracked for both nvme-tcp and user contexts. Using
 * a separate class prevents lockdep from conflating nvme-tcp socket use with
 * user-space socket API use.
 */
static struct lock_class_key nvme_tcp_sk_key[2];
static struct lock_class_key nvme_tcp_slock_key[2];

static void nvme_tcp_reclassify_socket(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
		return;

	switch (sk->sk_family) {
	case AF_INET:
		sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
					      &nvme_tcp_slock_key[0],
					      "sk_lock-AF_INET-NVME",
					      &nvme_tcp_sk_key[0]);
		break;
	case AF_INET6:
		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
					      &nvme_tcp_slock_key[1],
					      "sk_lock-AF_INET6-NVME",
					      &nvme_tcp_sk_key[1]);
		break;
	default:
		WARN_ON_ONCE(1);
	}
}
#else
static void nvme_tcp_reclassify_socket(struct socket *sock) { }
#endif

enum nvme_tcp_send_state {
	NVME_TCP_SEND_CMD_PDU = 0,
	NVME_TCP_SEND_H2C_PDU,
	NVME_TCP_SEND_DATA,
	NVME_TCP_SEND_DDGST,
};

struct nvme_tcp_request {
	struct nvme_request	req;
	void			*pdu;
	struct nvme_tcp_queue	*queue;
	u32			data_len;
	u32			pdu_len;
	u32			pdu_sent;
	u32			h2cdata_left;
	u32			h2cdata_offset;
	u16			ttag;
	__le16			status;
	struct list_head	entry;
	struct llist_node	lentry;
	__le32			ddgst;

	struct bio		*curr_bio;
	struct iov_iter		iter;

	/* send state */
	size_t			offset;
	size_t			data_sent;
	enum nvme_tcp_send_state state;
};

enum nvme_tcp_queue_flags {
	NVME_TCP_Q_ALLOCATED	= 0,
	NVME_TCP_Q_LIVE		= 1,
	NVME_TCP_Q_POLLING	= 2,
};

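/*
 * Receive side state machine: a queue is always either collecting a PDU
 * header, copying C2HData payload into the request, or collecting the
 * trailing data digest (see nvme_tcp_recv_state()).
 */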
enum nvme_tcp_recv_state {
	NVME_TCP_RECV_PDU = 0,
	NVME_TCP_RECV_DATA,
	NVME_TCP_RECV_DDGST,
};

struct nvme_tcp_ctrl;
struct nvme_tcp_queue {
	struct socket		*sock;
	struct work_struct	io_work;
	int			io_cpu;

	struct mutex		queue_lock;
	struct mutex		send_mutex;
	struct llist_head	req_list;
	struct list_head	send_list;

	/* recv state */
	void			*pdu;
	int			pdu_remaining;
	int			pdu_offset;
	size_t			data_remaining;
	size_t			ddgst_remaining;
	unsigned int		nr_cqe;

	/* send state */
	struct nvme_tcp_request *request;

	u32			maxh2cdata;
	size_t			cmnd_capsule_len;
	struct nvme_tcp_ctrl	*ctrl;
	unsigned long		flags;
	bool			rd_enabled;

	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*rcv_hash;
	struct ahash_request	*snd_hash;
	__le32			exp_ddgst;
	__le32			recv_ddgst;

	struct page_frag_cache	pf_cache;

	void (*state_change)(struct sock *);
	void (*data_ready)(struct sock *);
	void (*write_space)(struct sock *);
};

struct nvme_tcp_ctrl {
	/* read only in the hot path */
	struct nvme_tcp_queue	*queues;
	struct blk_mq_tag_set	tag_set;

	/* other member variables */
	struct list_head	list;
	struct blk_mq_tag_set	admin_tag_set;
	struct sockaddr_storage addr;
	struct sockaddr_storage src_addr;
	struct nvme_ctrl	ctrl;

	struct work_struct	err_work;
	struct delayed_work	connect_work;
	struct nvme_tcp_request async_req;
	u32			io_queues[HCTX_MAX_TYPES];
};

static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
static const struct blk_mq_ops nvme_tcp_mq_ops;
static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);

static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
}

static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
{
	return queue - queue->ctrl->queues;
}

static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
{
	u32 queue_idx = nvme_tcp_queue_id(queue);

	if (queue_idx == 0)
		return queue->ctrl->admin_tag_set.tags[queue_idx];
	return queue->ctrl->tag_set.tags[queue_idx - 1];
}

static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
{
	return req->pdu;
}

static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
{
	/* use the pdu space in the back for the data pdu */
	return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
		sizeof(struct nvme_tcp_data_pdu);
}

static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
{
	if (nvme_is_fabrics(req->req.cmd))
		return NVME_TCP_ADMIN_CCSZ;
	return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
}

static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
{
	return req == &req->queue->ctrl->async_req;
}

static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
	struct request *rq;

	if (unlikely(nvme_tcp_async_req(req)))
		return false; /* async events don't have a request */

	rq = blk_mq_rq_from_pdu(req);

	return rq_data_dir(rq) == WRITE && req->data_len &&
		req->data_len <= nvme_tcp_inline_data_size(req);
}

static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
{
	return req->iter.bvec->bv_page;
}

static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
{
	return req->iter.bvec->bv_offset + req->iter.iov_offset;
}

static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
{
	return min_t(size_t, iov_iter_single_seg_count(&req->iter),
			req->pdu_len - req->pdu_sent);
}

static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
{
	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
			req->pdu_len - req->pdu_sent : 0;
}

static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
		int len)
{
	return nvme_tcp_pdu_data_left(req) <= len;
}

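/*
 * Point req->iter at the bvecs of the current bio (or the special payload),
 * so payload can be copied to/from skbs without bounce buffers.
 */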
static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
		unsigned int dir)
{
	struct request *rq = blk_mq_rq_from_pdu(req);
	struct bio_vec *vec;
	unsigned int size;
	int nr_bvec;
	size_t offset;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		vec = &rq->special_vec;
		nr_bvec = 1;
		size = blk_rq_payload_bytes(rq);
		offset = 0;
	} else {
		struct bio *bio = req->curr_bio;
		struct bvec_iter bi;
		struct bio_vec bv;

		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
		nr_bvec = 0;
		bio_for_each_bvec(bv, bio, bi) {
			nr_bvec++;
		}
		size = bio->bi_iter.bi_size;
		offset = bio->bi_iter.bi_bvec_done;
	}

	iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
	req->iter.iov_offset = offset;
}

static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
		int len)
{
	req->data_sent += len;
	req->pdu_sent += len;
	iov_iter_advance(&req->iter, len);
	if (!iov_iter_count(&req->iter) &&
	    req->data_sent < req->data_len) {
		req->curr_bio = req->curr_bio->bi_next;
		nvme_tcp_init_iter(req, ITER_SOURCE);
	}
}

static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
{
	int ret;

	/* drain the send queue as much as we can... */
	do {
		ret = nvme_tcp_try_send(queue);
	} while (ret > 0);
}

static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
	return !list_empty(&queue->send_list) ||
		!llist_empty(&queue->req_list);
}

static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
		bool sync, bool last)
{
	struct nvme_tcp_queue *queue = req->queue;
	bool empty;

	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	/*
	 * if we're the first on the send_list and we can try to send
	 * directly, otherwise queue io_work. Also, only do that if we
	 * are on the same cpu, so we don't introduce contention.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    sync && empty && mutex_trylock(&queue->send_mutex)) {
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	}

	if (last && nvme_tcp_queue_more(queue))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_request *req;
	struct llist_node *node;

	for (node = llist_del_all(&queue->req_list); node; node = node->next) {
		req = llist_entry(node, struct nvme_tcp_request, lentry);
		list_add(&req->entry, &queue->send_list);
	}
}

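/*
 * Submitters add requests to the lockless req_list; the single sender
 * context splices them onto send_list via nvme_tcp_process_req_list()
 * and consumes them from there.
 */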
static inline struct nvme_tcp_request *
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_request *req;

	req = list_first_entry_or_null(&queue->send_list,
			struct nvme_tcp_request, entry);
	if (!req) {
		nvme_tcp_process_req_list(queue);
		req = list_first_entry_or_null(&queue->send_list,
				struct nvme_tcp_request, entry);
		if (unlikely(!req))
			return NULL;
	}

	list_del(&req->entry);
	return req;
}

static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
		__le32 *dgst)
{
	ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
	crypto_ahash_final(hash);
}

static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
		struct page *page, off_t off, size_t len)
{
	struct scatterlist sg;

	sg_init_table(&sg, 1);
	sg_set_page(&sg, page, len, off);
	ahash_request_set_crypt(hash, &sg, NULL, len);
	crypto_ahash_update(hash);
}

static inline void nvme_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}

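/*
 * Verify the header digest by saving the received digest, recomputing it
 * in place over the received header and comparing the two values.
 */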
static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
		void *pdu, size_t pdu_len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d: header digest flag is cleared\n",
			nvme_tcp_queue_id(queue));
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		dev_err(queue->ctrl->ctrl.device,
			"header digest error: recv %#x expected %#x\n",
			le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
		return -EIO;
	}

	return 0;
}

static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvme_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d: data digest flag is cleared\n",
			nvme_tcp_queue_id(queue));
		return -EPROTO;
	}
	crypto_ahash_init(queue->rcv_hash);

	return 0;
}

static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

	page_frag_free(req->pdu);
}

static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx,
		unsigned int numa_node)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu;
	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	req->pdu = page_frag_alloc(&queue->pf_cache,
		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
		GFP_KERNEL | __GFP_ZERO);
	if (!req->pdu)
		return -ENOMEM;

	pdu = req->pdu;
	req->queue = queue;
	nvme_req(rq)->ctrl = &ctrl->ctrl;
	nvme_req(rq)->cmd = &pdu->cmd;

	return 0;
}

static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];

	hctx->driver_data = queue;
	return 0;
}

static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
	struct nvme_tcp_queue *queue = &ctrl->queues[0];

	hctx->driver_data = queue;
	return 0;
}

static enum nvme_tcp_recv_state
nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
{
	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
		NVME_TCP_RECV_DATA;
}

static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
{
	queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
				nvme_tcp_hdgst_len(queue);
	queue->pdu_offset = 0;
	queue->data_remaining = -1;
	queue->ddgst_remaining = 0;
}

static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return;

	dev_warn(ctrl->device, "starting error recovery\n");
	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}

static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
		struct nvme_completion *cqe)
{
	struct nvme_tcp_request *req;
	struct request *rq;

	rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"got bad cqe.command_id %#x on queue %d\n",
			cqe->command_id, nvme_tcp_queue_id(queue));
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		return -EINVAL;
	}

	req = blk_mq_rq_to_pdu(rq);
	if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
		req->status = cqe->status;

	if (!nvme_try_complete_req(rq, req->status, cqe->result))
		nvme_complete_rq(rq);
	queue->nr_cqe++;

	return 0;
}

static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
		struct nvme_tcp_data_pdu *pdu)
{
	struct request *rq;

	rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"got bad c2hdata.command_id %#x on queue %d\n",
			pdu->command_id, nvme_tcp_queue_id(queue));
		return -ENOENT;
	}

	if (!blk_rq_payload_bytes(rq)) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x unexpected data\n",
			nvme_tcp_queue_id(queue), rq->tag);
		return -EIO;
	}

	queue->data_remaining = le32_to_cpu(pdu->data_length);

	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
	    unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x SUCCESS set but not last PDU\n",
			nvme_tcp_queue_id(queue), rq->tag);
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		return -EPROTO;
	}

	return 0;
}

static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
		struct nvme_tcp_rsp_pdu *pdu)
{
	struct nvme_completion *cqe = &pdu->cqe;
	int ret = 0;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
				     cqe->command_id)))
		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
				&cqe->result);
	else
		ret = nvme_tcp_process_nvme_cqe(queue, cqe);

	return ret;
}

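/*
 * Prepare the next H2CData PDU for an outstanding R2T, capping each chunk
 * at the MAXH2CDATA value advertised by the controller.
 */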
static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
{
	struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
	struct nvme_tcp_queue *queue = req->queue;
	struct request *rq = blk_mq_rq_from_pdu(req);
	u32 h2cdata_sent = req->pdu_len;
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	u8 ddgst = nvme_tcp_ddgst_len(queue);

	req->state = NVME_TCP_SEND_H2C_PDU;
	req->offset = 0;
	req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
	req->pdu_sent = 0;
	req->h2cdata_left -= req->pdu_len;
	req->h2cdata_offset += h2cdata_sent;

	memset(data, 0, sizeof(*data));
	data->hdr.type = nvme_tcp_h2c_data;
	if (!req->h2cdata_left)
		data->hdr.flags = NVME_TCP_F_DATA_LAST;
	if (queue->hdr_digest)
		data->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest)
		data->hdr.flags |= NVME_TCP_F_DDGST;
	data->hdr.hlen = sizeof(*data);
	data->hdr.pdo = data->hdr.hlen + hdgst;
	data->hdr.plen =
		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
	data->ttag = req->ttag;
	data->command_id = nvme_cid(rq);
	data->data_offset = cpu_to_le32(req->h2cdata_offset);
	data->data_length = cpu_to_le32(req->pdu_len);
}

static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
		struct nvme_tcp_r2t_pdu *pdu)
{
	struct nvme_tcp_request *req;
	struct request *rq;
	u32 r2t_length = le32_to_cpu(pdu->r2t_length);
	u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);

	rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"got bad r2t.command_id %#x on queue %d\n",
			pdu->command_id, nvme_tcp_queue_id(queue));
		return -ENOENT;
	}
	req = blk_mq_rq_to_pdu(rq);

	if (unlikely(!r2t_length)) {
		dev_err(queue->ctrl->ctrl.device,
			"req %d r2t len is %u, probably a bug...\n",
			rq->tag, r2t_length);
		return -EPROTO;
	}

	if (unlikely(req->data_sent + r2t_length > req->data_len)) {
		dev_err(queue->ctrl->ctrl.device,
			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
			rq->tag, r2t_length, req->data_len, req->data_sent);
		return -EPROTO;
	}

	if (unlikely(r2t_offset < req->data_sent)) {
		dev_err(queue->ctrl->ctrl.device,
			"req %d unexpected r2t offset %u (expected %zu)\n",
			rq->tag, r2t_offset, req->data_sent);
		return -EPROTO;
	}

	req->pdu_len = 0;
	req->h2cdata_left = r2t_length;
	req->h2cdata_offset = r2t_offset;
	req->ttag = pdu->ttag;

	nvme_tcp_setup_h2c_data_pdu(req);
	nvme_tcp_queue_request(req, false, true);

	return 0;
}

static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		unsigned int *offset, size_t *len)
{
	struct nvme_tcp_hdr *hdr;
	char *pdu = queue->pdu;
	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
	int ret;

	ret = skb_copy_bits(skb, *offset,
		&pdu[queue->pdu_offset], rcv_len);
	if (unlikely(ret))
		return ret;

	queue->pdu_remaining -= rcv_len;
	queue->pdu_offset += rcv_len;
	*offset += rcv_len;
	*len -= rcv_len;
	if (queue->pdu_remaining)
		return 0;

	hdr = queue->pdu;
	if (queue->hdr_digest) {
		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
		if (unlikely(ret))
			return ret;
	}

	if (queue->data_digest) {
		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
		if (unlikely(ret))
			return ret;
	}

	switch (hdr->type) {
	case nvme_tcp_c2h_data:
		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
	case nvme_tcp_rsp:
		nvme_tcp_init_recv_ctx(queue);
		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
	case nvme_tcp_r2t:
		nvme_tcp_init_recv_ctx(queue);
		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
	default:
		dev_err(queue->ctrl->ctrl.device,
			"unsupported pdu type (%d)\n", hdr->type);
		return -EINVAL;
	}
}

static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
	union nvme_result res = {};

	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
		nvme_complete_rq(rq);
}

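/*
 * Copy C2HData payload from the skb directly into the request's bio
 * iterator, optionally feeding the data digest as we go.
 */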
static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		unsigned int *offset, size_t *len)
{
	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
	struct request *rq =
		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

	while (true) {
		int recv_len, ret;

		recv_len = min_t(size_t, *len, queue->data_remaining);
		if (!recv_len)
			break;

		if (!iov_iter_count(&req->iter)) {
			req->curr_bio = req->curr_bio->bi_next;

			/*
			 * If we don't have any bios it means that controller
			 * sent more data than we requested, hence error
			 */
			if (!req->curr_bio) {
				dev_err(queue->ctrl->ctrl.device,
					"queue %d no space in request %#x",
					nvme_tcp_queue_id(queue), rq->tag);
				nvme_tcp_init_recv_ctx(queue);
				return -EIO;
			}
			nvme_tcp_init_iter(req, ITER_DEST);
		}

		/* we can read only from what is left in this bio */
		recv_len = min_t(size_t, recv_len,
				iov_iter_count(&req->iter));

		if (queue->data_digest)
			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
				&req->iter, recv_len, queue->rcv_hash);
		else
			ret = skb_copy_datagram_iter(skb, *offset,
					&req->iter, recv_len);
		if (ret) {
			dev_err(queue->ctrl->ctrl.device,
				"queue %d failed to copy request %#x data",
				nvme_tcp_queue_id(queue), rq->tag);
			return ret;
		}

		*len -= recv_len;
		*offset += recv_len;
		queue->data_remaining -= recv_len;
	}

	if (!queue->data_remaining) {
		if (queue->data_digest) {
			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
		} else {
			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
				nvme_tcp_end_request(rq,
						le16_to_cpu(req->status));
				queue->nr_cqe++;
			}
			nvme_tcp_init_recv_ctx(queue);
		}
	}

	return 0;
}

static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
		struct sk_buff *skb, unsigned int *offset, size_t *len)
{
	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
	char *ddgst = (char *)&queue->recv_ddgst;
	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
	int ret;

	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
	if (unlikely(ret))
		return ret;

	queue->ddgst_remaining -= recv_len;
	*offset += recv_len;
	*len -= recv_len;
	if (queue->ddgst_remaining)
		return 0;

	if (queue->recv_ddgst != queue->exp_ddgst) {
		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
					pdu->command_id);
		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

		req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);

		dev_err(queue->ctrl->ctrl.device,
			"data digest error: recv %#x expected %#x\n",
			le32_to_cpu(queue->recv_ddgst),
			le32_to_cpu(queue->exp_ddgst));
	}

	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
					pdu->command_id);
		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

		nvme_tcp_end_request(rq, le16_to_cpu(req->status));
		queue->nr_cqe++;
	}

	nvme_tcp_init_recv_ctx(queue);
	return 0;
}

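/*
 * recv_actor for the queue's socket: consume the skb according to the
 * current receive state until all bytes are claimed or an error occurs.
 */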
static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	struct nvme_tcp_queue *queue = desc->arg.data;
	size_t consumed = len;
	int result;

	if (unlikely(!queue->rd_enabled))
		return -EFAULT;

	while (len) {
		switch (nvme_tcp_recv_state(queue)) {
		case NVME_TCP_RECV_PDU:
			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
			break;
		case NVME_TCP_RECV_DATA:
			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
			break;
		case NVME_TCP_RECV_DDGST:
			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
			break;
		default:
			result = -EFAULT;
		}
		if (result) {
			dev_err(queue->ctrl->ctrl.device,
				"receive failed: %d\n", result);
			queue->rd_enabled = false;
			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
			return result;
		}
	}

	return consumed;
}

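/*
 * Socket callbacks run in atomic context; they kick io_work or error
 * recovery rather than doing any I/O inline.
 */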
static void nvme_tcp_data_ready(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	trace_sk_data_ready(sk);

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue && queue->rd_enabled) &&
	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvme_tcp_write_space(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue && sk_stream_is_writeable(sk))) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvme_tcp_state_change(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_CLOSE:
	case TCP_CLOSE_WAIT:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		break;
	default:
		dev_info(queue->ctrl->ctrl.device,
			"queue %d socket state %d\n",
			nvme_tcp_queue_id(queue), sk->sk_state);
	}

	queue->state_change(sk);
done:
	read_unlock_bh(&sk->sk_callback_lock);
}

static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
	queue->request = NULL;
}

static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
	if (nvme_tcp_async_req(req)) {
		union nvme_result res = {};

		nvme_complete_async_event(&req->queue->ctrl->ctrl,
				cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
	} else {
		nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
				NVME_SC_HOST_PATH_ERROR);
	}
}

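/*
 * Send the data payload page by page with kernel_sendpage() (falling back
 * to sock_no_sendpage() for pages that cannot be sent by reference),
 * updating the data digest inline when enabled.
 */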
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	int req_data_len = req->data_len;
	u32 h2cdata_left = req->h2cdata_left;

	while (true) {
		struct page *page = nvme_tcp_req_cur_page(req);
		size_t offset = nvme_tcp_req_cur_offset(req);
		size_t len = nvme_tcp_req_cur_length(req);
		bool last = nvme_tcp_pdu_last_send(req, len);
		int req_data_sent = req->data_sent;
		int ret, flags = MSG_DONTWAIT;

		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
			flags |= MSG_EOR;
		else
			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;

		if (sendpage_ok(page)) {
			ret = kernel_sendpage(queue->sock, page, offset, len,
					flags);
		} else {
			ret = sock_no_sendpage(queue->sock, page, offset, len,
					flags);
		}
		if (ret <= 0)
			return ret;

		if (queue->data_digest)
			nvme_tcp_ddgst_update(queue->snd_hash, page,
					offset, ret);

		/*
		 * update the request iterator except for the last payload send
		 * in the request where we don't want to modify it as we may
		 * compete with the RX path completing the request.
		 */
		if (req_data_sent + ret < req_data_len)
			nvme_tcp_advance_req(req, ret);

		/* fully successful last send in current PDU */
		if (last && ret == len) {
			if (queue->data_digest) {
				nvme_tcp_ddgst_final(queue->snd_hash,
					&req->ddgst);
				req->state = NVME_TCP_SEND_DDGST;
				req->offset = 0;
			} else {
				if (h2cdata_left)
					nvme_tcp_setup_h2c_data_pdu(req);
				else
					nvme_tcp_done_send_req(queue);
			}
			return 1;
		}
	}
	return -EAGAIN;
}

static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	bool inline_data = nvme_tcp_has_inline_data(req);
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	int len = sizeof(*pdu) + hdgst - req->offset;
	int flags = MSG_DONTWAIT;
	int ret;

	if (inline_data || nvme_tcp_queue_more(queue))
		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
	else
		flags |= MSG_EOR;

	if (queue->hdr_digest && !req->offset)
		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
			offset_in_page(pdu) + req->offset, len, flags);
	if (unlikely(ret <= 0))
		return ret;

	len -= ret;
	if (!len) {
		if (inline_data) {
			req->state = NVME_TCP_SEND_DATA;
			if (queue->data_digest)
				crypto_ahash_init(queue->snd_hash);
		} else {
			nvme_tcp_done_send_req(queue);
		}
		return 1;
	}
	req->offset += ret;

	return -EAGAIN;
}

static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	int len = sizeof(*pdu) - req->offset + hdgst;
	int ret;

	if (queue->hdr_digest && !req->offset)
		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

	if (!req->h2cdata_left)
		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
				offset_in_page(pdu) + req->offset, len,
				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
	else
		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
				offset_in_page(pdu) + req->offset, len,
				MSG_DONTWAIT | MSG_MORE);
	if (unlikely(ret <= 0))
		return ret;

	len -= ret;
	if (!len) {
		req->state = NVME_TCP_SEND_DATA;
		if (queue->data_digest)
			crypto_ahash_init(queue->snd_hash);
		return 1;
	}
	req->offset += ret;

	return -EAGAIN;
}

static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	size_t offset = req->offset;
	u32 h2cdata_left = req->h2cdata_left;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (u8 *)&req->ddgst + req->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
	};

	if (nvme_tcp_queue_more(queue))
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
		if (h2cdata_left)
			nvme_tcp_setup_h2c_data_pdu(req);
		else
			nvme_tcp_done_send_req(queue);
		return 1;
	}

	req->offset += ret;
	return -EAGAIN;
}

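/*
 * Top-level send state machine: command PDU, then H2CData PDU, then data,
 * then data digest. Runs with memory reclaim disabled so sending cannot
 * recurse back into the driver under memory pressure.
 */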
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_request *req;
	unsigned int noreclaim_flag;
	int ret = 1;

	if (!queue->request) {
		queue->request = nvme_tcp_fetch_request(queue);
		if (!queue->request)
			return 0;
	}
	req = queue->request;

	noreclaim_flag = memalloc_noreclaim_save();
	if (req->state == NVME_TCP_SEND_CMD_PDU) {
		ret = nvme_tcp_try_send_cmd_pdu(req);
		if (ret <= 0)
			goto done;
		if (!nvme_tcp_has_inline_data(req))
|
|
|
goto out;
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (req->state == NVME_TCP_SEND_H2C_PDU) {
|
|
|
|
ret = nvme_tcp_try_send_data_pdu(req);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (req->state == NVME_TCP_SEND_DATA) {
|
|
|
|
ret = nvme_tcp_try_send_data(req);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (req->state == NVME_TCP_SEND_DDGST)
|
|
|
|
ret = nvme_tcp_try_send_ddgst(req);
|
|
|
|
done:
|
2020-02-26 08:43:23 +08:00
|
|
|
if (ret == -EAGAIN) {
|
2018-12-04 09:52:17 +08:00
|
|
|
ret = 0;
|
2020-02-26 08:43:23 +08:00
|
|
|
} else if (ret < 0) {
|
|
|
|
dev_err(queue->ctrl->ctrl.device,
|
|
|
|
"failed to send request %d\n", ret);
|
2022-06-26 17:24:51 +08:00
|
|
|
nvme_tcp_fail_request(queue->request);
|
2020-02-26 08:43:23 +08:00
|
|
|
nvme_tcp_done_send_req(queue);
|
|
|
|
}
|
2022-10-23 16:04:43 +08:00
|
|
|
out:
|
|
|
|
memalloc_noreclaim_restore(noreclaim_flag);
|
2018-12-04 09:52:17 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
2019-07-08 17:52:00 +08:00
|
|
|
struct socket *sock = queue->sock;
|
|
|
|
struct sock *sk = sock->sk;
|
2018-12-04 09:52:17 +08:00
|
|
|
read_descriptor_t rd_desc;
|
|
|
|
int consumed;
|
|
|
|
|
|
|
|
rd_desc.arg.data = queue;
|
|
|
|
rd_desc.count = 1;
|
|
|
|
lock_sock(sk);
|
2019-07-04 05:08:04 +08:00
|
|
|
queue->nr_cqe = 0;
|
2019-07-08 17:52:00 +08:00
|
|
|
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
|
2018-12-04 09:52:17 +08:00
|
|
|
release_sock(sk);
|
|
|
|
return consumed;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_io_work(struct work_struct *w)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_queue *queue =
|
|
|
|
container_of(w, struct nvme_tcp_queue, io_work);
|
2019-09-19 07:36:37 +08:00
|
|
|
unsigned long deadline = jiffies + msecs_to_jiffies(1);
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
bool pending = false;
|
|
|
|
int result;
|
|
|
|
|
2020-05-02 05:25:45 +08:00
|
|
|
if (mutex_trylock(&queue->send_mutex)) {
|
|
|
|
result = nvme_tcp_try_send(queue);
|
|
|
|
mutex_unlock(&queue->send_mutex);
|
|
|
|
if (result > 0)
|
|
|
|
pending = true;
|
|
|
|
else if (unlikely(result < 0))
|
|
|
|
break;
|
2021-09-09 23:54:52 +08:00
|
|
|
}
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
result = nvme_tcp_try_recv(queue);
|
|
|
|
if (result > 0)
|
|
|
|
pending = true;
|
2020-02-26 08:43:24 +08:00
|
|
|
else if (unlikely(result < 0))
|
2020-04-01 13:44:23 +08:00
|
|
|
return;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2022-09-05 18:54:17 +08:00
|
|
|
if (!pending || !queue->rd_enabled)
|
2018-12-04 09:52:17 +08:00
|
|
|
return;
|
|
|
|
|
2019-09-19 07:36:37 +08:00
|
|
|
} while (!time_after(jiffies, deadline)); /* quota is exhausted */
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
|
|
|
|
|
|
|
|
ahash_request_free(queue->rcv_hash);
|
|
|
|
ahash_request_free(queue->snd_hash);
|
|
|
|
crypto_free_ahash(tfm);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct crypto_ahash *tfm;
|
|
|
|
|
|
|
|
tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
|
|
|
|
if (IS_ERR(tfm))
|
|
|
|
return PTR_ERR(tfm);
|
|
|
|
|
|
|
|
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
|
|
|
|
if (!queue->snd_hash)
|
|
|
|
goto free_tfm;
|
|
|
|
ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
|
|
|
|
|
|
|
|
queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
|
|
|
|
if (!queue->rcv_hash)
|
|
|
|
goto free_snd_hash;
|
|
|
|
ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
free_snd_hash:
|
|
|
|
ahash_request_free(queue->snd_hash);
|
|
|
|
free_tfm:
|
|
|
|
crypto_free_ahash(tfm);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_request *async = &ctrl->async_req;
|
|
|
|
|
|
|
|
page_frag_free(async->pdu);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_queue *queue = &ctrl->queues[0];
|
|
|
|
struct nvme_tcp_request *async = &ctrl->async_req;
|
|
|
|
u8 hdgst = nvme_tcp_hdgst_len(queue);
|
|
|
|
|
|
|
|
async->pdu = page_frag_alloc(&queue->pf_cache,
|
|
|
|
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
|
|
|
|
GFP_KERNEL | __GFP_ZERO);
|
|
|
|
if (!async->pdu)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
async->queue = &ctrl->queues[0];
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
|
|
|
|
{
|
2021-11-03 16:18:17 +08:00
|
|
|
struct page *page;
|
2018-12-04 09:52:17 +08:00
|
|
|
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
|
|
|
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
|
2022-10-23 16:04:43 +08:00
|
|
|
unsigned int noreclaim_flag;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (queue->hdr_digest || queue->data_digest)
|
|
|
|
nvme_tcp_free_crypto(queue);
|
|
|
|
|
2021-11-03 16:18:17 +08:00
|
|
|
if (queue->pf_cache.va) {
|
|
|
|
page = virt_to_head_page(queue->pf_cache.va);
|
|
|
|
__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
|
|
|
|
queue->pf_cache.va = NULL;
|
|
|
|
}
|
2022-10-23 16:04:43 +08:00
|
|
|
|
|
|
|
noreclaim_flag = memalloc_noreclaim_save();
|
2018-12-04 09:52:17 +08:00
|
|
|
sock_release(queue->sock);
|
2022-10-23 16:04:43 +08:00
|
|
|
memalloc_noreclaim_restore(noreclaim_flag);
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
kfree(queue->pdu);
|
2021-08-06 23:41:43 +08:00
|
|
|
mutex_destroy(&queue->send_mutex);
|
2021-01-14 17:09:26 +08:00
|
|
|
mutex_destroy(&queue->queue_lock);
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_icreq_pdu *icreq;
|
|
|
|
struct nvme_tcp_icresp_pdu *icresp;
|
|
|
|
struct msghdr msg = {};
|
|
|
|
struct kvec iov;
|
|
|
|
bool ctrl_hdgst, ctrl_ddgst;
|
2022-01-23 00:57:44 +08:00
|
|
|
u32 maxh2cdata;
|
2018-12-04 09:52:17 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
|
|
|
|
if (!icreq)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
|
|
|
|
if (!icresp) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto free_icreq;
|
|
|
|
}
|
|
|
|
|
|
|
|
icreq->hdr.type = nvme_tcp_icreq;
|
|
|
|
icreq->hdr.hlen = sizeof(*icreq);
|
|
|
|
icreq->hdr.pdo = 0;
|
|
|
|
icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
|
|
|
|
icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
|
|
|
|
icreq->maxr2t = 0; /* single inflight r2t supported */
|
|
|
|
icreq->hpda = 0; /* no alignment constraint */
|
|
|
|
if (queue->hdr_digest)
|
|
|
|
icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
|
|
|
|
if (queue->data_digest)
|
|
|
|
icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
|
|
|
|
|
|
|
|
iov.iov_base = icreq;
|
|
|
|
iov.iov_len = sizeof(*icreq);
|
|
|
|
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto free_icresp;
|
|
|
|
|
|
|
|
memset(&msg, 0, sizeof(msg));
|
|
|
|
iov.iov_base = icresp;
|
|
|
|
iov.iov_len = sizeof(*icresp);
|
|
|
|
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
|
|
|
|
iov.iov_len, msg.msg_flags);
|
|
|
|
if (ret < 0)
|
|
|
|
goto free_icresp;
|
|
|
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
if (icresp->hdr.type != nvme_tcp_icresp) {
|
|
|
|
pr_err("queue %d: bad type returned %d\n",
|
|
|
|
nvme_tcp_queue_id(queue), icresp->hdr.type);
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
|
|
|
|
pr_err("queue %d: bad pdu length returned %d\n",
|
|
|
|
nvme_tcp_queue_id(queue), icresp->hdr.plen);
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (icresp->pfv != NVME_TCP_PFV_1_0) {
|
|
|
|
pr_err("queue %d: bad pfv returned %d\n",
|
|
|
|
nvme_tcp_queue_id(queue), icresp->pfv);
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
|
|
|
|
if ((queue->data_digest && !ctrl_ddgst) ||
|
|
|
|
(!queue->data_digest && ctrl_ddgst)) {
|
|
|
|
pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
|
|
|
|
nvme_tcp_queue_id(queue),
|
|
|
|
queue->data_digest ? "enabled" : "disabled",
|
|
|
|
ctrl_ddgst ? "enabled" : "disabled");
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
|
|
|
|
if ((queue->hdr_digest && !ctrl_hdgst) ||
|
|
|
|
(!queue->hdr_digest && ctrl_hdgst)) {
|
|
|
|
pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
|
|
|
|
nvme_tcp_queue_id(queue),
|
|
|
|
queue->hdr_digest ? "enabled" : "disabled",
|
|
|
|
ctrl_hdgst ? "enabled" : "disabled");
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (icresp->cpda != 0) {
|
|
|
|
pr_err("queue %d: unsupported cpda returned %d\n",
|
|
|
|
nvme_tcp_queue_id(queue), icresp->cpda);
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
|
2022-01-23 00:57:44 +08:00
|
|
|
maxh2cdata = le32_to_cpu(icresp->maxdata);
|
|
|
|
if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
|
|
|
|
pr_err("queue %d: invalid maxh2cdata returned %u\n",
|
|
|
|
nvme_tcp_queue_id(queue), maxh2cdata);
|
|
|
|
goto free_icresp;
|
|
|
|
}
|
|
|
|
queue->maxh2cdata = maxh2cdata;
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
ret = 0;
|
|
|
|
free_icresp:
|
|
|
|
kfree(icresp);
|
|
|
|
free_icreq:
|
|
|
|
kfree(icreq);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-02-26 07:53:09 +08:00
|
|
|
static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
return nvme_tcp_queue_id(queue) == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
|
|
|
|
int qid = nvme_tcp_queue_id(queue);
|
|
|
|
|
|
|
|
return !nvme_tcp_admin_queue(queue) &&
|
|
|
|
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
|
|
|
|
int qid = nvme_tcp_queue_id(queue);
|
|
|
|
|
|
|
|
return !nvme_tcp_admin_queue(queue) &&
|
|
|
|
!nvme_tcp_default_queue(queue) &&
|
|
|
|
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
|
|
|
|
ctrl->io_queues[HCTX_TYPE_READ];
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
|
|
|
|
int qid = nvme_tcp_queue_id(queue);
|
|
|
|
|
|
|
|
return !nvme_tcp_admin_queue(queue) &&
|
|
|
|
!nvme_tcp_default_queue(queue) &&
|
|
|
|
!nvme_tcp_read_queue(queue) &&
|
|
|
|
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
|
|
|
|
ctrl->io_queues[HCTX_TYPE_READ] +
|
|
|
|
ctrl->io_queues[HCTX_TYPE_POLL];
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
|
|
|
|
int qid = nvme_tcp_queue_id(queue);
|
|
|
|
int n = 0;
|
|
|
|
|
|
|
|
if (nvme_tcp_default_queue(queue))
|
|
|
|
n = qid - 1;
|
|
|
|
else if (nvme_tcp_read_queue(queue))
|
|
|
|
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
|
|
|
|
else if (nvme_tcp_poll_queue(queue))
|
|
|
|
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
|
|
|
|
ctrl->io_queues[HCTX_TYPE_READ] - 1;
|
|
|
|
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
|
|
|
|
}
|
|
|
|
|
2022-09-20 23:23:24 +08:00
|
|
|
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
|
2018-12-04 09:52:17 +08:00
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
|
|
|
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
|
2020-05-28 13:12:26 +08:00
|
|
|
int ret, rcv_pdu_size;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2021-01-14 17:09:26 +08:00
|
|
|
mutex_init(&queue->queue_lock);
|
2018-12-04 09:52:17 +08:00
|
|
|
queue->ctrl = ctrl;
|
2020-06-19 08:30:22 +08:00
|
|
|
init_llist_head(&queue->req_list);
|
2018-12-04 09:52:17 +08:00
|
|
|
INIT_LIST_HEAD(&queue->send_list);
|
2020-05-02 05:25:45 +08:00
|
|
|
mutex_init(&queue->send_mutex);
|
2018-12-04 09:52:17 +08:00
|
|
|
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
|
|
|
|
|
|
|
|
if (qid > 0)
|
2019-08-18 17:08:53 +08:00
|
|
|
queue->cmnd_capsule_len = nctrl->ioccsz * 16;
|
2018-12-04 09:52:17 +08:00
|
|
|
else
|
|
|
|
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
|
|
|
|
NVME_TCP_ADMIN_CCSZ;
|
|
|
|
|
|
|
|
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
|
|
|
|
IPPROTO_TCP, &queue->sock);
|
|
|
|
if (ret) {
|
2019-08-18 17:08:53 +08:00
|
|
|
dev_err(nctrl->device,
|
2018-12-04 09:52:17 +08:00
|
|
|
"failed to create socket: %d\n", ret);
|
2021-01-14 17:09:26 +08:00
|
|
|
goto err_destroy_mutex;
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
2022-02-16 10:22:49 +08:00
|
|
|
nvme_tcp_reclassify_socket(queue->sock);
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
/* Single syn retry */
|
2020-05-28 13:12:21 +08:00
|
|
|
tcp_sock_set_syncnt(queue->sock->sk, 1);
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
/* Set TCP no delay */
|
2020-05-28 13:12:19 +08:00
|
|
|
tcp_sock_set_nodelay(queue->sock->sk);
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Cleanup whatever is sitting in the TCP transmit queue on socket
|
|
|
|
* close. This is done to prevent stale data from being sent should
|
|
|
|
* the network connection be restored before TCP times out.
|
|
|
|
*/
|
2020-05-28 13:12:10 +08:00
|
|
|
sock_no_linger(queue->sock->sk);
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2020-05-28 13:12:11 +08:00
|
|
|
if (so_priority > 0)
|
|
|
|
sock_set_priority(queue->sock->sk, so_priority);
|
2020-01-16 08:46:12 +08:00
|
|
|
|
2019-08-18 17:08:54 +08:00
|
|
|
/* Set socket type of service */
|
2020-05-28 13:12:26 +08:00
|
|
|
if (nctrl->opts->tos >= 0)
|
|
|
|
ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
|
2019-08-18 17:08:54 +08:00
|
|
|
|
2020-07-24 07:42:26 +08:00
|
|
|
/* Set 10 seconds timeout for icresp recvmsg */
|
|
|
|
queue->sock->sk->sk_rcvtimeo = 10 * HZ;
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
queue->sock->sk->sk_allocation = GFP_ATOMIC;
|
2022-12-16 20:45:27 +08:00
|
|
|
queue->sock->sk->sk_use_task_frag = false;
|
2020-02-26 07:53:09 +08:00
|
|
|
nvme_tcp_set_queue_io_cpu(queue);
|
2018-12-04 09:52:17 +08:00
|
|
|
queue->request = NULL;
|
|
|
|
queue->data_remaining = 0;
|
|
|
|
queue->ddgst_remaining = 0;
|
|
|
|
queue->pdu_remaining = 0;
|
|
|
|
queue->pdu_offset = 0;
|
|
|
|
sk_set_memalloc(queue->sock->sk);
|
|
|
|
|
2019-08-18 17:08:53 +08:00
|
|
|
if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
|
2018-12-04 09:52:17 +08:00
|
|
|
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
|
|
|
|
sizeof(ctrl->src_addr));
|
|
|
|
if (ret) {
|
2019-08-18 17:08:53 +08:00
|
|
|
dev_err(nctrl->device,
|
2018-12-04 09:52:17 +08:00
|
|
|
"failed to bind queue %d socket %d\n",
|
|
|
|
qid, ret);
|
|
|
|
goto err_sock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
nvme-tcp: allow selecting the network interface for connections
In our application, we need a way to force TCP connections to go out a
specific IP interface instead of letting Linux select the interface
based on the routing tables.
Add the 'host-iface' option to allow specifying the interface to use.
When the option host-iface is specified, the driver uses the specified
interface to set the option SO_BINDTODEVICE on the TCP socket before
connecting.
This new option is needed in addition to the existing host-traddr for
the following reasons:
Specifying an IP interface by its associated IP address is less
intuitive than specifying the actual interface name and, in some cases,
simply doesn't work. That's because the association between interfaces
and IP addresses is not predictable. IP addresses can be changed or can
change by themselves over time (e.g. DHCP). Interface names are
predictable [1] and will persist over time. Consider the following
configuration.
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ...
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 100.0.0.100/24 scope global lo
valid_lft forever preferred_lft forever
2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff
inet 100.0.0.100/24 scope global enp0s3
valid_lft forever preferred_lft forever
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff
inet 100.0.0.100/24 scope global enp0s8
valid_lft forever preferred_lft forever
The above is a VM that I configured with the same IP address
(100.0.0.100) on all interfaces. Doing a reverse lookup to identify the
unique interface associated with 100.0.0.100 does not work here. And
this is why the option host_iface is required. I understand that the
above config does not represent a standard host system, but I'm using
this to prove a point: "We can never know how users will configure
their systems". By te way, The above configuration is perfectly fine
by Linux.
The current TCP implementation for host_traddr performs a
bind()-before-connect(). This is a common construct to set the source
IP address on a TCP socket before connecting. This has no effect on how
Linux selects the interface for the connection. That's because Linux
uses the Weak End System model as described in RFC1122 [2]. On the other
hand, setting the Source IP Address has benefits and should be supported
by linux-nvme. In fact, setting the Source IP Address is a mandatory
FedGov requirement (e.g. connection to a RADIUS/TACACS+ server).
Consider the following configuration.
$ ip addr list dev enp0s8
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff
inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8
valid_lft 426sec preferred_lft 426sec
inet 192.168.56.102/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
inet 192.168.56.103/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
inet 192.168.56.104/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
Here we can see that several addresses are associated with interface
enp0s8. By default, Linux always selects the default IP address,
192.168.56.101, as the source address when connecting over interface
enp0s8. Some users, however, want the ability to specify a different
source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option
host_traddr can be used as-is to perform this function.
In conclusion, I believe that we need 2 options for TCP connections.
One that can be used to specify an interface (host-iface). And one that
can be used to set the source address (host-traddr). Users should be
allowed to use one or the other, or both, or none. Of course, the
documentation for host_traddr will need some clarification. It should
state that when used for TCP connection, this option only sets the
source address. And the documentation for host_iface should say that
this option is only available for TCP connections.
References:
[1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/
[2] https://tools.ietf.org/html/rfc1122
Tested both IPv4 and IPv6 connections.
Signed-off-by: Martin Belanger <martin.belanger@dell.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
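As a rough companion to the commit message above, the sketch below (hypothetical helper names, not driver code) contrasts the two mechanisms it discusses: bind()-before-connect(), which only pins the source address, and SO_BINDTODEVICE, which pins the egress interface:

#include <net/sock.h>
#include <linux/sockptr.h>
#include <linux/string.h>

/* host-traddr style: classic bind()-before-connect(); Linux may still
 * route the connection out of any interface (weak end system model).
 */
static int example_bind_src_addr(struct socket *sock,
		struct sockaddr_storage *src)
{
	return kernel_bind(sock, (struct sockaddr *)src, sizeof(*src));
}

/* host-iface style: SO_BINDTODEVICE forces the named interface. */
static int example_bind_to_iface(struct socket *sock, const char *iface)
{
	sockptr_t optval = KERNEL_SOCKPTR((char *)iface);

	return sock_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
			       optval, strlen(iface));
}

The driver code that follows uses the same sock_setsockopt() form when NVMF_OPT_HOST_IFACE is set, alongside the kernel_bind() path for NVMF_OPT_HOST_TRADDR.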
|
|
|
if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
|
|
|
|
char *iface = nctrl->opts->host_iface;
|
|
|
|
sockptr_t optval = KERNEL_SOCKPTR(iface);
|
|
|
|
|
|
|
|
ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
|
|
|
|
optval, strlen(iface));
|
|
|
|
if (ret) {
|
|
|
|
dev_err(nctrl->device,
|
|
|
|
"failed to bind to interface %s queue %d err %d\n",
|
|
|
|
iface, qid, ret);
|
|
|
|
goto err_sock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
queue->hdr_digest = nctrl->opts->hdr_digest;
|
|
|
|
queue->data_digest = nctrl->opts->data_digest;
|
|
|
|
if (queue->hdr_digest || queue->data_digest) {
|
|
|
|
ret = nvme_tcp_alloc_crypto(queue);
|
|
|
|
if (ret) {
|
2019-08-18 17:08:53 +08:00
|
|
|
dev_err(nctrl->device,
|
2018-12-04 09:52:17 +08:00
|
|
|
"failed to allocate queue %d crypto\n", qid);
|
|
|
|
goto err_sock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
|
|
|
|
nvme_tcp_hdgst_len(queue);
|
|
|
|
queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
|
|
|
|
if (!queue->pdu) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err_crypto;
|
|
|
|
}
|
|
|
|
|
2019-08-18 17:08:53 +08:00
|
|
|
dev_dbg(nctrl->device, "connecting queue %d\n",
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_queue_id(queue));
|
|
|
|
|
|
|
|
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
|
|
|
|
sizeof(ctrl->addr), 0);
|
|
|
|
if (ret) {
|
2019-08-18 17:08:53 +08:00
|
|
|
dev_err(nctrl->device,
|
2018-12-04 09:52:17 +08:00
|
|
|
"failed to connect socket: %d\n", ret);
|
|
|
|
goto err_rcv_pdu;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nvme_tcp_init_connection(queue);
|
|
|
|
if (ret)
|
|
|
|
goto err_init_connect;
|
|
|
|
|
|
|
|
set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_init_connect:
|
|
|
|
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
|
|
|
|
err_rcv_pdu:
|
|
|
|
kfree(queue->pdu);
|
|
|
|
err_crypto:
|
|
|
|
if (queue->hdr_digest || queue->data_digest)
|
|
|
|
nvme_tcp_free_crypto(queue);
|
|
|
|
err_sock:
|
|
|
|
sock_release(queue->sock);
|
|
|
|
queue->sock = NULL;
|
2021-01-14 17:09:26 +08:00
|
|
|
err_destroy_mutex:
|
2021-08-06 23:41:43 +08:00
|
|
|
mutex_destroy(&queue->send_mutex);
|
2021-01-14 17:09:26 +08:00
|
|
|
mutex_destroy(&queue->queue_lock);
|
2018-12-04 09:52:17 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2023-03-20 21:33:34 +08:00
|
|
|
static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
|
2018-12-04 09:52:17 +08:00
|
|
|
{
|
|
|
|
struct socket *sock = queue->sock;
|
|
|
|
|
|
|
|
write_lock_bh(&sock->sk->sk_callback_lock);
|
|
|
|
sock->sk->sk_user_data = NULL;
|
|
|
|
sock->sk->sk_data_ready = queue->data_ready;
|
|
|
|
sock->sk->sk_state_change = queue->state_change;
|
|
|
|
sock->sk->sk_write_space = queue->write_space;
|
|
|
|
write_unlock_bh(&sock->sk->sk_callback_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
|
2023-03-20 21:33:34 +08:00
|
|
|
nvme_tcp_restore_sock_ops(queue);
|
2018-12-04 09:52:17 +08:00
|
|
|
cancel_work_sync(&queue->io_work);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
|
|
|
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
|
|
|
|
|
2022-08-01 16:09:00 +08:00
|
|
|
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
|
|
|
|
return;
|
|
|
|
|
2021-01-14 17:09:26 +08:00
|
|
|
mutex_lock(&queue->queue_lock);
|
|
|
|
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
|
|
|
|
__nvme_tcp_stop_queue(queue);
|
|
|
|
mutex_unlock(&queue->queue_lock);
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
2023-03-20 21:33:34 +08:00
|
|
|
static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
|
|
|
|
{
|
|
|
|
write_lock_bh(&queue->sock->sk->sk_callback_lock);
|
|
|
|
queue->sock->sk->sk_user_data = queue;
|
|
|
|
queue->state_change = queue->sock->sk->sk_state_change;
|
|
|
|
queue->data_ready = queue->sock->sk->sk_data_ready;
|
|
|
|
queue->write_space = queue->sock->sk->sk_write_space;
|
|
|
|
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
|
|
|
|
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
|
|
|
|
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
|
|
|
|
#ifdef CONFIG_NET_RX_BUSY_POLL
|
|
|
|
queue->sock->sk->sk_ll_usec = 1;
|
|
|
|
#endif
|
|
|
|
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
|
|
|
|
}
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
2023-03-20 21:33:34 +08:00
|
|
|
struct nvme_tcp_queue *queue = &ctrl->queues[idx];
|
2018-12-04 09:52:17 +08:00
|
|
|
int ret;
|
|
|
|
|
2023-03-20 21:33:34 +08:00
|
|
|
queue->rd_enabled = true;
|
|
|
|
nvme_tcp_init_recv_ctx(queue);
|
|
|
|
nvme_tcp_setup_sock_ops(queue);
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
if (idx)
|
2021-06-11 05:44:35 +08:00
|
|
|
ret = nvmf_connect_io_queue(nctrl, idx);
|
2018-12-04 09:52:17 +08:00
|
|
|
else
|
|
|
|
ret = nvmf_connect_admin_queue(nctrl);
|
|
|
|
|
|
|
|
if (!ret) {
|
2023-03-20 21:33:34 +08:00
|
|
|
set_bit(NVME_TCP_Q_LIVE, &queue->flags);
|
2018-12-04 09:52:17 +08:00
|
|
|
} else {
|
2023-03-20 21:33:34 +08:00
|
|
|
if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
|
|
|
|
__nvme_tcp_stop_queue(queue);
|
2018-12-04 09:52:17 +08:00
|
|
|
dev_err(nctrl->device,
|
|
|
|
"failed to connect queue: %d ret=%d\n", idx, ret);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (to_tcp_ctrl(ctrl)->async_req.pdu) {
|
2020-09-03 06:42:53 +08:00
|
|
|
cancel_work_sync(&ctrl->async_event_work);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
|
|
|
|
to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
nvme_tcp_free_queue(ctrl, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 1; i < ctrl->queue_count; i++)
|
|
|
|
nvme_tcp_free_queue(ctrl, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 1; i < ctrl->queue_count; i++)
|
|
|
|
nvme_tcp_stop_queue(ctrl, i);
|
|
|
|
}
|
|
|
|
|
2022-08-29 17:28:40 +08:00
|
|
|
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
|
|
|
|
int first, int last)
|
2018-12-04 09:52:17 +08:00
|
|
|
{
|
2022-02-23 11:36:56 +08:00
|
|
|
int i, ret;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2022-08-29 17:28:40 +08:00
|
|
|
for (i = first; i < last; i++) {
|
2018-12-04 09:52:17 +08:00
|
|
|
ret = nvme_tcp_start_queue(ctrl, i);
|
|
|
|
if (ret)
|
|
|
|
goto out_stop_queues;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_stop_queues:
|
2022-08-29 17:28:40 +08:00
|
|
|
for (i--; i >= first; i--)
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_stop_queue(ctrl, i);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2022-09-20 23:23:24 +08:00
|
|
|
ret = nvme_tcp_alloc_queue(ctrl, 0);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
|
|
|
|
if (ret)
|
|
|
|
goto out_free_queue;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_free_queue:
|
|
|
|
nvme_tcp_free_queue(ctrl, 0);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-25 02:53:19 +08:00
|
|
|
static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
|
2018-12-04 09:52:17 +08:00
|
|
|
{
|
|
|
|
int i, ret;
|
|
|
|
|
|
|
|
for (i = 1; i < ctrl->queue_count; i++) {
|
2022-09-20 23:23:24 +08:00
|
|
|
ret = nvme_tcp_alloc_queue(ctrl, i);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_free_queues;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_free_queues:
|
|
|
|
for (i--; i >= 1; i--)
|
|
|
|
nvme_tcp_free_queue(ctrl, i);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2018-12-12 15:38:57 +08:00
|
|
|
unsigned int nr_io_queues;
|
|
|
|
|
|
|
|
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
|
|
|
|
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
|
2019-07-04 05:08:04 +08:00
|
|
|
nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
|
2018-12-12 15:38:57 +08:00
|
|
|
|
|
|
|
return nr_io_queues;
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
2019-05-29 13:49:05 +08:00
|
|
|
static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
|
|
|
|
unsigned int nr_io_queues)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
|
|
|
|
struct nvmf_ctrl_options *opts = nctrl->opts;
|
|
|
|
|
|
|
|
if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
|
|
|
|
/*
|
|
|
|
* separate read/write queues
|
|
|
|
* hand out dedicated default queues only after we have
|
|
|
|
* sufficient read queues.
|
|
|
|
*/
|
|
|
|
ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
|
|
|
|
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
|
|
|
|
ctrl->io_queues[HCTX_TYPE_DEFAULT] =
|
|
|
|
min(opts->nr_write_queues, nr_io_queues);
|
|
|
|
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* shared read/write queues
|
|
|
|
* either no write queues were requested, or we don't have
|
|
|
|
* sufficient queue count to have dedicated default queues.
|
|
|
|
*/
|
|
|
|
ctrl->io_queues[HCTX_TYPE_DEFAULT] =
|
|
|
|
min(opts->nr_io_queues, nr_io_queues);
|
|
|
|
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
|
|
|
|
}
|
2019-07-04 05:08:04 +08:00
|
|
|
|
|
|
|
if (opts->nr_poll_queues && nr_io_queues) {
|
|
|
|
/* map dedicated poll queues only if we have queues left */
|
|
|
|
ctrl->io_queues[HCTX_TYPE_POLL] =
|
|
|
|
min(opts->nr_poll_queues, nr_io_queues);
|
|
|
|
}
|
2019-05-29 13:49:05 +08:00
|
|
|
}
|
|
|
|
|
2019-04-25 02:53:19 +08:00
|
|
|
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
|
2018-12-04 09:52:17 +08:00
|
|
|
{
|
|
|
|
unsigned int nr_io_queues;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
|
|
|
|
ret = nvme_set_queue_count(ctrl, &nr_io_queues);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2021-08-07 11:50:23 +08:00
|
|
|
if (nr_io_queues == 0) {
|
2021-03-16 05:04:26 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"unable to set any I/O queues\n");
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2021-08-07 11:50:23 +08:00
|
|
|
ctrl->queue_count = nr_io_queues + 1;
|
2018-12-04 09:52:17 +08:00
|
|
|
dev_info(ctrl->device,
|
|
|
|
"creating %d I/O queues.\n", nr_io_queues);
|
|
|
|
|
2019-05-29 13:49:05 +08:00
|
|
|
nvme_tcp_set_io_queues(ctrl, nr_io_queues);
|
|
|
|
|
2019-04-25 02:53:19 +08:00
|
|
|
return __nvme_tcp_alloc_io_queues(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
|
|
|
|
{
|
|
|
|
nvme_tcp_stop_io_queues(ctrl);
|
2022-09-20 23:12:47 +08:00
|
|
|
if (remove)
|
|
|
|
nvme_remove_io_tag_set(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_free_io_queues(ctrl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
|
|
|
|
{
|
2022-08-29 17:28:40 +08:00
|
|
|
int ret, nr_queues;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2019-04-25 02:53:19 +08:00
|
|
|
ret = nvme_tcp_alloc_io_queues(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (new) {
|
2022-09-20 23:12:47 +08:00
|
|
|
ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
|
|
|
|
&nvme_tcp_mq_ops,
|
2022-12-01 00:16:52 +08:00
|
|
|
ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
|
2022-09-20 23:12:47 +08:00
|
|
|
sizeof(struct nvme_tcp_request));
|
2022-07-21 14:23:19 +08:00
|
|
|
if (ret)
|
2018-12-04 09:52:17 +08:00
|
|
|
goto out_free_io_queues;
|
|
|
|
}
|
|
|
|
|
2022-08-29 17:28:40 +08:00
|
|
|
/*
|
|
|
|
* Only start IO queues for which we have allocated the tagset
|
|
|
|
* and limited it to the available queues. On reconnects, the
|
|
|
|
* queue number might have changed.
|
|
|
|
*/
|
|
|
|
nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
|
|
|
|
ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_cleanup_connect_q;
|
|
|
|
|
2020-07-25 06:10:12 +08:00
|
|
|
if (!new) {
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_unquiesce_io_queues(ctrl);
|
2020-07-31 04:25:34 +08:00
|
|
|
if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
|
|
|
|
/*
|
|
|
|
* If we timed out waiting for freeze we are likely to
|
|
|
|
* be stuck. Fail the controller initialization just
|
|
|
|
* to be safe.
|
|
|
|
*/
|
|
|
|
ret = -ENODEV;
|
|
|
|
goto out_wait_freeze_timed_out;
|
|
|
|
}
|
2020-07-25 06:10:12 +08:00
|
|
|
blk_mq_update_nr_hw_queues(ctrl->tagset,
|
|
|
|
ctrl->queue_count - 1);
|
|
|
|
nvme_unfreeze(ctrl);
|
|
|
|
}
|
|
|
|
|
2022-08-29 17:28:40 +08:00
|
|
|
/*
|
|
|
|
* If the number of queues has increased (reconnect case)
|
|
|
|
* start all new queues now.
|
|
|
|
*/
|
|
|
|
ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
|
|
|
|
ctrl->tagset->nr_hw_queues + 1);
|
|
|
|
if (ret)
|
|
|
|
goto out_wait_freeze_timed_out;
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
return 0;
|
|
|
|
|
2020-07-31 04:25:34 +08:00
|
|
|
out_wait_freeze_timed_out:
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_io_queues(ctrl);
|
2021-01-21 11:32:38 +08:00
|
|
|
nvme_sync_io_queues(ctrl);
|
2020-07-31 04:25:34 +08:00
|
|
|
nvme_tcp_stop_io_queues(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
out_cleanup_connect_q:
|
2021-01-21 11:32:38 +08:00
|
|
|
nvme_cancel_tagset(ctrl);
|
2019-01-01 15:58:30 +08:00
|
|
|
if (new)
|
2022-09-20 23:12:47 +08:00
|
|
|
nvme_remove_io_tag_set(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
out_free_io_queues:
|
|
|
|
nvme_tcp_free_io_queues(ctrl);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
|
|
|
|
{
|
|
|
|
nvme_tcp_stop_queue(ctrl, 0);
|
2022-09-20 23:12:47 +08:00
|
|
|
if (remove)
|
|
|
|
nvme_remove_admin_tag_set(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_free_admin_queue(ctrl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = nvme_tcp_alloc_admin_queue(ctrl);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (new) {
|
2022-09-20 23:12:47 +08:00
|
|
|
error = nvme_alloc_admin_tag_set(ctrl,
|
|
|
|
&to_tcp_ctrl(ctrl)->admin_tag_set,
|
2022-12-01 00:19:50 +08:00
|
|
|
&nvme_tcp_admin_mq_ops,
|
2022-09-20 23:12:47 +08:00
|
|
|
sizeof(struct nvme_tcp_request));
|
2022-07-21 14:23:19 +08:00
|
|
|
if (error)
|
2018-12-04 09:52:17 +08:00
|
|
|
goto out_free_queue;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = nvme_tcp_start_queue(ctrl, 0);
|
|
|
|
if (error)
|
2022-09-20 23:12:47 +08:00
|
|
|
goto out_cleanup_tagset;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
2019-07-23 08:06:53 +08:00
|
|
|
error = nvme_enable_ctrl(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (error)
|
|
|
|
goto out_stop_queue;
|
|
|
|
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_unquiesce_admin_queue(ctrl);
|
2019-08-03 10:33:59 +08:00
|
|
|
|
2022-11-08 22:48:27 +08:00
|
|
|
error = nvme_init_ctrl_finish(ctrl, false);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (error)
|
2021-01-21 11:32:38 +08:00
|
|
|
goto out_quiesce_queue;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2021-01-21 11:32:38 +08:00
|
|
|
out_quiesce_queue:
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_admin_queue(ctrl);
|
2021-01-21 11:32:38 +08:00
|
|
|
blk_sync_queue(ctrl->admin_q);
|
2018-12-04 09:52:17 +08:00
|
|
|
out_stop_queue:
|
|
|
|
nvme_tcp_stop_queue(ctrl, 0);
|
2021-01-21 11:32:38 +08:00
|
|
|
nvme_cancel_admin_tagset(ctrl);
|
2022-09-20 23:12:47 +08:00
|
|
|
out_cleanup_tagset:
|
2019-08-03 10:33:59 +08:00
|
|
|
if (new)
|
2022-09-20 23:12:47 +08:00
|
|
|
nvme_remove_admin_tag_set(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
out_free_queue:
|
|
|
|
nvme_tcp_free_admin_queue(ctrl);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
|
|
|
|
bool remove)
|
|
|
|
{
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_admin_queue(ctrl);
|
2020-10-22 10:15:15 +08:00
|
|
|
blk_sync_queue(ctrl->admin_q);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_stop_queue(ctrl, 0);
|
2021-01-21 11:32:40 +08:00
|
|
|
nvme_cancel_admin_tagset(ctrl);
|
2019-08-03 10:33:59 +08:00
|
|
|
if (remove)
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_unquiesce_admin_queue(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_destroy_admin_queue(ctrl, remove);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
|
|
|
|
bool remove)
|
|
|
|
{
|
|
|
|
if (ctrl->queue_count <= 1)
|
2020-10-22 10:15:15 +08:00
|
|
|
return;
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_admin_queue(ctrl);
|
2020-07-25 06:10:12 +08:00
|
|
|
nvme_start_freeze(ctrl);
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_io_queues(ctrl);
|
2020-10-22 10:15:15 +08:00
|
|
|
nvme_sync_io_queues(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_stop_io_queues(ctrl);
|
2021-01-21 11:32:40 +08:00
|
|
|
nvme_cancel_tagset(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
if (remove)
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_unquiesce_io_queues(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_destroy_io_queues(ctrl, remove);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
/* If we are resetting/deleting then do nothing */
|
|
|
|
if (ctrl->state != NVME_CTRL_CONNECTING) {
|
|
|
|
WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
|
|
|
|
ctrl->state == NVME_CTRL_LIVE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvmf_should_reconnect(ctrl)) {
|
|
|
|
dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
|
|
|
|
ctrl->opts->reconnect_delay);
|
|
|
|
queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
|
|
|
|
ctrl->opts->reconnect_delay * HZ);
|
|
|
|
} else {
|
|
|
|
dev_info(ctrl->device, "Removing controller...\n");
|
|
|
|
nvme_delete_ctrl(ctrl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
|
|
|
|
{
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
2019-09-05 22:34:35 +08:00
|
|
|
int ret;
|
2018-12-04 09:52:17 +08:00
|
|
|
|
|
|
|
ret = nvme_tcp_configure_admin_queue(ctrl, new);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (ctrl->icdoff) {
|
2021-06-05 20:48:16 +08:00
|
|
|
ret = -EOPNOTSUPP;
|
2018-12-04 09:52:17 +08:00
|
|
|
dev_err(ctrl->device, "icdoff is not supported!\n");
|
|
|
|
goto destroy_admin;
|
|
|
|
}
|
|
|
|
|
2021-06-10 09:28:26 +08:00
|
|
|
if (!nvme_ctrl_sgl_supported(ctrl)) {
|
2021-06-05 20:48:16 +08:00
|
|
|
ret = -EOPNOTSUPP;
|
2021-03-31 07:01:19 +08:00
|
|
|
dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
|
|
|
|
goto destroy_admin;
|
|
|
|
}
|
|
|
|
|
2018-12-04 09:52:17 +08:00
|
|
|
if (opts->queue_size > ctrl->sqsize + 1)
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"queue_size %zu > ctrl sqsize %u, clamping down\n",
|
|
|
|
opts->queue_size, ctrl->sqsize + 1);
|
|
|
|
|
|
|
|
if (ctrl->sqsize + 1 > ctrl->maxcmd) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"sqsize %u > ctrl maxcmd %u, clamping down\n",
|
|
|
|
ctrl->sqsize + 1, ctrl->maxcmd);
|
|
|
|
ctrl->sqsize = ctrl->maxcmd - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctrl->queue_count > 1) {
|
|
|
|
ret = nvme_tcp_configure_io_queues(ctrl, new);
|
|
|
|
if (ret)
|
|
|
|
goto destroy_admin;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
|
2020-03-24 23:29:45 +08:00
|
|
|
/*
|
2020-07-23 07:32:19 +08:00
|
|
|
* state change failure is ok if we started ctrl delete,
|
2020-03-24 23:29:45 +08:00
|
|
|
* unless we're during creation of a new controller to
|
|
|
|
* avoid races with teardown flow.
|
|
|
|
*/
|
2020-07-23 07:32:19 +08:00
|
|
|
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
|
|
|
|
ctrl->state != NVME_CTRL_DELETING_NOIO);
|
2020-03-24 23:29:45 +08:00
|
|
|
WARN_ON_ONCE(new);
|
2018-12-04 09:52:17 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto destroy_io;
|
|
|
|
}
|
|
|
|
|
|
|
|
nvme_start_ctrl(ctrl);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
destroy_io:
|
2021-01-21 11:32:38 +08:00
|
|
|
if (ctrl->queue_count > 1) {
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_io_queues(ctrl);
|
2021-01-21 11:32:38 +08:00
|
|
|
nvme_sync_io_queues(ctrl);
|
|
|
|
nvme_tcp_stop_io_queues(ctrl);
|
|
|
|
nvme_cancel_tagset(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_destroy_io_queues(ctrl, new);
|
2021-01-21 11:32:38 +08:00
|
|
|
}
|
2018-12-04 09:52:17 +08:00
|
|
|
destroy_admin:
|
2022-11-15 18:22:14 +08:00
|
|
|
nvme_quiesce_admin_queue(ctrl);
|
2021-01-21 11:32:38 +08:00
|
|
|
blk_sync_queue(ctrl->admin_q);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_stop_queue(ctrl, 0);
|
2021-01-21 11:32:38 +08:00
|
|
|
nvme_cancel_admin_tagset(ctrl);
|
2018-12-04 09:52:17 +08:00
|
|
|
nvme_tcp_destroy_admin_queue(ctrl, new);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
|
|
|
|
struct nvme_tcp_ctrl, connect_work);
|
|
|
|
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
|
|
|
|
|
|
|
|
++ctrl->nr_reconnects;
|
|
|
|
|
|
|
|
if (nvme_tcp_setup_ctrl(ctrl, false))
|
|
|
|
goto requeue;
|
|
|
|
|
2018-12-14 19:42:43 +08:00
|
|
|
dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
|
2018-12-04 09:52:17 +08:00
|
|
|
ctrl->nr_reconnects);
|
|
|
|
|
|
|
|
ctrl->nr_reconnects = 0;
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
requeue:
|
|
|
|
dev_info(ctrl->device, "Failed reconnect attempt %d\n",
|
|
|
|
ctrl->nr_reconnects);
|
|
|
|
nvme_tcp_reconnect_or_remove(ctrl);
|
|
|
|
}
|
|
|
|
|
|
|
|

static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
				struct nvme_tcp_ctrl, err_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	nvme_stop_keep_alive(ctrl);
	flush_work(&ctrl->async_event_work);
	nvme_tcp_teardown_io_queues(ctrl, false);
	/* unquiesce to fail fast pending requests */
	nvme_unquiesce_io_queues(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, false);
	nvme_unquiesce_admin_queue(ctrl);
	nvme_auth_stop(ctrl);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	nvme_tcp_teardown_io_queues(ctrl, shutdown);
	nvme_quiesce_admin_queue(ctrl);
	nvme_disable_ctrl(ctrl, shutdown);
	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_tcp_teardown_ctrl(ctrl, true);
}

static void nvme_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, reset_work);

	nvme_stop_ctrl(ctrl);
	nvme_tcp_teardown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto out_fail;

	return;

out_fail:
	++ctrl->nr_reconnects;
	nvme_tcp_reconnect_or_remove(ctrl);
}

/*
 * Flush err_work instead of cancelling it: if error recovery was scheduled
 * it must actually run and cancel any inflight I/O (for example I/O issued
 * by scan_work). Cancelling it here could leave that I/O pending forever
 * and deadlock controller deletion while it flushes scan_work.
 */
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
	flush_work(&to_tcp_ctrl(ctrl)->err_work);
	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}

static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);

	if (list_empty(&ctrl->list))
		goto free_ctrl;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_del(&ctrl->list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	nvmf_free_options(nctrl->opts);
free_ctrl:
	kfree(ctrl->queues);
	kfree(ctrl);
}
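
/*
 * The helpers below fill in the command's SGL descriptor for the three
 * NVMe/TCP data layouts: no data at all, in-capsule (inline) data that
 * follows the command PDU, and data that is transferred separately on
 * behalf of the host.
 */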

static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = 0;
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}

static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
		struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
		u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}
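
/*
 * The AER command uses the admin queue's reserved tag
 * (NVME_AQ_BLK_MQ_DEPTH) as its command id and carries no data, hence
 * the NULL SGL below.
 */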

static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
	struct nvme_tcp_queue *queue = &ctrl->queues[0];
	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
	struct nvme_command *cmd = &pdu->cmd;
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	memset(pdu, 0, sizeof(*pdu));
	pdu->hdr.type = nvme_tcp_cmd;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	cmd->common.opcode = nvme_admin_async_event;
	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	cmd->common.flags |= NVME_CMD_SGL_METABUF;
	nvme_tcp_set_sg_null(cmd);

	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
	ctrl->async_req.offset = 0;
	ctrl->async_req.curr_bio = NULL;
	ctrl->async_req.data_len = 0;

	nvme_tcp_queue_request(&ctrl->async_req, true, true);
}

static void nvme_tcp_complete_timed_out(struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;

	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
	nvmf_complete_timed_out_request(rq);
}

static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
	int qid = nvme_tcp_queue_id(req->queue);

	dev_warn(ctrl->device,
		"queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
		nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
		opc, nvme_opcode_str(qid, opc, fctype));

	if (ctrl->state != NVME_CTRL_LIVE) {
		/*
		 * If we are resetting, connecting or deleting we should
		 * complete immediately because we may block controller
		 * teardown or setup sequence
		 * - ctrl disable/shutdown fabrics requests
		 * - connect requests
		 * - initialization admin requests
		 * - I/O requests that entered after unquiescing and
		 *   the controller stopped responding
		 *
		 * All other requests should be cancelled by the error
		 * recovery work, so it's fine that we fail it here.
		 */
		nvme_tcp_complete_timed_out(rq);
		return BLK_EH_DONE;
	}

	/*
	 * LIVE state should trigger the normal error recovery which will
	 * handle completing this request.
	 */
	nvme_tcp_error_recovery(ctrl);
	return BLK_EH_RESET_TIMER;
}

static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
			struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	struct nvme_command *c = &pdu->cmd;

	c->common.flags |= NVME_CMD_SGL_METABUF;

	if (!blk_rq_nr_phys_segments(rq))
		nvme_tcp_set_sg_null(c);
	else if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(req))
		nvme_tcp_set_sg_inline(queue, c, req->data_len);
	else
		nvme_tcp_set_sg_host_data(c, req->data_len);

	return 0;
}
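
/*
 * Worked example of the PDU sizing done below (illustrative values,
 * assuming 4-byte CRC32C header and data digests are both enabled and a
 * 4096-byte write that fits in-capsule): hlen = 72 (command PDU),
 * pdo = 72 + 4 = 76, plen = 72 + 4 + 4096 + 4 = 4176.
 */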

static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	struct nvme_tcp_queue *queue = req->queue;
	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
	blk_status_t ret;

	ret = nvme_setup_cmd(ns, rq);
	if (ret)
		return ret;

	req->state = NVME_TCP_SEND_CMD_PDU;
	req->status = cpu_to_le16(NVME_SC_SUCCESS);
	req->offset = 0;
	req->data_sent = 0;
	req->pdu_len = 0;
	req->pdu_sent = 0;
	req->h2cdata_left = 0;
	req->data_len = blk_rq_nr_phys_segments(rq) ?
				blk_rq_payload_bytes(rq) : 0;
	req->curr_bio = rq->bio;
	if (req->curr_bio && req->data_len)
		nvme_tcp_init_iter(req, rq_data_dir(rq));

	if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(req))
		req->pdu_len = req->data_len;

	pdu->hdr.type = nvme_tcp_cmd;
	pdu->hdr.flags = 0;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest && req->pdu_len) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		ddgst = nvme_tcp_ddgst_len(queue);
	}
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

	ret = nvme_tcp_map_data(queue, rq);
	if (unlikely(ret)) {
		nvme_cleanup_cmd(rq);
		dev_err(queue->ctrl->ctrl.device,
			"Failed to map data (%d)\n", ret);
		return ret;
	}

	return 0;
}

static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;

	if (!llist_empty(&queue->req_list))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
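
/*
 * queue_rq only prepares the command PDU and hands it to
 * nvme_tcp_queue_request(); the actual socket sends happen from the
 * queue's io_work (or directly when the send path is uncontended).
 * bd->last indicates whether more requests follow in this batch, in
 * which case commit_rqs() above kicks io_work once at the end instead
 * of once per request.
 */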

static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct request *rq = bd->rq;
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
	blk_status_t ret;

	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);

	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
	if (unlikely(ret))
		return ret;

	nvme_start_request(rq);

	nvme_tcp_queue_request(req, true, bd->last);

	return BLK_STS_OK;
}
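
/*
 * Illustrative mapping (hypothetical split, assuming io_queues[] was set
 * up as 4 default, 4 read and 2 poll queues): HCTX_TYPE_DEFAULT maps to
 * queues 0-3, HCTX_TYPE_READ to queues 4-7 (offset 4) and HCTX_TYPE_POLL
 * to queues 8-9 (offset 8).
 */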

static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;

	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
		/* separate read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
		set->map[HCTX_TYPE_READ].nr_queues =
			ctrl->io_queues[HCTX_TYPE_READ];
		set->map[HCTX_TYPE_READ].queue_offset =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
	} else {
		/* shared read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
		set->map[HCTX_TYPE_READ].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_READ].queue_offset = 0;
	}
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);

	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
		/* map dedicated poll queues only if we have queues left */
		set->map[HCTX_TYPE_POLL].nr_queues =
				ctrl->io_queues[HCTX_TYPE_POLL];
		set->map[HCTX_TYPE_POLL].queue_offset =
			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
			ctrl->io_queues[HCTX_TYPE_READ];
		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
	}

	dev_info(ctrl->ctrl.device,
		"mapped %d/%d/%d default/read/poll queues.\n",
		ctrl->io_queues[HCTX_TYPE_DEFAULT],
		ctrl->io_queues[HCTX_TYPE_READ],
		ctrl->io_queues[HCTX_TYPE_POLL]);
}
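
/*
 * Polled completions: busy-poll the socket while its receive queue is
 * empty, then reap whatever has arrived. Returns the number of
 * completions seen so the block layer knows whether to keep polling.
 */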

static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct sock *sk = queue->sock->sk;

	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
		return 0;

	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
		sk_busy_loop(sk, true);
	nvme_tcp_try_recv(queue);
	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
	return queue->nr_cqe;
}
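
/*
 * Illustrative output of the resulting sysfs "address" attribute
 * (hypothetical addresses):
 *   traddr=192.168.0.7,trsvcid=4420,src_addr=192.168.0.5
 */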

static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
	struct sockaddr_storage src_addr;
	int ret, len;

	len = nvmf_get_address(ctrl, buf, size);

	mutex_lock(&queue->queue_lock);

	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
		goto done;
	ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
	if (ret > 0) {
		if (len > 0)
			len--; /* strip trailing newline */
		len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
				(len) ? "," : "", &src_addr);
	}
done:
	mutex_unlock(&queue->queue_lock);

	return len;
}

static const struct blk_mq_ops nvme_tcp_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.commit_rqs	= nvme_tcp_commit_rqs,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_hctx,
	.timeout	= nvme_tcp_timeout,
	.map_queues	= nvme_tcp_map_queues,
	.poll		= nvme_tcp_poll,
};

static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_admin_hctx,
	.timeout	= nvme_tcp_timeout,
};

static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
	.name			= "tcp",
	.module			= THIS_MODULE,
	.flags			= NVME_F_FABRICS | NVME_F_BLOCKING,
	.reg_read32		= nvmf_reg_read32,
	.reg_read64		= nvmf_reg_read64,
	.reg_write32		= nvmf_reg_write32,
	.free_ctrl		= nvme_tcp_free_ctrl,
	.submit_async_event	= nvme_tcp_submit_async_event,
	.delete_ctrl		= nvme_tcp_delete_ctrl,
	.get_address		= nvme_tcp_get_address,
	.stop_ctrl		= nvme_tcp_stop_ctrl,
};

static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	bool found = false;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return found;
}

static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	int ret;

	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&ctrl->list);
	ctrl->ctrl.opts = opts;
	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
				opts->nr_poll_queues + 1;
	ctrl->ctrl.sqsize = opts->queue_size - 1;
	ctrl->ctrl.kato = opts->kato;

	INIT_DELAYED_WORK(&ctrl->connect_work,
			nvme_tcp_reconnect_ctrl_work);
	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
		opts->trsvcid =
			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
		if (!opts->trsvcid) {
			ret = -ENOMEM;
			goto out_free_ctrl;
		}
		opts->mask |= NVMF_OPT_TRSVCID;
	}

	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->traddr, opts->trsvcid, &ctrl->addr);
	if (ret) {
		pr_err("malformed address passed: %s:%s\n",
			opts->traddr, opts->trsvcid);
		goto out_free_ctrl;
	}

	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->host_traddr, NULL, &ctrl->src_addr);
		if (ret) {
			pr_err("malformed src address passed: %s\n",
			       opts->host_traddr);
			goto out_free_ctrl;
		}
	}

	/*
	 * host_traddr only sets the source address (bind before connect);
	 * because Linux follows the weak end system model it does not pin
	 * the connection to an interface. host_iface does: the named
	 * interface is bound with SO_BINDTODEVICE before connecting, so
	 * validate that it actually exists.
	 */
	if (opts->mask & NVMF_OPT_HOST_IFACE) {
		if (!__dev_get_by_name(&init_net, opts->host_iface)) {
			pr_err("invalid interface passed: %s\n",
			       opts->host_iface);
			ret = -ENODEV;
			goto out_free_ctrl;
		}
	}

	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
		ret = -EALREADY;
		goto out_free_ctrl;
	}

	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
				GFP_KERNEL);
	if (!ctrl->queues) {
		ret = -ENOMEM;
		goto out_free_ctrl;
	}

	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
	if (ret)
		goto out_kfree_queues;

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		WARN_ON_ONCE(1);
		ret = -EINTR;
		goto out_uninit_ctrl;
	}

	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
	if (ret)
		goto out_uninit_ctrl;

	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
		nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return &ctrl->ctrl;

out_uninit_ctrl:
	nvme_uninit_ctrl(&ctrl->ctrl);
	nvme_put_ctrl(&ctrl->ctrl);
	if (ret > 0)
		ret = -EIO;
	return ERR_PTR(ret);
out_kfree_queues:
	kfree(ctrl->queues);
out_free_ctrl:
	kfree(ctrl);
	return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_tcp_transport = {
	.name		= "tcp",
	.module		= THIS_MODULE,
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
	.create_ctrl	= nvme_tcp_create_ctrl,
};
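
/*
 * Illustrative usage from user space (hypothetical addresses; exact flag
 * spelling depends on the nvme-cli version):
 *   nvme connect -t tcp -a 192.168.0.7 -s 4420 \
 *        -n nqn.2014-08.org.example:subsys1 --host-iface=eth0
 */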

static int __init nvme_tcp_init_module(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);

	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!nvme_tcp_wq)
		return -ENOMEM;

	nvmf_register_transport(&nvme_tcp_transport);
	return 0;
}

static void __exit nvme_tcp_cleanup_module(void)
{
	struct nvme_tcp_ctrl *ctrl;

	nvmf_unregister_transport(&nvme_tcp_transport);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
		nvme_delete_ctrl(&ctrl->ctrl);
	mutex_unlock(&nvme_tcp_ctrl_mutex);
	flush_workqueue(nvme_delete_wq);

	destroy_workqueue(nvme_tcp_wq);
}

module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);

MODULE_LICENSE("GPL v2");