OpenCloudOS-Kernel/drivers/nvme/host/tcp.c

2743 lines
69 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics TCP host.
* Copyright (c) 2018 Lightbits Labs. All rights reserved.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>
#include "nvme.h"
#include "fabrics.h"
struct nvme_tcp_queue;
/* Define the socket priority to use for connections were it is desirable
* that the NIC consider performing optimized packet processing or filtering.
* A non-zero value being sufficient to indicate general consideration of any
* possible optimization. Making it a module param allows for alternative
* values that may be unique for some NIC implementations.
*/
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
nvme-tcp: lockdep: annotate in-kernel sockets Put NVMe/TCP sockets in their own class to avoid some lockdep warnings. Sockets created by nvme-tcp are not exposed to user-space, and will not trigger certain code paths that the general socket API exposes. Lockdep complains about a circular dependency between the socket and filesystem locks, because setsockopt can trigger a page fault with a socket lock held, but nvme-tcp sends requests on the socket while file system locks are held. ====================================================== WARNING: possible circular locking dependency detected 5.15.0-rc3 #1 Not tainted ------------------------------------------------------ fio/1496 is trying to acquire lock: (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendpage+0x23/0x80 but task is already holding lock: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] which lock already depends on the new lock. other info that might help us debug this: chain exists of: sk_lock-AF_INET --> sb_internal --> &xfs_dir_ilock_class/5 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&xfs_dir_ilock_class/5); lock(sb_internal); lock(&xfs_dir_ilock_class/5); lock(sk_lock-AF_INET); *** DEADLOCK *** 6 locks held by fio/1496: #0: (sb_writers#13){.+.+}-{0:0}, at: path_openat+0x9fc/0xa20 #1: (&inode->i_sb->s_type->i_mutex_dir_key){++++}-{3:3}, at: path_openat+0x296/0xa20 #2: (sb_internal){.+.+}-{0:0}, at: xfs_trans_alloc_icreate+0x41/0xd0 [xfs] #3: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] #4: (hctx->srcu){....}-{0:0}, at: hctx_lock+0x51/0xd0 #5: (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0x33e/0x380 [nvme_tcp] This annotation lets lockdep analyze nvme-tcp controlled sockets independently of what the user-space sockets API does. Link: https://lore.kernel.org/linux-nvme/CAHj4cs9MDYLJ+q+2_GXUK9HxFizv2pxUryUR0toX974M040z7g@mail.gmail.com/ Signed-off-by: Chris Leech <cleech@redhat.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-02-16 10:22:49 +08:00
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
* because dependencies are tracked for both nvme-tcp and user contexts. Using
* a separate class prevents lockdep from conflating nvme-tcp socket use with
* user-space socket API use.
*/
static struct lock_class_key nvme_tcp_sk_key[2];
static struct lock_class_key nvme_tcp_slock_key[2];
static void nvme_tcp_reclassify_socket(struct socket *sock)
{
struct sock *sk = sock->sk;
if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
return;
switch (sk->sk_family) {
case AF_INET:
sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
&nvme_tcp_slock_key[0],
"sk_lock-AF_INET-NVME",
&nvme_tcp_sk_key[0]);
break;
case AF_INET6:
sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
&nvme_tcp_slock_key[1],
"sk_lock-AF_INET6-NVME",
&nvme_tcp_sk_key[1]);
break;
default:
WARN_ON_ONCE(1);
}
}
#else
static void nvme_tcp_reclassify_socket(struct socket *sock) { }
#endif
enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU,
NVME_TCP_SEND_DATA,
NVME_TCP_SEND_DDGST,
};
struct nvme_tcp_request {
struct nvme_request req;
void *pdu;
struct nvme_tcp_queue *queue;
u32 data_len;
u32 pdu_len;
u32 pdu_sent;
u32 h2cdata_left;
u32 h2cdata_offset;
u16 ttag;
__le16 status;
struct list_head entry;
struct llist_node lentry;
__le32 ddgst;
struct bio *curr_bio;
struct iov_iter iter;
/* send state */
size_t offset;
size_t data_sent;
enum nvme_tcp_send_state state;
};
enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
NVME_TCP_Q_POLLING = 2,
};
enum nvme_tcp_recv_state {
NVME_TCP_RECV_PDU = 0,
NVME_TCP_RECV_DATA,
NVME_TCP_RECV_DDGST,
};
struct nvme_tcp_ctrl;
struct nvme_tcp_queue {
struct socket *sock;
struct work_struct io_work;
int io_cpu;
struct mutex queue_lock;
struct mutex send_mutex;
struct llist_head req_list;
struct list_head send_list;
/* recv state */
void *pdu;
int pdu_remaining;
int pdu_offset;
size_t data_remaining;
size_t ddgst_remaining;
unsigned int nr_cqe;
/* send state */
struct nvme_tcp_request *request;
u32 maxh2cdata;
size_t cmnd_capsule_len;
struct nvme_tcp_ctrl *ctrl;
unsigned long flags;
bool rd_enabled;
bool hdr_digest;
bool data_digest;
struct ahash_request *rcv_hash;
struct ahash_request *snd_hash;
__le32 exp_ddgst;
__le32 recv_ddgst;
struct page_frag_cache pf_cache;
void (*state_change)(struct sock *);
void (*data_ready)(struct sock *);
void (*write_space)(struct sock *);
};
struct nvme_tcp_ctrl {
/* read only in the hot path */
struct nvme_tcp_queue *queues;
struct blk_mq_tag_set tag_set;
/* other member variables */
struct list_head list;
struct blk_mq_tag_set admin_tag_set;
struct sockaddr_storage addr;
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
u32 io_queues[HCTX_MAX_TYPES];
};
static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
static const struct blk_mq_ops nvme_tcp_mq_ops;
static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
}
static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
{
return queue - queue->ctrl->queues;
}
static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
{
u32 queue_idx = nvme_tcp_queue_id(queue);
if (queue_idx == 0)
return queue->ctrl->admin_tag_set.tags[queue_idx];
return queue->ctrl->tag_set.tags[queue_idx - 1];
}
static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
{
return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
{
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
{
return req->pdu;
}
static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
{
/* use the pdu space in the back for the data pdu */
return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
sizeof(struct nvme_tcp_data_pdu);
}
static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
{
if (nvme_is_fabrics(req->req.cmd))
return NVME_TCP_ADMIN_CCSZ;
return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
}
static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
{
return req == &req->queue->ctrl->async_req;
}
static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
struct request *rq;
if (unlikely(nvme_tcp_async_req(req)))
return false; /* async events don't have a request */
rq = blk_mq_rq_from_pdu(req);
return rq_data_dir(rq) == WRITE && req->data_len &&
req->data_len <= nvme_tcp_inline_data_size(req);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
{
return req->iter.bvec->bv_page;
}
static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
{
return req->iter.bvec->bv_offset + req->iter.iov_offset;
}
static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
{
return min_t(size_t, iov_iter_single_seg_count(&req->iter),
req->pdu_len - req->pdu_sent);
}
static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
{
return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
req->pdu_len - req->pdu_sent : 0;
}
static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
int len)
{
return nvme_tcp_pdu_data_left(req) <= len;
}
static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
unsigned int dir)
{
struct request *rq = blk_mq_rq_from_pdu(req);
struct bio_vec *vec;
unsigned int size;
int nr_bvec;
size_t offset;
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
vec = &rq->special_vec;
nr_bvec = 1;
size = blk_rq_payload_bytes(rq);
offset = 0;
} else {
struct bio *bio = req->curr_bio;
struct bvec_iter bi;
struct bio_vec bv;
vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
nr_bvec = 0;
bio_for_each_bvec(bv, bio, bi) {
nr_bvec++;
}
size = bio->bi_iter.bi_size;
offset = bio->bi_iter.bi_bvec_done;
}
iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
req->iter.iov_offset = offset;
}
static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
int len)
{
req->data_sent += len;
req->pdu_sent += len;
iov_iter_advance(&req->iter, len);
if (!iov_iter_count(&req->iter) &&
req->data_sent < req->data_len) {
req->curr_bio = req->curr_bio->bi_next;
nvme_tcp_init_iter(req, ITER_SOURCE);
}
}
nvme-tcp: Fix possible race of io_work and direct send We may send a request (with or without its data) from two paths: 1. From our I/O context nvme_tcp_io_work which is triggered from: - queue_rq - r2t reception - socket data_ready and write_space callbacks 2. Directly from queue_rq if the send_list is empty (because we want to save the context switch associated with scheduling our io_work). However, given that now we have the send_mutex, we may run into a race condition where none of these contexts will send the pending payload to the controller. Both io_work send path and queue_rq send path opportunistically attempt to acquire the send_mutex however queue_rq only attempts to send a single request, and if io_work context fails to acquire the send_mutex it will complete without rescheduling itself. The race can trigger with the following sequence: 1. queue_rq sends request (no incapsule data) and blocks 2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU to the send_list and schedules io_work 3. io_work triggers and cannot acquire the send_mutex - because of (1), ends without self rescheduling 4. queue_rq completes the send, and completes ==> no context will send the h2cdata - timeout. Fix this by having queue_rq sending as much as it can from the send_list such that if it still has any left, its because the socket buffer is full and the socket write_space callback will trigger, thus guaranteeing that a context will be scheduled to send the h2cdata PDU. Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") Reported-by: Potnuri Bharat Teja <bharat@chelsio.com> Reported-by: Samuel Jones <sjones@kalrayinc.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Potnuri Bharat Teja <bharat@chelsio.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-12-21 16:03:39 +08:00
static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
{
int ret;
/* drain the send queue as much as we can... */
do {
ret = nvme_tcp_try_send(queue);
} while (ret > 0);
}
static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
return !list_empty(&queue->send_list) ||
nvme-tcp: fix regression that causes sporadic requests to time out When we queue requests, we strive to batch as much as possible and also signal the network stack that more data is about to be sent over a socket with MSG_SENDPAGE_NOTLAST. This flag looks at the pending requests queued as well as queue->more_requests that is derived from the block layer last-in-batch indication. We set more_request=true when we flush the request directly from .queue_rq submission context (in nvme_tcp_send_all), however this is wrongly assuming that no other requests may be queued during the execution of nvme_tcp_send_all. Due to this, a race condition may happen where: 1. request X is queued as !last-in-batch 2. request X submission context calls nvme_tcp_send_all directly 3. nvme_tcp_send_all is preempted and schedules to a different cpu 4. request Y is queued as last-in-batch 5. nvme_tcp_send_all context sends request X+Y, however signals for both MSG_SENDPAGE_NOTLAST because queue->more_requests=true. ==> none of the requests is pushed down to the wire as the network stack is waiting for more data, both requests timeout. To fix this, we eliminate queue->more_requests and only rely on the queue req_list and send_list to be not-empty. Fixes: 122e5b9f3d37 ("nvme-tcp: optimize network stack with setting msg flags according to batch size") Reported-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-09-05 23:07:06 +08:00
!llist_empty(&queue->req_list);
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
empty = llist_add(&req->lentry, &queue->req_list) &&
list_empty(&queue->send_list) && !queue->request;
/*
* if we're the first on the send_list and we can try to send
* directly, otherwise queue io_work. Also, only do that if we
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
nvme-tcp: Fix possible race of io_work and direct send We may send a request (with or without its data) from two paths: 1. From our I/O context nvme_tcp_io_work which is triggered from: - queue_rq - r2t reception - socket data_ready and write_space callbacks 2. Directly from queue_rq if the send_list is empty (because we want to save the context switch associated with scheduling our io_work). However, given that now we have the send_mutex, we may run into a race condition where none of these contexts will send the pending payload to the controller. Both io_work send path and queue_rq send path opportunistically attempt to acquire the send_mutex however queue_rq only attempts to send a single request, and if io_work context fails to acquire the send_mutex it will complete without rescheduling itself. The race can trigger with the following sequence: 1. queue_rq sends request (no incapsule data) and blocks 2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU to the send_list and schedules io_work 3. io_work triggers and cannot acquire the send_mutex - because of (1), ends without self rescheduling 4. queue_rq completes the send, and completes ==> no context will send the h2cdata - timeout. Fix this by having queue_rq sending as much as it can from the send_list such that if it still has any left, its because the socket buffer is full and the socket write_space callback will trigger, thus guaranteeing that a context will be scheduled to send the h2cdata PDU. Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") Reported-by: Potnuri Bharat Teja <bharat@chelsio.com> Reported-by: Samuel Jones <sjones@kalrayinc.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Potnuri Bharat Teja <bharat@chelsio.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-12-21 16:03:39 +08:00
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
if (last && nvme_tcp_queue_more(queue))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
struct llist_node *node;
for (node = llist_del_all(&queue->req_list); node; node = node->next) {
req = llist_entry(node, struct nvme_tcp_request, lentry);
list_add(&req->entry, &queue->send_list);
}
}
static inline struct nvme_tcp_request *
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
if (!req) {
nvme_tcp_process_req_list(queue);
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
if (unlikely(!req))
return NULL;
}
list_del(&req->entry);
return req;
}
static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
__le32 *dgst)
{
ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
crypto_ahash_final(hash);
}
static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
struct page *page, off_t off, size_t len)
{
struct scatterlist sg;
sg_init_table(&sg, 1);
sg_set_page(&sg, page, len, off);
ahash_request_set_crypt(hash, &sg, NULL, len);
crypto_ahash_update(hash);
}
static inline void nvme_tcp_hdgst(struct ahash_request *hash,
void *pdu, size_t len)
{
struct scatterlist sg;
sg_init_one(&sg, pdu, len);
ahash_request_set_crypt(hash, &sg, pdu + len, len);
crypto_ahash_digest(hash);
}
static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
void *pdu, size_t pdu_len)
{
struct nvme_tcp_hdr *hdr = pdu;
__le32 recv_digest;
__le32 exp_digest;
if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
dev_err(queue->ctrl->ctrl.device,
"queue %d: header digest flag is cleared\n",
nvme_tcp_queue_id(queue));
return -EPROTO;
}
recv_digest = *(__le32 *)(pdu + hdr->hlen);
nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
exp_digest = *(__le32 *)(pdu + hdr->hlen);
if (recv_digest != exp_digest) {
dev_err(queue->ctrl->ctrl.device,
"header digest error: recv %#x expected %#x\n",
le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
return -EIO;
}
return 0;
}
static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
{
struct nvme_tcp_hdr *hdr = pdu;
u8 digest_len = nvme_tcp_hdgst_len(queue);
u32 len;
len = le32_to_cpu(hdr->plen) - hdr->hlen -
((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
dev_err(queue->ctrl->ctrl.device,
"queue %d: data digest flag is cleared\n",
nvme_tcp_queue_id(queue));
return -EPROTO;
}
crypto_ahash_init(queue->rcv_hash);
return 0;
}
static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
page_frag_free(req->pdu);
}
static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx,
unsigned int numa_node)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_tcp_cmd_pdu *pdu;
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
u8 hdgst = nvme_tcp_hdgst_len(queue);
req->pdu = page_frag_alloc(&queue->pf_cache,
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!req->pdu)
return -ENOMEM;
pdu = req->pdu;
req->queue = queue;
nvme_req(rq)->ctrl = &ctrl->ctrl;
nvme_req(rq)->cmd = &pdu->cmd;
return 0;
}
static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
hctx->driver_data = queue;
return 0;
}
static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
struct nvme_tcp_queue *queue = &ctrl->queues[0];
hctx->driver_data = queue;
return 0;
}
static enum nvme_tcp_recv_state
nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
{
return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
NVME_TCP_RECV_DATA;
}
static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
{
queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
nvme_tcp_hdgst_len(queue);
queue->pdu_offset = 0;
queue->data_remaining = -1;
queue->ddgst_remaining = 0;
}
static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
dev_warn(ctrl->device, "starting error recovery\n");
nvme: prevent warning triggered by nvme_stop_keep_alive Delayed keep alive work is queued on system workqueue and may be cancelled via nvme_stop_keep_alive from nvme_reset_wq, nvme_fc_wq or nvme_wq. Check_flush_dependency detects mismatched attributes between the work-queue context used to cancel the keep alive work and system-wq. Specifically system-wq does not have the WQ_MEM_RECLAIM flag, whereas the contexts used to cancel keep alive work have WQ_MEM_RECLAIM flag. Example warning: workqueue: WQ_MEM_RECLAIM nvme-reset-wq:nvme_fc_reset_ctrl_work [nvme_fc] is flushing !WQ_MEM_RECLAIM events:nvme_keep_alive_work [nvme_core] To avoid the flags mismatch, delayed keep alive work is queued on nvme_wq. However this creates a secondary concern where work and a request to cancel that work may be in the same work queue - namely err_work in the rdma and tcp transports, which will want to flush/cancel the keep alive work which will now be on nvme_wq. After reviewing the transports, it looks like err_work can be moved to nvme_reset_wq. In fact that aligns them better with transition into RESETTING and performing related reset work in nvme_reset_wq. Change nvme-rdma and nvme-tcp to perform err_work in nvme_reset_wq. Signed-off-by: Nigel Kirkland <nigel.kirkland@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <kbusch@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-02-11 08:01:45 +08:00
queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
struct nvme_completion *cqe)
{
struct nvme_tcp_request *req;
struct request *rq;
rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad cqe.command_id %#x on queue %d\n",
cqe->command_id, nvme_tcp_queue_id(queue));
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EINVAL;
}
req = blk_mq_rq_to_pdu(rq);
if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
req->status = cqe->status;
if (!nvme_try_complete_req(rq, req->status, cqe->result))
nvme_complete_rq(rq);
queue->nr_cqe++;
return 0;
}
static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
struct nvme_tcp_data_pdu *pdu)
{
struct request *rq;
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad c2hdata.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
if (!blk_rq_payload_bytes(rq)) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x unexpected data\n",
nvme_tcp_queue_id(queue), rq->tag);
return -EIO;
}
queue->data_remaining = le32_to_cpu(pdu->data_length);
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x SUCCESS set but not last PDU\n",
nvme_tcp_queue_id(queue), rq->tag);
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EPROTO;
}
return 0;
}
static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
struct nvme_tcp_rsp_pdu *pdu)
{
struct nvme_completion *cqe = &pdu->cqe;
int ret = 0;
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
cqe->command_id)))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
ret = nvme_tcp_process_nvme_cqe(queue, cqe);
return ret;
}
static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
struct request *rq = blk_mq_rq_from_pdu(req);
u32 h2cdata_sent = req->pdu_len;
u8 hdgst = nvme_tcp_hdgst_len(queue);
u8 ddgst = nvme_tcp_ddgst_len(queue);
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
req->pdu_sent = 0;
req->h2cdata_left -= req->pdu_len;
req->h2cdata_offset += h2cdata_sent;
memset(data, 0, sizeof(*data));
data->hdr.type = nvme_tcp_h2c_data;
if (!req->h2cdata_left)
data->hdr.flags = NVME_TCP_F_DATA_LAST;
if (queue->hdr_digest)
data->hdr.flags |= NVME_TCP_F_HDGST;
if (queue->data_digest)
data->hdr.flags |= NVME_TCP_F_DDGST;
data->hdr.hlen = sizeof(*data);
data->hdr.pdo = data->hdr.hlen + hdgst;
data->hdr.plen =
cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
data->ttag = req->ttag;
data->command_id = nvme_cid(rq);
data->data_offset = cpu_to_le32(req->h2cdata_offset);
data->data_length = cpu_to_le32(req->pdu_len);
}
static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
struct nvme_tcp_r2t_pdu *pdu)
{
struct nvme_tcp_request *req;
struct request *rq;
u32 r2t_length = le32_to_cpu(pdu->r2t_length);
u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad r2t.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
req = blk_mq_rq_to_pdu(rq);
if (unlikely(!r2t_length)) {
dev_err(queue->ctrl->ctrl.device,
"req %d r2t len is %u, probably a bug...\n",
rq->tag, r2t_length);
return -EPROTO;
}
if (unlikely(req->data_sent + r2t_length > req->data_len)) {
dev_err(queue->ctrl->ctrl.device,
"req %d r2t len %u exceeded data len %u (%zu sent)\n",
rq->tag, r2t_length, req->data_len, req->data_sent);
return -EPROTO;
}
if (unlikely(r2t_offset < req->data_sent)) {
dev_err(queue->ctrl->ctrl.device,
"req %d unexpected r2t offset %u (expected %zu)\n",
rq->tag, r2t_offset, req->data_sent);
return -EPROTO;
}
req->pdu_len = 0;
req->h2cdata_left = r2t_length;
req->h2cdata_offset = r2t_offset;
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
nvme_tcp_queue_request(req, false, true);
return 0;
}
static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
struct nvme_tcp_hdr *hdr;
char *pdu = queue->pdu;
size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
int ret;
ret = skb_copy_bits(skb, *offset,
&pdu[queue->pdu_offset], rcv_len);
if (unlikely(ret))
return ret;
queue->pdu_remaining -= rcv_len;
queue->pdu_offset += rcv_len;
*offset += rcv_len;
*len -= rcv_len;
if (queue->pdu_remaining)
return 0;
hdr = queue->pdu;
if (queue->hdr_digest) {
ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
if (unlikely(ret))
return ret;
}
if (queue->data_digest) {
ret = nvme_tcp_check_ddgst(queue, queue->pdu);
if (unlikely(ret))
return ret;
}
switch (hdr->type) {
case nvme_tcp_c2h_data:
return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
case nvme_tcp_rsp:
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
case nvme_tcp_r2t:
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
dev_err(queue->ctrl->ctrl.device,
"unsupported pdu type (%d)\n", hdr->type);
return -EINVAL;
}
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
union nvme_result res = {};
if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
nvme_complete_rq(rq);
}
static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
struct request *rq =
nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
while (true) {
int recv_len, ret;
recv_len = min_t(size_t, *len, queue->data_remaining);
if (!recv_len)
break;
if (!iov_iter_count(&req->iter)) {
req->curr_bio = req->curr_bio->bi_next;
/*
* If we don`t have any bios it means that controller
* sent more data than we requested, hence error
*/
if (!req->curr_bio) {
dev_err(queue->ctrl->ctrl.device,
"queue %d no space in request %#x",
nvme_tcp_queue_id(queue), rq->tag);
nvme_tcp_init_recv_ctx(queue);
return -EIO;
}
nvme_tcp_init_iter(req, ITER_DEST);
}
/* we can read only from what is left in this bio */
recv_len = min_t(size_t, recv_len,
iov_iter_count(&req->iter));
if (queue->data_digest)
ret = skb_copy_and_hash_datagram_iter(skb, *offset,
&req->iter, recv_len, queue->rcv_hash);
else
ret = skb_copy_datagram_iter(skb, *offset,
&req->iter, recv_len);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"queue %d failed to copy request %#x data",
nvme_tcp_queue_id(queue), rq->tag);
return ret;
}
*len -= recv_len;
*offset += recv_len;
queue->data_remaining -= recv_len;
}
if (!queue->data_remaining) {
if (queue->data_digest) {
nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
} else {
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
nvme_tcp_end_request(rq,
le16_to_cpu(req->status));
queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
}
}
return 0;
}
static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
struct sk_buff *skb, unsigned int *offset, size_t *len)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
char *ddgst = (char *)&queue->recv_ddgst;
size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
int ret;
ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
if (unlikely(ret))
return ret;
queue->ddgst_remaining -= recv_len;
*offset += recv_len;
*len -= recv_len;
if (queue->ddgst_remaining)
return 0;
if (queue->recv_ddgst != queue->exp_ddgst) {
struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
dev_err(queue->ctrl->ctrl.device,
"data digest error: recv %#x expected %#x\n",
le32_to_cpu(queue->recv_ddgst),
le32_to_cpu(queue->exp_ddgst));
}
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
nvme_tcp_end_request(rq, le16_to_cpu(req->status));
queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
return 0;
}
static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct nvme_tcp_queue *queue = desc->arg.data;
size_t consumed = len;
int result;
if (unlikely(!queue->rd_enabled))
return -EFAULT;
while (len) {
switch (nvme_tcp_recv_state(queue)) {
case NVME_TCP_RECV_PDU:
result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DATA:
result = nvme_tcp_recv_data(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DDGST:
result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
break;
default:
result = -EFAULT;
}
if (result) {
dev_err(queue->ctrl->ctrl.device,
"receive failed: %d\n", result);
queue->rd_enabled = false;
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return result;
}
}
return consumed;
}
static void nvme_tcp_data_ready(struct sock *sk)
{
struct nvme_tcp_queue *queue;
trace_sk_data_ready(sk);
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (likely(queue && queue->rd_enabled) &&
!test_bit(NVME_TCP_Q_POLLING, &queue->flags))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
{
struct nvme_tcp_queue *queue;
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (likely(queue && sk_stream_is_writeable(sk))) {
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_state_change(struct sock *sk)
{
struct nvme_tcp_queue *queue;
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (!queue)
goto done;
switch (sk->sk_state) {
case TCP_CLOSE:
case TCP_CLOSE_WAIT:
case TCP_LAST_ACK:
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
break;
default:
dev_info(queue->ctrl->ctrl.device,
"queue %d socket state %d\n",
nvme_tcp_queue_id(queue), sk->sk_state);
}
queue->state_change(sk);
done:
read_unlock_bh(&sk->sk_callback_lock);
}
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
queue->request = NULL;
}
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
if (nvme_tcp_async_req(req)) {
union nvme_result res = {};
nvme_complete_async_event(&req->queue->ctrl->ctrl,
cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
} else {
nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
NVME_SC_HOST_PATH_ERROR);
}
}
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
int req_data_len = req->data_len;
u32 h2cdata_left = req->h2cdata_left;
while (true) {
struct page *page = nvme_tcp_req_cur_page(req);
size_t offset = nvme_tcp_req_cur_offset(req);
size_t len = nvme_tcp_req_cur_length(req);
bool last = nvme_tcp_pdu_last_send(req, len);
int req_data_sent = req->data_sent;
int ret, flags = MSG_DONTWAIT;
if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
flags |= MSG_EOR;
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
if (sendpage_ok(page)) {
ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
return ret;
if (queue->data_digest)
nvme_tcp_ddgst_update(queue->snd_hash, page,
offset, ret);
/*
* update the request iterator except for the last payload send
* in the request where we don't want to modify it as we may
* compete with the RX path completing the request.
*/
if (req_data_sent + ret < req_data_len)
nvme_tcp_advance_req(req, ret);
/* fully successful last send in current PDU */
if (last && ret == len) {
if (queue->data_digest) {
nvme_tcp_ddgst_final(queue->snd_hash,
&req->ddgst);
req->state = NVME_TCP_SEND_DDGST;
req->offset = 0;
} else {
if (h2cdata_left)
nvme_tcp_setup_h2c_data_pdu(req);
else
nvme_tcp_done_send_req(queue);
}
return 1;
}
}
return -EAGAIN;
}
static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
bool inline_data = nvme_tcp_has_inline_data(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
int flags = MSG_DONTWAIT;
int ret;
if (inline_data || nvme_tcp_queue_more(queue))
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len, flags);
if (unlikely(ret <= 0))
return ret;
len -= ret;
if (!len) {
if (inline_data) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
crypto_ahash_init(queue->snd_hash);
} else {
nvme_tcp_done_send_req(queue);
}
return 1;
}
req->offset += ret;
return -EAGAIN;
}
static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) - req->offset + hdgst;
int ret;
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
if (!req->h2cdata_left)
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
else
ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
MSG_DONTWAIT | MSG_MORE);
if (unlikely(ret <= 0))
return ret;
len -= ret;
if (!len) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
crypto_ahash_init(queue->snd_hash);
return 1;
}
req->offset += ret;
return -EAGAIN;
}
static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
size_t offset = req->offset;
u32 h2cdata_left = req->h2cdata_left;
int ret;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
.iov_base = (u8 *)&req->ddgst + req->offset,
.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
};
if (nvme_tcp_queue_more(queue))
msg.msg_flags |= MSG_MORE;
else
msg.msg_flags |= MSG_EOR;
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (unlikely(ret <= 0))
return ret;
if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
if (h2cdata_left)
nvme_tcp_setup_h2c_data_pdu(req);
else
nvme_tcp_done_send_req(queue);
return 1;
}
req->offset += ret;
return -EAGAIN;
}
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
unsigned int noreclaim_flag;
int ret = 1;
if (!queue->request) {
queue->request = nvme_tcp_fetch_request(queue);
if (!queue->request)
return 0;
}
req = queue->request;
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
noreclaim_flag = memalloc_noreclaim_save();
if (req->state == NVME_TCP_SEND_CMD_PDU) {
ret = nvme_tcp_try_send_cmd_pdu(req);
if (ret <= 0)
goto done;
if (!nvme_tcp_has_inline_data(req))
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
goto out;
}
if (req->state == NVME_TCP_SEND_H2C_PDU) {
ret = nvme_tcp_try_send_data_pdu(req);
if (ret <= 0)
goto done;
}
if (req->state == NVME_TCP_SEND_DATA) {
ret = nvme_tcp_try_send_data(req);
if (ret <= 0)
goto done;
}
if (req->state == NVME_TCP_SEND_DDGST)
ret = nvme_tcp_try_send_ddgst(req);
done:
if (ret == -EAGAIN) {
ret = 0;
} else if (ret < 0) {
dev_err(queue->ctrl->ctrl.device,
"failed to send request %d\n", ret);
nvme_tcp_fail_request(queue->request);
nvme_tcp_done_send_req(queue);
}
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
out:
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
struct socket *sock = queue->sock;
struct sock *sk = sock->sk;
read_descriptor_t rd_desc;
int consumed;
rd_desc.arg.data = queue;
rd_desc.count = 1;
lock_sock(sk);
queue->nr_cqe = 0;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
}
static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
unsigned long deadline = jiffies + msecs_to_jiffies(1);
do {
bool pending = false;
int result;
if (mutex_trylock(&queue->send_mutex)) {
result = nvme_tcp_try_send(queue);
mutex_unlock(&queue->send_mutex);
if (result > 0)
pending = true;
else if (unlikely(result < 0))
break;
}
result = nvme_tcp_try_recv(queue);
if (result > 0)
pending = true;
else if (unlikely(result < 0))
nvme-tcp: fix possible crash in recv error flow If the target misbehaves and sends us unexpected payload we need to make sure to fail the controller and stop processing the input stream. We clear the rd_enabled flag and stop the io_work, but we may still requeue it if we still have pending sends and then in the next invocation we will process the input stream as the check is only in the .data_ready upcall. To fix this we need to make sure not to self-requeue io_work upon a recv flow error. This fixes the crash: nvme nvme2: receive failed: -22 BUG: unable to handle page fault for address: ffffbeb5816c3b48 nvme_ns_head_make_request: 29 callbacks suppressed block nvme0n5: no usable path - requeuing I/O block nvme0n5: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O #PF: supervisor read access inkernel mode #PF: error_code(0x0000) - not-present page PGD 1039157067 P4D 1039157067 PUD 103915a067 PMD 102719f067 PTE 0 Oops: 0000 [#1] SMP PTI CPU: 8 PID: 411 Comm: kworker/8:1H Not tainted 5.3.0-40-generic #32~18.04.1-Ubuntu Hardware name: Supermicro Super Server/X10SRi-F, BIOS 2.0 12/17/2015 Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] RIP: 0010:nvme_tcp_recv_skb+0x2ae/0xb50 [nvme_tcp] RSP: 0018:ffffbeb5806cfd10 EFLAGS: 00010246 RAX: ffffbeb5816c3b48 RBX: 00000000000003d0 RCX: 0000000000000008 RDX: 00000000000003d0 RSI: 0000000000000001 RDI: ffff9a3040684b40 RBP: ffffbeb5806cfd90 R08: 0000000000000000 R09: ffffffff946e6900 R10: ffffbeb5806cfce0 R11: 0000000000000001 R12: 0000000000000000 R13: ffff9a2ff86501c0 R14: 00000000000003d0 R15: ffff9a30b85f2798 FS: 0000000000000000(0000) GS:ffff9a30bf800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffbeb5816c3b48 CR3: 000000088400a006 CR4: 00000000003626e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_read_sock+0x8c/0x290 ? __release_sock+0x9d/0xe0 ? nvme_tcp_write_space+0xb0/0xb0 [nvme_tcp] nvme_tcp_io_work+0x4b4/0x830 [nvme_tcp] ? finish_task_switch+0x163/0x270 process_one_work+0x1fd/0x3f0 worker_thread+0x34/0x410 kthread+0x121/0x140 ? process_one_work+0x3f0/0x3f0 ? kthread_park+0xb0/0xb0 ret_from_fork+0x35/0x40 Reported-by: Roy Shterman <roys@lightbitslabs.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-04-01 13:44:23 +08:00
return;
if (!pending || !queue->rd_enabled)
return;
} while (!time_after(jiffies, deadline)); /* quota is exhausted */
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
ahash_request_free(queue->rcv_hash);
ahash_request_free(queue->snd_hash);
crypto_free_ahash(tfm);
}
static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
{
struct crypto_ahash *tfm;
tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
if (!queue->snd_hash)
goto free_tfm;
ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
if (!queue->rcv_hash)
goto free_snd_hash;
ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
return 0;
free_snd_hash:
ahash_request_free(queue->snd_hash);
free_tfm:
crypto_free_ahash(tfm);
return -ENOMEM;
}
static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
{
struct nvme_tcp_request *async = &ctrl->async_req;
page_frag_free(async->pdu);
}
static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
{
struct nvme_tcp_queue *queue = &ctrl->queues[0];
struct nvme_tcp_request *async = &ctrl->async_req;
u8 hdgst = nvme_tcp_hdgst_len(queue);
async->pdu = page_frag_alloc(&queue->pf_cache,
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!async->pdu)
return -ENOMEM;
async->queue = &ctrl->queues[0];
return 0;
}
static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
{
struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
unsigned int noreclaim_flag;
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
if (queue->pf_cache.va) {
page = virt_to_head_page(queue->pf_cache.va);
__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
queue->pf_cache.va = NULL;
}
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
noreclaim_flag = memalloc_noreclaim_save();
sock_release(queue->sock);
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-10-23 16:04:43 +08:00
memalloc_noreclaim_restore(noreclaim_flag);
kfree(queue->pdu);
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
}
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq;
struct nvme_tcp_icresp_pdu *icresp;
struct msghdr msg = {};
struct kvec iov;
bool ctrl_hdgst, ctrl_ddgst;
u32 maxh2cdata;
int ret;
icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
if (!icreq)
return -ENOMEM;
icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
if (!icresp) {
ret = -ENOMEM;
goto free_icreq;
}
icreq->hdr.type = nvme_tcp_icreq;
icreq->hdr.hlen = sizeof(*icreq);
icreq->hdr.pdo = 0;
icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
icreq->maxr2t = 0; /* single inflight r2t supported */
icreq->hpda = 0; /* no alignment constraint */
if (queue->hdr_digest)
icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
if (queue->data_digest)
icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
iov.iov_base = icreq;
iov.iov_len = sizeof(*icreq);
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (ret < 0)
goto free_icresp;
memset(&msg, 0, sizeof(msg));
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
if (ret < 0)
goto free_icresp;
ret = -EINVAL;
if (icresp->hdr.type != nvme_tcp_icresp) {
pr_err("queue %d: bad type returned %d\n",
nvme_tcp_queue_id(queue), icresp->hdr.type);
goto free_icresp;
}
if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
pr_err("queue %d: bad pdu length returned %d\n",
nvme_tcp_queue_id(queue), icresp->hdr.plen);
goto free_icresp;
}
if (icresp->pfv != NVME_TCP_PFV_1_0) {
pr_err("queue %d: bad pfv returned %d\n",
nvme_tcp_queue_id(queue), icresp->pfv);
goto free_icresp;
}
ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
if ((queue->data_digest && !ctrl_ddgst) ||
(!queue->data_digest && ctrl_ddgst)) {
pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
nvme_tcp_queue_id(queue),
queue->data_digest ? "enabled" : "disabled",
ctrl_ddgst ? "enabled" : "disabled");
goto free_icresp;
}
ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
if ((queue->hdr_digest && !ctrl_hdgst) ||
(!queue->hdr_digest && ctrl_hdgst)) {
pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
nvme_tcp_queue_id(queue),
queue->hdr_digest ? "enabled" : "disabled",
ctrl_hdgst ? "enabled" : "disabled");
goto free_icresp;
}
if (icresp->cpda != 0) {
pr_err("queue %d: unsupported cpda returned %d\n",
nvme_tcp_queue_id(queue), icresp->cpda);
goto free_icresp;
}
maxh2cdata = le32_to_cpu(icresp->maxdata);
if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
pr_err("queue %d: invalid maxh2cdata returned %u\n",
nvme_tcp_queue_id(queue), maxh2cdata);
goto free_icresp;
}
queue->maxh2cdata = maxh2cdata;
ret = 0;
free_icresp:
kfree(icresp);
free_icreq:
kfree(icreq);
return ret;
}
static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
{
return nvme_tcp_queue_id(queue) == 0;
}
static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
int qid = nvme_tcp_queue_id(queue);
return !nvme_tcp_admin_queue(queue) &&
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
}
static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
int qid = nvme_tcp_queue_id(queue);
return !nvme_tcp_admin_queue(queue) &&
!nvme_tcp_default_queue(queue) &&
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
ctrl->io_queues[HCTX_TYPE_READ];
}
static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
int qid = nvme_tcp_queue_id(queue);
return !nvme_tcp_admin_queue(queue) &&
!nvme_tcp_default_queue(queue) &&
!nvme_tcp_read_queue(queue) &&
qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
ctrl->io_queues[HCTX_TYPE_READ] +
ctrl->io_queues[HCTX_TYPE_POLL];
}
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
int qid = nvme_tcp_queue_id(queue);
int n = 0;
if (nvme_tcp_default_queue(queue))
n = qid - 1;
else if (nvme_tcp_read_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
}
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
int ret, rcv_pdu_size;
mutex_init(&queue->queue_lock);
queue->ctrl = ctrl;
init_llist_head(&queue->req_list);
INIT_LIST_HEAD(&queue->send_list);
mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
if (qid > 0)
queue->cmnd_capsule_len = nctrl->ioccsz * 16;
else
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
NVME_TCP_ADMIN_CCSZ;
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
IPPROTO_TCP, &queue->sock);
if (ret) {
dev_err(nctrl->device,
"failed to create socket: %d\n", ret);
goto err_destroy_mutex;
}
nvme-tcp: lockdep: annotate in-kernel sockets Put NVMe/TCP sockets in their own class to avoid some lockdep warnings. Sockets created by nvme-tcp are not exposed to user-space, and will not trigger certain code paths that the general socket API exposes. Lockdep complains about a circular dependency between the socket and filesystem locks, because setsockopt can trigger a page fault with a socket lock held, but nvme-tcp sends requests on the socket while file system locks are held. ====================================================== WARNING: possible circular locking dependency detected 5.15.0-rc3 #1 Not tainted ------------------------------------------------------ fio/1496 is trying to acquire lock: (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendpage+0x23/0x80 but task is already holding lock: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] which lock already depends on the new lock. other info that might help us debug this: chain exists of: sk_lock-AF_INET --> sb_internal --> &xfs_dir_ilock_class/5 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&xfs_dir_ilock_class/5); lock(sb_internal); lock(&xfs_dir_ilock_class/5); lock(sk_lock-AF_INET); *** DEADLOCK *** 6 locks held by fio/1496: #0: (sb_writers#13){.+.+}-{0:0}, at: path_openat+0x9fc/0xa20 #1: (&inode->i_sb->s_type->i_mutex_dir_key){++++}-{3:3}, at: path_openat+0x296/0xa20 #2: (sb_internal){.+.+}-{0:0}, at: xfs_trans_alloc_icreate+0x41/0xd0 [xfs] #3: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] #4: (hctx->srcu){....}-{0:0}, at: hctx_lock+0x51/0xd0 #5: (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0x33e/0x380 [nvme_tcp] This annotation lets lockdep analyze nvme-tcp controlled sockets independently of what the user-space sockets API does. Link: https://lore.kernel.org/linux-nvme/CAHj4cs9MDYLJ+q+2_GXUK9HxFizv2pxUryUR0toX974M040z7g@mail.gmail.com/ Signed-off-by: Chris Leech <cleech@redhat.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-02-16 10:22:49 +08:00
nvme_tcp_reclassify_socket(queue->sock);
/* Single syn retry */
tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
tcp_sock_set_nodelay(queue->sock->sk);
/*
* Cleanup whatever is sitting in the TCP transmit queue on socket
* close. This is done to prevent stale data from being sent should
* the network connection be restored before TCP times out.
*/
sock_no_linger(queue->sock->sk);
if (so_priority > 0)
sock_set_priority(queue->sock->sk, so_priority);
/* Set socket type of service */
if (nctrl->opts->tos >= 0)
ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
/* Set 10 seconds timeout for icresp recvmsg */
queue->sock->sk->sk_rcvtimeo = 10 * HZ;
queue->sock->sk->sk_allocation = GFP_ATOMIC;
Treewide: Stop corrupting socket's task_frag Since moving to memalloc_nofs_save/restore, SUNRPC has stopped setting the GFP_NOIO flag on sk_allocation which the networking system uses to decide when it is safe to use current->task_frag. The results of this are unexpected corruption in task_frag when SUNRPC is involved in memory reclaim. The corruption can be seen in crashes, but the root cause is often difficult to ascertain as a crashing machine's stack trace will have no evidence of being near NFS or SUNRPC code. I believe this problem to be much more pervasive than reports to the community may indicate. Fix this by having kernel users of sockets that may corrupt task_frag due to reclaim set sk_use_task_frag = false. Preemptively correcting this situation for users that still set sk_allocation allows them to convert to memalloc_nofs_save/restore without the same unexpected corruptions that are sure to follow, unlikely to show up in testing, and difficult to bisect. CC: Philipp Reisner <philipp.reisner@linbit.com> CC: Lars Ellenberg <lars.ellenberg@linbit.com> CC: "Christoph Böhmwalder" <christoph.boehmwalder@linbit.com> CC: Jens Axboe <axboe@kernel.dk> CC: Josef Bacik <josef@toxicpanda.com> CC: Keith Busch <kbusch@kernel.org> CC: Christoph Hellwig <hch@lst.de> CC: Sagi Grimberg <sagi@grimberg.me> CC: Lee Duncan <lduncan@suse.com> CC: Chris Leech <cleech@redhat.com> CC: Mike Christie <michael.christie@oracle.com> CC: "James E.J. Bottomley" <jejb@linux.ibm.com> CC: "Martin K. Petersen" <martin.petersen@oracle.com> CC: Valentina Manea <valentina.manea.m@gmail.com> CC: Shuah Khan <shuah@kernel.org> CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org> CC: David Howells <dhowells@redhat.com> CC: Marc Dionne <marc.dionne@auristor.com> CC: Steve French <sfrench@samba.org> CC: Christine Caulfield <ccaulfie@redhat.com> CC: David Teigland <teigland@redhat.com> CC: Mark Fasheh <mark@fasheh.com> CC: Joel Becker <jlbec@evilplan.org> CC: Joseph Qi <joseph.qi@linux.alibaba.com> CC: Eric Van Hensbergen <ericvh@gmail.com> CC: Latchesar Ionkov <lucho@ionkov.net> CC: Dominique Martinet <asmadeus@codewreck.org> CC: Ilya Dryomov <idryomov@gmail.com> CC: Xiubo Li <xiubli@redhat.com> CC: Chuck Lever <chuck.lever@oracle.com> CC: Jeff Layton <jlayton@kernel.org> CC: Trond Myklebust <trond.myklebust@hammerspace.com> CC: Anna Schumaker <anna@kernel.org> CC: Steffen Klassert <steffen.klassert@secunet.com> CC: Herbert Xu <herbert@gondor.apana.org.au> Suggested-by: Guillaume Nault <gnault@redhat.com> Signed-off-by: Benjamin Coddington <bcodding@redhat.com> Reviewed-by: Guillaume Nault <gnault@redhat.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-16 20:45:27 +08:00
queue->sock->sk->sk_use_task_frag = false;
nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
queue->pdu_remaining = 0;
queue->pdu_offset = 0;
sk_set_memalloc(queue->sock->sk);
if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
sizeof(ctrl->src_addr));
if (ret) {
dev_err(nctrl->device,
"failed to bind queue %d socket %d\n",
qid, ret);
goto err_sock;
}
}
nvme-tcp: allow selecting the network interface for connections In our application, we need a way to force TCP connections to go out a specific IP interface instead of letting Linux select the interface based on the routing tables. Add the 'host-iface' option to allow specifying the interface to use. When the option host-iface is specified, the driver uses the specified interface to set the option SO_BINDTODEVICE on the TCP socket before connecting. This new option is needed in addtion to the existing host-traddr for the following reasons: Specifying an IP interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, simply doesn't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ... link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 100.0.0.100/24 scope global lo valid_lft forever preferred_lft forever 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s3 valid_lft forever preferred_lft forever 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s8 valid_lft forever preferred_lft forever The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 does not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "We can never know how users will configure their systems". By te way, The above configuration is perfectly fine by Linux. The current TCP implementation for host_traddr performs a bind()-before-connect(). This is a common construct to set the source IP address on a TCP socket before connecting. This has no effect on how Linux selects the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. On the other hand, setting the Source IP Address has benefits and should be supported by linux-nvme. In fact, setting the Source IP Address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8 valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever Here we can see that several addresses are associated with interface enp0s8. By default, Linux always selects the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option host_traddr can be used as-is to perform this function. In conclusion, I believe that we need 2 options for TCP connections. One that can be used to specify an interface (host-iface). And one that can be used to set the source address (host-traddr). Users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option is only available for TCP connections. References: [1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/ [2] https://tools.ietf.org/html/rfc1122 Tested both IPv4 and IPv6 connections. Signed-off-by: Martin Belanger <martin.belanger@dell.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
char *iface = nctrl->opts->host_iface;
sockptr_t optval = KERNEL_SOCKPTR(iface);
ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
optval, strlen(iface));
if (ret) {
dev_err(nctrl->device,
"failed to bind to interface %s queue %d err %d\n",
iface, qid, ret);
goto err_sock;
}
}
queue->hdr_digest = nctrl->opts->hdr_digest;
queue->data_digest = nctrl->opts->data_digest;
if (queue->hdr_digest || queue->data_digest) {
ret = nvme_tcp_alloc_crypto(queue);
if (ret) {
dev_err(nctrl->device,
"failed to allocate queue %d crypto\n", qid);
goto err_sock;
}
}
rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
nvme_tcp_hdgst_len(queue);
queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
if (!queue->pdu) {
ret = -ENOMEM;
goto err_crypto;
}
dev_dbg(nctrl->device, "connecting queue %d\n",
nvme_tcp_queue_id(queue));
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
sizeof(ctrl->addr), 0);
if (ret) {
dev_err(nctrl->device,
"failed to connect socket: %d\n", ret);
goto err_rcv_pdu;
}
ret = nvme_tcp_init_connection(queue);
if (ret)
goto err_init_connect;
set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
return 0;
err_init_connect:
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
err_rcv_pdu:
kfree(queue->pdu);
err_crypto:
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
err_sock:
sock_release(queue->sock);
queue->sock = NULL;
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
return ret;
}
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
{
struct socket *sock = queue->sock;
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = NULL;
sock->sk->sk_data_ready = queue->data_ready;
sock->sk->sk_state_change = queue->state_change;
sock->sk->sk_write_space = queue->write_space;
write_unlock_bh(&sock->sk->sk_callback_lock);
}
static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
{
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
nvme_tcp_restore_sock_ops(queue);
cancel_work_sync(&queue->io_work);
}
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
nvme-tcp: check if the queue is allocated before stopping it When an error is detected and the host reconnects, the nvme_tcp_error_recovery_work() function is called and starts tearing down the io queues and de-allocating them; If at the same time the "nvme" process deletes the controller via sysfs, the nvme_tcp_delete_ctrl() gets called and waits until the nvme_tcp_error_recovery_work() finishes its job; then starts tearing down the io queues, but at this point they have already been freed and the mutexes are destroyed. Calling mutex_lock() against a destroyed mutex triggers a warning: [ 1299.025575] nvme nvme1: Reconnecting in 10 seconds... [ 1299.636449] nvme nvme1: Removing ctrl: NQN "blktests-subsystem-1" [ 1299.645262] ------------[ cut here ]------------ [ 1299.649949] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 1299.649971] WARNING: CPU: 4 PID: 104150 at kernel/locking/mutex.c:579 __mutex_lock+0x2d0/0x7dc [ 1299.717934] CPU: 4 PID: 104150 Comm: nvme [ 1299.828075] Call trace: [ 1299.830526] __mutex_lock+0x2d0/0x7dc [ 1299.834203] mutex_lock_nested+0x64/0xd4 [ 1299.838139] nvme_tcp_stop_queue+0x54/0xe0 [nvme_tcp] [ 1299.843211] nvme_tcp_teardown_io_queues.part.0+0x90/0x280 [nvme_tcp] [ 1299.849672] nvme_tcp_delete_ctrl+0x6c/0xf0 [nvme_tcp] [ 1299.854831] nvme_do_delete_ctrl+0x108/0x120 [nvme_core] [ 1299.860181] nvme_sysfs_delete+0xec/0xf0 [nvme_core] [ 1299.865179] dev_attr_store+0x40/0x70 Fix the warning by checking if the queues are allocated in the nvme_tcp_stop_queue(). If they are not, it makes no sense to try to stop them. Signed-off-by: Maurizio Lombardi <mlombard@redhat.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-08-01 16:09:00 +08:00
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
mutex_lock(&queue->queue_lock);
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
__nvme_tcp_stop_queue(queue);
mutex_unlock(&queue->queue_lock);
}
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
{
write_lock_bh(&queue->sock->sk->sk_callback_lock);
queue->sock->sk->sk_user_data = queue;
queue->state_change = queue->sock->sk->sk_state_change;
queue->data_ready = queue->sock->sk->sk_data_ready;
queue->write_space = queue->sock->sk->sk_write_space;
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
#ifdef CONFIG_NET_RX_BUSY_POLL
queue->sock->sk->sk_ll_usec = 1;
#endif
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
}
static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
struct nvme_tcp_queue *queue = &ctrl->queues[idx];
int ret;
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
queue->rd_enabled = true;
nvme_tcp_init_recv_ctx(queue);
nvme_tcp_setup_sock_ops(queue);
if (idx)
ret = nvmf_connect_io_queue(nctrl, idx);
else
ret = nvmf_connect_admin_queue(nctrl);
if (!ret) {
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
set_bit(NVME_TCP_Q_LIVE, &queue->flags);
} else {
nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] <TASK> [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] </TASK> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Yanjun Zhang <zhangyanjun@cestc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-03-20 21:33:34 +08:00
if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
__nvme_tcp_stop_queue(queue);
dev_err(nctrl->device,
"failed to connect queue: %d ret=%d\n", idx, ret);
}
return ret;
}
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
{
if (to_tcp_ctrl(ctrl)->async_req.pdu) {
cancel_work_sync(&ctrl->async_event_work);
nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
}
nvme_tcp_free_queue(ctrl, 0);
}
static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
{
int i;
for (i = 1; i < ctrl->queue_count; i++)
nvme_tcp_free_queue(ctrl, i);
}
static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
{
int i;
for (i = 1; i < ctrl->queue_count; i++)
nvme_tcp_stop_queue(ctrl, i);
}
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
int first, int last)
{
int i, ret;
for (i = first; i < last; i++) {
ret = nvme_tcp_start_queue(ctrl, i);
if (ret)
goto out_stop_queues;
}
return 0;
out_stop_queues:
for (i--; i >= first; i--)
nvme_tcp_stop_queue(ctrl, i);
return ret;
}
static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{
int ret;
ret = nvme_tcp_alloc_queue(ctrl, 0);
if (ret)
return ret;
ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
if (ret)
goto out_free_queue;
return 0;
out_free_queue:
nvme_tcp_free_queue(ctrl, 0);
return ret;
}
static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
for (i = 1; i < ctrl->queue_count; i++) {
ret = nvme_tcp_alloc_queue(ctrl, i);
if (ret)
goto out_free_queues;
}
return 0;
out_free_queues:
for (i--; i >= 1; i--)
nvme_tcp_free_queue(ctrl, i);
return ret;
}
static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
{
unsigned int nr_io_queues;
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
return nr_io_queues;
}
static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
unsigned int nr_io_queues)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvmf_ctrl_options *opts = nctrl->opts;
if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
/*
* separate read/write queues
* hand out dedicated default queues only after we have
* sufficient read queues.
*/
ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
ctrl->io_queues[HCTX_TYPE_DEFAULT] =
min(opts->nr_write_queues, nr_io_queues);
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
} else {
/*
* shared read/write queues
* either no write queues were requested, or we don't have
* sufficient queue count to have dedicated default queues.
*/
ctrl->io_queues[HCTX_TYPE_DEFAULT] =
min(opts->nr_io_queues, nr_io_queues);
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
}
if (opts->nr_poll_queues && nr_io_queues) {
/* map dedicated poll queues only if we have queues left */
ctrl->io_queues[HCTX_TYPE_POLL] =
min(opts->nr_poll_queues, nr_io_queues);
}
}
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
unsigned int nr_io_queues;
int ret;
nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
ret = nvme_set_queue_count(ctrl, &nr_io_queues);
if (ret)
return ret;
if (nr_io_queues == 0) {
dev_err(ctrl->device,
"unable to set any I/O queues\n");
return -ENOMEM;
}
ctrl->queue_count = nr_io_queues + 1;
dev_info(ctrl->device,
"creating %d I/O queues.\n", nr_io_queues);
nvme_tcp_set_io_queues(ctrl, nr_io_queues);
return __nvme_tcp_alloc_io_queues(ctrl);
}
static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
{
nvme_tcp_stop_io_queues(ctrl);
if (remove)
nvme_remove_io_tag_set(ctrl);
nvme_tcp_free_io_queues(ctrl);
}
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
int ret, nr_queues;
ret = nvme_tcp_alloc_io_queues(ctrl);
if (ret)
return ret;
if (new) {
ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
&nvme_tcp_mq_ops,
ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
sizeof(struct nvme_tcp_request));
if (ret)
goto out_free_io_queues;
}
/*
* Only start IO queues for which we have allocated the tagset
* and limitted it to the available queues. On reconnects, the
* queue number might have changed.
*/
nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
if (ret)
goto out_cleanup_connect_q;
if (!new) {
nvme_unquiesce_io_queues(ctrl);
if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
/*
* If we timed out waiting for freeze we are likely to
* be stuck. Fail the controller initialization just
* to be safe.
*/
ret = -ENODEV;
goto out_wait_freeze_timed_out;
}
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
nvme_unfreeze(ctrl);
}
/*
* If the number of queues has increased (reconnect case)
* start all new queues now.
*/
ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
ctrl->tagset->nr_hw_queues + 1);
if (ret)
goto out_wait_freeze_timed_out;
return 0;
out_wait_freeze_timed_out:
nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
nvme_cancel_tagset(ctrl);
if (new)
nvme_remove_io_tag_set(ctrl);
out_free_io_queues:
nvme_tcp_free_io_queues(ctrl);
return ret;
}
static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
{
nvme_tcp_stop_queue(ctrl, 0);
if (remove)
nvme_remove_admin_tag_set(ctrl);
nvme_tcp_free_admin_queue(ctrl);
}
static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
int error;
error = nvme_tcp_alloc_admin_queue(ctrl);
if (error)
return error;
if (new) {
error = nvme_alloc_admin_tag_set(ctrl,
&to_tcp_ctrl(ctrl)->admin_tag_set,
&nvme_tcp_admin_mq_ops,
sizeof(struct nvme_tcp_request));
if (error)
goto out_free_queue;
}
error = nvme_tcp_start_queue(ctrl, 0);
if (error)
goto out_cleanup_tagset;
error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
nvme_unquiesce_admin_queue(ctrl);
error = nvme_init_ctrl_finish(ctrl, false);
if (error)
goto out_quiesce_queue;
return 0;
out_quiesce_queue:
nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
out_stop_queue:
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
out_cleanup_tagset:
if (new)
nvme_remove_admin_tag_set(ctrl);
out_free_queue:
nvme_tcp_free_admin_queue(ctrl);
return error;
}
static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
bool remove)
{
nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
if (remove)
nvme_unquiesce_admin_queue(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
bool remove)
{
if (ctrl->queue_count <= 1)
return;
nvme_quiesce_admin_queue(ctrl);
nvme_start_freeze(ctrl);
nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
if (remove)
nvme_unquiesce_io_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
}
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
{
/* If we are resetting/deleting then do nothing */
if (ctrl->state != NVME_CTRL_CONNECTING) {
WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
ctrl->state == NVME_CTRL_LIVE);
return;
}
if (nvmf_should_reconnect(ctrl)) {
dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
ctrl->opts->reconnect_delay);
queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
ctrl->opts->reconnect_delay * HZ);
} else {
dev_info(ctrl->device, "Removing controller...\n");
nvme_delete_ctrl(ctrl);
}
}
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
struct nvmf_ctrl_options *opts = ctrl->opts;
int ret;
ret = nvme_tcp_configure_admin_queue(ctrl, new);
if (ret)
return ret;
if (ctrl->icdoff) {
ret = -EOPNOTSUPP;
dev_err(ctrl->device, "icdoff is not supported!\n");
goto destroy_admin;
}
if (!nvme_ctrl_sgl_supported(ctrl)) {
ret = -EOPNOTSUPP;
dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
goto destroy_admin;
}
if (opts->queue_size > ctrl->sqsize + 1)
dev_warn(ctrl->device,
"queue_size %zu > ctrl sqsize %u, clamping down\n",
opts->queue_size, ctrl->sqsize + 1);
if (ctrl->sqsize + 1 > ctrl->maxcmd) {
dev_warn(ctrl->device,
"sqsize %u > ctrl maxcmd %u, clamping down\n",
ctrl->sqsize + 1, ctrl->maxcmd);
ctrl->sqsize = ctrl->maxcmd - 1;
}
if (ctrl->queue_count > 1) {
ret = nvme_tcp_configure_io_queues(ctrl, new);
if (ret)
goto destroy_admin;
}
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
/*
nvme: fix deadlock in disconnect during scan_work and/or ana_work A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman <anton@lightbitslabs.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-07-23 07:32:19 +08:00
* state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
nvme: fix deadlock in disconnect during scan_work and/or ana_work A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman <anton@lightbitslabs.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-07-23 07:32:19 +08:00
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
}
nvme_start_ctrl(ctrl);
return 0;
destroy_io:
if (ctrl->queue_count > 1) {
nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
nvme_tcp_destroy_io_queues(ctrl, new);
}
destroy_admin:
nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, new);
return ret;
}
static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
struct nvme_tcp_ctrl, connect_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
++ctrl->nr_reconnects;
if (nvme_tcp_setup_ctrl(ctrl, false))
goto requeue;
dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
ctrl->nr_reconnects);
ctrl->nr_reconnects = 0;
return;
requeue:
dev_info(ctrl->device, "Failed reconnect attempt %d\n",
ctrl->nr_reconnects);
nvme_tcp_reconnect_or_remove(ctrl);
}
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
/* unquiesce to fail fast pending requests */
nvme_unquiesce_io_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
nvme_unquiesce_admin_queue(ctrl);
nvme_auth_stop(ctrl);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
nvme: fix deadlock in disconnect during scan_work and/or ana_work A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman <anton@lightbitslabs.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-07-23 07:32:19 +08:00
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
nvme_tcp_reconnect_or_remove(ctrl);
}
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
nvme_tcp_teardown_io_queues(ctrl, shutdown);
nvme_quiesce_admin_queue(ctrl);
nvme_disable_ctrl(ctrl, shutdown);
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}
static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
nvme_tcp_teardown_ctrl(ctrl, true);
}
static void nvme_reset_ctrl_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, reset_work);
nvme_stop_ctrl(ctrl);
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
nvme: fix deadlock in disconnect during scan_work and/or ana_work A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman <anton@lightbitslabs.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
2020-07-23 07:32:19 +08:00
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
if (nvme_tcp_setup_ctrl(ctrl, false))
goto out_fail;
return;
out_fail:
++ctrl->nr_reconnects;
nvme_tcp_reconnect_or_remove(ctrl);
}
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme-tcp: fix possible hang caused during ctrl deletion When we delete a controller, we execute the following: 1. nvme_stop_ctrl() - stop some work elements that may be inflight or scheduled (specifically also .stop_ctrl which cancels ctrl error recovery work) 2. nvme_remove_namespaces() - which first flushes scan_work to avoid competing ns addition/removal 3. continue to teardown the controller However, if err_work was scheduled to run in (1), it is designed to cancel any inflight I/O, particularly I/O that is originating from ns scan_work in (2), but because it is cancelled in .stop_ctrl(), we can prevent forward progress of (2) as ns scanning is blocking on I/O (that will never be cancelled). The race is: 1. transport layer error observed -> err_work is scheduled 2. scan_work executes, discovers ns, generate I/O to it 3. nvme_ctop_ctrl() -> .stop_ctrl() -> cancel_work_sync(err_work) - err_work never executed 4. nvme_remove_namespaces() -> flush_work(scan_work) --> deadlock, because scan_work is blocked on I/O that was supposed to be cancelled by err_work, but was cancelled before executing (see stack trace [1]). Fix this by flushing err_work instead of cancelling it, to force it to execute and cancel all inflight I/O. [1]: -- Call Trace: <TASK> __schedule+0x390/0x910 ? scan_shadow_nodes+0x40/0x40 schedule+0x55/0xe0 io_schedule+0x16/0x40 do_read_cache_page+0x55d/0x850 ? __page_cache_alloc+0x90/0x90 read_cache_page+0x12/0x20 read_part_sector+0x3f/0x110 amiga_partition+0x3d/0x3e0 ? osf_partition+0x33/0x220 ? put_partition+0x90/0x90 bdev_disk_changed+0x1fe/0x4d0 blkdev_get_whole+0x7b/0x90 blkdev_get_by_dev+0xda/0x2d0 device_add_disk+0x356/0x3b0 nvme_mpath_set_live+0x13c/0x1a0 [nvme_core] ? nvme_parse_ana_log+0xae/0x1a0 [nvme_core] nvme_update_ns_ana_state+0x3a/0x40 [nvme_core] nvme_mpath_add_disk+0x120/0x160 [nvme_core] nvme_alloc_ns+0x594/0xa00 [nvme_core] nvme_validate_or_alloc_ns+0xb9/0x1a0 [nvme_core] ? __nvme_submit_sync_cmd+0x1d2/0x210 [nvme_core] nvme_scan_work+0x281/0x410 [nvme_core] process_one_work+0x1be/0x380 worker_thread+0x37/0x3b0 ? process_one_work+0x380/0x380 kthread+0x12d/0x150 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x1f/0x30 </TASK> INFO: task nvme:6725 blocked for more than 491 seconds. Not tainted 5.15.65-f0.el7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:nvme state:D stack: 0 pid: 6725 ppid: 1761 flags:0x00004000 Call Trace: <TASK> __schedule+0x390/0x910 ? sched_clock+0x9/0x10 schedule+0x55/0xe0 schedule_timeout+0x24b/0x2e0 ? try_to_wake_up+0x358/0x510 ? finish_task_switch+0x88/0x2c0 wait_for_completion+0xa5/0x110 __flush_work+0x144/0x210 ? worker_attach_to_pool+0xc0/0xc0 flush_work+0x10/0x20 nvme_remove_namespaces+0x41/0xf0 [nvme_core] nvme_do_delete_ctrl+0x47/0x66 [nvme_core] nvme_sysfs_delete.cold.96+0x8/0xd [nvme_core] dev_attr_store+0x14/0x30 sysfs_kf_write+0x38/0x50 kernfs_fop_write_iter+0x146/0x1d0 new_sync_write+0x114/0x1b0 ? intel_pmu_handle_irq+0xe0/0x420 vfs_write+0x18d/0x270 ksys_write+0x61/0xe0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb -- Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2022-09-28 14:23:25 +08:00
flush_work(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
if (list_empty(&ctrl->list))
goto free_ctrl;
mutex_lock(&nvme_tcp_ctrl_mutex);
list_del(&ctrl->list);
mutex_unlock(&nvme_tcp_ctrl_mutex);
nvmf_free_options(nctrl->opts);
free_ctrl:
kfree(ctrl->queues);
kfree(ctrl);
}
static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
sg->addr = 0;
sg->length = 0;
sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
NVME_SGL_FMT_TRANSPORT_A;
}
static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
struct nvme_command *c, u32 data_len)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
sg->length = cpu_to_le32(data_len);
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}
static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
u32 data_len)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
sg->addr = 0;
sg->length = cpu_to_le32(data_len);
sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
NVME_SGL_FMT_TRANSPORT_A;
}
static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
struct nvme_tcp_queue *queue = &ctrl->queues[0];
struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
struct nvme_command *cmd = &pdu->cmd;
u8 hdgst = nvme_tcp_hdgst_len(queue);
memset(pdu, 0, sizeof(*pdu));
pdu->hdr.type = nvme_tcp_cmd;
if (queue->hdr_digest)
pdu->hdr.flags |= NVME_TCP_F_HDGST;
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
cmd->common.opcode = nvme_admin_async_event;
cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
cmd->common.flags |= NVME_CMD_SGL_METABUF;
nvme_tcp_set_sg_null(cmd);
ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
ctrl->async_req.offset = 0;
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
nvmf_complete_timed_out_request(rq);
}
static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
int qid = nvme_tcp_queue_id(req->queue);
dev_warn(ctrl->device,
"queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
opc, nvme_opcode_str(qid, opc, fctype));
if (ctrl->state != NVME_CTRL_LIVE) {
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
* teardown or setup sequence
* - ctrl disable/shutdown fabrics requests
* - connect requests
* - initialization admin requests
* - I/O requests that entered after unquiescing and
* the controller stopped responding
*
* All other requests should be cancelled by the error
* recovery work, so it's fine that we fail it here.
*/
nvme_tcp_complete_timed_out(rq);
return BLK_EH_DONE;
}
/*
* LIVE state should trigger the normal error recovery which will
* handle completing this request.
*/
nvme_tcp_error_recovery(ctrl);
return BLK_EH_RESET_TIMER;
}
static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_command *c = &pdu->cmd;
c->common.flags |= NVME_CMD_SGL_METABUF;
if (!blk_rq_nr_phys_segments(rq))
nvme_tcp_set_sg_null(c);
else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(req))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
nvme_tcp_set_sg_host_data(c, req->data_len);
return 0;
}
static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
blk_status_t ret;
ret = nvme_setup_cmd(ns, rq);
if (ret)
return ret;
req->state = NVME_TCP_SEND_CMD_PDU;
req->status = cpu_to_le16(NVME_SC_SUCCESS);
req->offset = 0;
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
req->h2cdata_left = 0;
req->data_len = blk_rq_nr_phys_segments(rq) ?
blk_rq_payload_bytes(rq) : 0;
req->curr_bio = rq->bio;
nvme-tcp: fix crash triggered with a dataless request submission write-zeros has a bio, but does not have any data buffers associated with it. Hence should not initialize the request iter for it (which attempts to reference the bi_io_vec (and crash). -- run blktests nvme/012 at 2021-02-05 21:53:34 BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 15 PID: 12069 Comm: kworker/15:2H Tainted: G S I 5.11.0-rc6+ #1 Hardware name: Dell Inc. PowerEdge R640/06NR82, BIOS 2.10.0 11/12/2020 Workqueue: kblockd blk_mq_run_work_fn RIP: 0010:nvme_tcp_init_iter+0x7d/0xd0 [nvme_tcp] RSP: 0018:ffffbd084447bd18 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffa0bba9f3ce80 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000002000000 RBP: ffffa0ba8ac6fec0 R08: 0000000002000000 R09: 0000000000000000 R10: 0000000002800809 R11: 0000000000000000 R12: 0000000000000000 R13: ffffa0bba9f3cf90 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffffa0c9ff9c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 00000001c9c6c005 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: nvme_tcp_queue_rq+0xef/0x330 [nvme_tcp] blk_mq_dispatch_rq_list+0x11c/0x7c0 ? blk_mq_flush_busy_ctxs+0xf6/0x110 __blk_mq_sched_dispatch_requests+0x12b/0x170 blk_mq_sched_dispatch_requests+0x30/0x60 __blk_mq_run_hw_queue+0x2b/0x60 process_one_work+0x1cb/0x360 ? process_one_work+0x360/0x360 worker_thread+0x30/0x370 ? process_one_work+0x360/0x360 kthread+0x116/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x1f/0x30 -- Fixes: cb9b870fba3e ("nvme-tcp: fix wrong setting of request iov_iter") Reported-by: Yi Zhang <yi.zhang@redhat.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Keith Busch <kbusch@kernel.org> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> Tested-by: Yi Zhang <yi.zhang@redhat.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-02-11 06:04:00 +08:00
if (req->curr_bio && req->data_len)
nvme_tcp_init_iter(req, rq_data_dir(rq));
if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(req))
req->pdu_len = req->data_len;
pdu->hdr.type = nvme_tcp_cmd;
pdu->hdr.flags = 0;
if (queue->hdr_digest)
pdu->hdr.flags |= NVME_TCP_F_HDGST;
if (queue->data_digest && req->pdu_len) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
ddgst = nvme_tcp_ddgst_len(queue);
}
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
pdu->hdr.plen =
cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
ret = nvme_tcp_map_data(queue, rq);
if (unlikely(ret)) {
nvme_cleanup_cmd(rq);
dev_err(queue->ctrl->ctrl.device,
"Failed to map data (%d)\n", ret);
return ret;
}
return 0;
}
static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
if (!llist_empty(&queue->req_list))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_tcp_queue *queue = hctx->driver_data;
struct request *rq = bd->rq;
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
blk_status_t ret;
if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
ret = nvme_tcp_setup_cmd_pdu(ns, rq);
if (unlikely(ret))
return ret;
nvme_start_request(rq);
nvme_tcp_queue_request(req, true, bd->last);
return BLK_STS_OK;
}
static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
/* separate read/write queues */
set->map[HCTX_TYPE_DEFAULT].nr_queues =
ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
set->map[HCTX_TYPE_READ].nr_queues =
ctrl->io_queues[HCTX_TYPE_READ];
set->map[HCTX_TYPE_READ].queue_offset =
ctrl->io_queues[HCTX_TYPE_DEFAULT];
} else {
/* shared read/write queues */
set->map[HCTX_TYPE_DEFAULT].nr_queues =
ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
set->map[HCTX_TYPE_READ].nr_queues =
ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_READ].queue_offset = 0;
}
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
/* map dedicated poll queues only if we have queues left */
set->map[HCTX_TYPE_POLL].nr_queues =
ctrl->io_queues[HCTX_TYPE_POLL];
set->map[HCTX_TYPE_POLL].queue_offset =
ctrl->io_queues[HCTX_TYPE_DEFAULT] +
ctrl->io_queues[HCTX_TYPE_READ];
blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
}
dev_info(ctrl->ctrl.device,
"mapped %d/%d/%d default/read/poll queues.\n",
ctrl->io_queues[HCTX_TYPE_DEFAULT],
ctrl->io_queues[HCTX_TYPE_READ],
ctrl->io_queues[HCTX_TYPE_POLL]);
}
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}
static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
struct sockaddr_storage src_addr;
int ret, len;
len = nvmf_get_address(ctrl, buf, size);
mutex_lock(&queue->queue_lock);
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
goto done;
ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
if (ret > 0) {
if (len > 0)
len--; /* strip trailing newline */
len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
(len) ? "," : "", &src_addr);
}
done:
mutex_unlock(&queue->queue_lock);
return len;
}
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
.exit_request = nvme_tcp_exit_request,
.init_hctx = nvme_tcp_init_hctx,
.timeout = nvme_tcp_timeout,
.map_queues = nvme_tcp_map_queues,
.poll = nvme_tcp_poll,
};
static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
.exit_request = nvme_tcp_exit_request,
.init_hctx = nvme_tcp_init_admin_hctx,
.timeout = nvme_tcp_timeout,
};
static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.name = "tcp",
.module = THIS_MODULE,
.flags = NVME_F_FABRICS | NVME_F_BLOCKING,
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
.free_ctrl = nvme_tcp_free_ctrl,
.submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
};
static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
struct nvme_tcp_ctrl *ctrl;
bool found = false;
mutex_lock(&nvme_tcp_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
found = nvmf_ip_options_match(&ctrl->ctrl, opts);
if (found)
break;
}
mutex_unlock(&nvme_tcp_ctrl_mutex);
return found;
}
static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_tcp_ctrl *ctrl;
int ret;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
if (!(opts->mask & NVMF_OPT_TRSVCID)) {
opts->trsvcid =
kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
if (!opts->trsvcid) {
ret = -ENOMEM;
goto out_free_ctrl;
}
opts->mask |= NVMF_OPT_TRSVCID;
}
ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
opts->traddr, opts->trsvcid, &ctrl->addr);
if (ret) {
pr_err("malformed address passed: %s:%s\n",
opts->traddr, opts->trsvcid);
goto out_free_ctrl;
}
if (opts->mask & NVMF_OPT_HOST_TRADDR) {
ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
opts->host_traddr, NULL, &ctrl->src_addr);
if (ret) {
pr_err("malformed src address passed: %s\n",
opts->host_traddr);
goto out_free_ctrl;
}
}
nvme-tcp: allow selecting the network interface for connections In our application, we need a way to force TCP connections to go out a specific IP interface instead of letting Linux select the interface based on the routing tables. Add the 'host-iface' option to allow specifying the interface to use. When the option host-iface is specified, the driver uses the specified interface to set the option SO_BINDTODEVICE on the TCP socket before connecting. This new option is needed in addtion to the existing host-traddr for the following reasons: Specifying an IP interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, simply doesn't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ... link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 100.0.0.100/24 scope global lo valid_lft forever preferred_lft forever 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s3 valid_lft forever preferred_lft forever 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s8 valid_lft forever preferred_lft forever The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 does not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "We can never know how users will configure their systems". By te way, The above configuration is perfectly fine by Linux. The current TCP implementation for host_traddr performs a bind()-before-connect(). This is a common construct to set the source IP address on a TCP socket before connecting. This has no effect on how Linux selects the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. On the other hand, setting the Source IP Address has benefits and should be supported by linux-nvme. In fact, setting the Source IP Address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8 valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever Here we can see that several addresses are associated with interface enp0s8. By default, Linux always selects the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option host_traddr can be used as-is to perform this function. In conclusion, I believe that we need 2 options for TCP connections. One that can be used to specify an interface (host-iface). And one that can be used to set the source address (host-traddr). Users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option is only available for TCP connections. References: [1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/ [2] https://tools.ietf.org/html/rfc1122 Tested both IPv4 and IPv6 connections. Signed-off-by: Martin Belanger <martin.belanger@dell.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
if (opts->mask & NVMF_OPT_HOST_IFACE) {
if (!__dev_get_by_name(&init_net, opts->host_iface)) {
nvme-tcp: allow selecting the network interface for connections In our application, we need a way to force TCP connections to go out a specific IP interface instead of letting Linux select the interface based on the routing tables. Add the 'host-iface' option to allow specifying the interface to use. When the option host-iface is specified, the driver uses the specified interface to set the option SO_BINDTODEVICE on the TCP socket before connecting. This new option is needed in addtion to the existing host-traddr for the following reasons: Specifying an IP interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, simply doesn't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ... link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 100.0.0.100/24 scope global lo valid_lft forever preferred_lft forever 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s3 valid_lft forever preferred_lft forever 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s8 valid_lft forever preferred_lft forever The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 does not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "We can never know how users will configure their systems". By te way, The above configuration is perfectly fine by Linux. The current TCP implementation for host_traddr performs a bind()-before-connect(). This is a common construct to set the source IP address on a TCP socket before connecting. This has no effect on how Linux selects the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. On the other hand, setting the Source IP Address has benefits and should be supported by linux-nvme. In fact, setting the Source IP Address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8 valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever Here we can see that several addresses are associated with interface enp0s8. By default, Linux always selects the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option host_traddr can be used as-is to perform this function. In conclusion, I believe that we need 2 options for TCP connections. One that can be used to specify an interface (host-iface). And one that can be used to set the source address (host-traddr). Users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option is only available for TCP connections. References: [1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/ [2] https://tools.ietf.org/html/rfc1122 Tested both IPv4 and IPv6 connections. Signed-off-by: Martin Belanger <martin.belanger@dell.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
pr_err("invalid interface passed: %s\n",
opts->host_iface);
ret = -ENODEV;
goto out_free_ctrl;
}
}
if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
ret = -EALREADY;
goto out_free_ctrl;
}
ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues) {
ret = -ENOMEM;
goto out_free_ctrl;
}
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
if (ret)
goto out_kfree_queues;
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
WARN_ON_ONCE(1);
ret = -EINTR;
goto out_uninit_ctrl;
}
ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
if (ret)
goto out_uninit_ctrl;
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
mutex_lock(&nvme_tcp_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
mutex_unlock(&nvme_tcp_ctrl_mutex);
return &ctrl->ctrl;
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
out_kfree_queues:
kfree(ctrl->queues);
out_free_ctrl:
kfree(ctrl);
return ERR_PTR(ret);
}
static struct nvmf_transport_ops nvme_tcp_transport = {
.name = "tcp",
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
nvme-tcp: allow selecting the network interface for connections In our application, we need a way to force TCP connections to go out a specific IP interface instead of letting Linux select the interface based on the routing tables. Add the 'host-iface' option to allow specifying the interface to use. When the option host-iface is specified, the driver uses the specified interface to set the option SO_BINDTODEVICE on the TCP socket before connecting. This new option is needed in addtion to the existing host-traddr for the following reasons: Specifying an IP interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, simply doesn't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ... link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 100.0.0.100/24 scope global lo valid_lft forever preferred_lft forever 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s3 valid_lft forever preferred_lft forever 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s8 valid_lft forever preferred_lft forever The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 does not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "We can never know how users will configure their systems". By te way, The above configuration is perfectly fine by Linux. The current TCP implementation for host_traddr performs a bind()-before-connect(). This is a common construct to set the source IP address on a TCP socket before connecting. This has no effect on how Linux selects the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. On the other hand, setting the Source IP Address has benefits and should be supported by linux-nvme. In fact, setting the Source IP Address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8 valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever Here we can see that several addresses are associated with interface enp0s8. By default, Linux always selects the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option host_traddr can be used as-is to perform this function. In conclusion, I believe that we need 2 options for TCP connections. One that can be used to specify an interface (host-iface). And one that can be used to set the source address (host-traddr). Users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option is only available for TCP connections. References: [1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/ [2] https://tools.ietf.org/html/rfc1122 Tested both IPv4 and IPv6 connections. Signed-off-by: Martin Belanger <martin.belanger@dell.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
.create_ctrl = nvme_tcp_create_ctrl,
};
static int __init nvme_tcp_init_module(void)
{
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!nvme_tcp_wq)
return -ENOMEM;
nvmf_register_transport(&nvme_tcp_transport);
return 0;
}
static void __exit nvme_tcp_cleanup_module(void)
{
struct nvme_tcp_ctrl *ctrl;
nvmf_unregister_transport(&nvme_tcp_transport);
mutex_lock(&nvme_tcp_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
nvme_delete_ctrl(&ctrl->ctrl);
mutex_unlock(&nvme_tcp_ctrl_mutex);
flush_workqueue(nvme_delete_wq);
destroy_workqueue(nvme_tcp_wq);
}
module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);
MODULE_LICENSE("GPL v2");