Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-linus

Pull NVMe updates from Keith:

"This is a collection of bug fixes committed since the previous pull
 request that address deadlocks, double resets, memory leaks, and other
 regressions."

* 'nvme-5.4' of git://git.infradead.org/nvme:
  nvme-pci: Set the prp2 correctly when using more than 4k page
  nvme-tcp: fix possible leakage during error flow
  nvmet-loop: fix possible leakage during error flow
  nvme-tcp: Initialize sk->sk_ll_usec only with NET_RX_BUSY_POLL
  nvme: Wait for reset state when required
  nvme: Prevent resets during paused controller state
  nvme: Restart request timers in resetting state
  nvme: Remove ADMIN_ONLY state
  nvme-pci: Free tagset if no IO queues
  nvme: retain split access workaround for capability reads
  nvme: fix possible deadlock when nvme_update_formats fails
This commit is contained in:
Jens Axboe 2019-10-18 08:49:25 -06:00
commit b55f0097ae
7 changed files with 144 additions and 64 deletions

View File

@ -116,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl)
/*
* Only queue new scan work when admin and IO queues are both alive
*/
if (ctrl->state == NVME_CTRL_LIVE)
if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
/*
 * Schedule reset_work for a controller that has already been placed in the
 * resetting state. Intended for code paths that must not be interrupted by
 * other reset attempts. A hot removal may prevent this from succeeding.
 *
 * Returns 0 when reset_work was queued, or -EBUSY when the controller is not
 * in the resetting state or queue_work() did not queue the work.
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	/* queue_work() is only attempted once the state check has passed. */
	if (ctrl->state == NVME_CTRL_RESETTING &&
	    queue_work(nvme_reset_wq, &ctrl->reset_work))
		return 0;

	return -EBUSY;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@ -137,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
ret = nvme_reset_ctrl(ctrl);
if (!ret) {
flush_work(&ctrl->reset_work);
if (ctrl->state != NVME_CTRL_LIVE &&
ctrl->state != NVME_CTRL_ADMIN_ONLY)
if (ctrl->state != NVME_CTRL_LIVE)
ret = -ENETRESET;
}
@ -315,15 +330,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
old_state = ctrl->state;
switch (new_state) {
case NVME_CTRL_ADMIN_ONLY:
switch (old_state) {
case NVME_CTRL_CONNECTING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
@ -339,7 +345,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
changed = true;
/* FALLTHRU */
default:
@ -359,7 +364,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
@ -381,8 +385,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
}
if (changed)
if (changed) {
ctrl->state = new_state;
wake_up_all(&ctrl->state_wq);
}
spin_unlock_irqrestore(&ctrl->lock, flags);
if (changed && ctrl->state == NVME_CTRL_LIVE)
@ -391,6 +397,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
/*
 * Reports whether the controller sits in a sink state, i.e. one from which it
 * can never transition back to live.
 *
 * Returns false for the new, live, resetting and connecting states, and true
 * for the deleting and dead states. Any state not listed here triggers a
 * one-time warning and is also treated as terminal.
 */
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
	bool terminal = true;

	switch (ctrl->state) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		terminal = false;
		break;
	case NVME_CTRL_DELETING:
	case NVME_CTRL_DEAD:
		break;
	default:
		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
		break;
	}

	return terminal;
}
/*
 * Waits for the controller state to be resetting, or returns false if it is
 * not possible to ever transition to that state.
 *
 * The wait condition itself calls nvme_change_ctrl_state(), so each
 * evaluation actively attempts the transition to NVME_CTRL_RESETTING rather
 * than only observing the state. The sleep on ctrl->state_wq ends once that
 * attempt succeeds or nvme_state_terminal() reports a sink state.
 *
 * Returns true if the controller is in the resetting state when the wait
 * finishes, false otherwise.
 */
bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
wait_event(ctrl->state_wq,
nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
nvme_state_terminal(ctrl));
/* The wait may have ended on a terminal state, so check which case hit. */
return ctrl->state == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);
static void nvme_free_ns_head(struct kref *ref)
{
struct nvme_ns_head *head =
@ -1306,8 +1345,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
if (ns->disk && nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
up_read(&ctrl->namespaces_rwsem);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
}
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@ -1323,6 +1360,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
@ -2874,7 +2912,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
switch (ctrl->state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
break;
default:
return -EWOULDBLOCK;
@ -3168,7 +3205,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
[NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
@ -3679,11 +3715,10 @@ static void nvme_scan_work(struct work_struct *work)
struct nvme_id_ctrl *id;
unsigned nn;
if (ctrl->state != NVME_CTRL_LIVE)
/* No tagset on a live ctrl means IO queues could not be created */
if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
return;
WARN_ON_ONCE(!ctrl->tagset);
if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
dev_info(ctrl->device, "rescanning namespaces.\n");
nvme_clear_changed_ns_log(ctrl);
@ -3844,13 +3879,13 @@ static void nvme_fw_act_work(struct work_struct *work)
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
"Fw activation timeout, reset controller\n");
nvme_reset_ctrl(ctrl);
break;
nvme_try_sched_reset(ctrl);
return;
}
msleep(100);
}
if (ctrl->state != NVME_CTRL_LIVE)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
nvme_start_queues(ctrl);
@ -3870,7 +3905,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
/*
* We are (ab)using the RESETTING state to prevent subsequent
* recovery actions from interfering with the controller's
* firmware activation.
*/
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
@ -3993,6 +4034,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));

View File

@ -182,8 +182,7 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
if (likely(ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_ADMIN_ONLY))
if (likely(ctrl->state == NVME_CTRL_LIVE))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}

View File

@ -15,6 +15,7 @@
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
#include <linux/wait.h>
#include <trace/events/block.h>
@ -161,7 +162,6 @@ static inline u16 nvme_req_qid(struct request *req)
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
@ -199,6 +199,7 @@ struct nvme_ctrl {
struct cdev cdev;
struct work_struct reset_work;
struct work_struct delete_work;
wait_queue_head_t state_wq;
struct nvme_subsystem *subsys;
struct list_head subsys_entry;
@ -449,6 +450,7 @@ void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
bool nvme_wait_reset(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
@ -499,6 +501,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,

View File

@ -773,7 +773,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
struct bio_vec *bv)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
unsigned int first_prp_len = dev->ctrl.page_size - offset;
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->first_dma))
@ -2263,10 +2264,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
/*
* return error value only when tagset allocation failed
*/
static int nvme_dev_add(struct nvme_dev *dev)
static void nvme_dev_add(struct nvme_dev *dev)
{
int ret;
@ -2296,7 +2294,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
return ret;
return;
}
dev->ctrl.tagset = &dev->tagset;
} else {
@ -2307,7 +2305,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
}
nvme_dbbuf_set(dev);
return 0;
}
static int nvme_pci_enable(struct nvme_dev *dev)
@ -2467,6 +2464,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
mutex_unlock(&dev->shutdown_lock);
}
/*
 * Move the controller into the resetting state and then disable the device.
 *
 * @dev:      device to disable
 * @shutdown: forwarded unchanged to nvme_dev_disable()
 *
 * Returns 0 after the device has been disabled, or -EBUSY (without disabling
 * anything) when nvme_wait_reset() reports that the resetting state cannot be
 * reached.
 */
static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (nvme_wait_reset(&dev->ctrl)) {
		nvme_dev_disable(dev, shutdown);
		return 0;
	}

	return -EBUSY;
}
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
@ -2490,14 +2495,20 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
/*
 * Release the IO tag set, if one was allocated, and detach it from the
 * controller so that ctrl.tagset no longer points at it.
 */
static void nvme_free_tagset(struct nvme_dev *dev)
{
	/* Only a tag set that has tags allocated is handed to blk-mq. */
	if (dev->tagset.tags != NULL) {
		blk_mq_free_tag_set(&dev->tagset);
	}

	/* Cleared unconditionally, whether or not anything was freed. */
	dev->ctrl.tagset = NULL;
}
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
nvme_dbbuf_dma_free(dev);
put_device(dev->dev);
if (dev->tagset.tags)
blk_mq_free_tag_set(&dev->tagset);
nvme_free_tagset(dev);
if (dev->ctrl.admin_q)
blk_put_queue(dev->ctrl.admin_q);
kfree(dev->queues);
@ -2508,6 +2519,11 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
/*
* Set state to deleting now to avoid blocking nvme_wait_reset(), which
* may be holding this pci_dev's device lock.
*/
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
nvme_kill_queues(&dev->ctrl);
@ -2521,7 +2537,6 @@ static void nvme_reset_work(struct work_struct *work)
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
result = -ENODEV;
@ -2615,13 +2630,11 @@ static void nvme_reset_work(struct work_struct *work)
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
new_state = NVME_CTRL_ADMIN_ONLY;
nvme_free_tagset(dev);
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
/* hit this only when allocate tagset fails */
if (nvme_dev_add(dev))
new_state = NVME_CTRL_ADMIN_ONLY;
nvme_dev_add(dev);
nvme_unfreeze(&dev->ctrl);
}
@ -2629,9 +2642,9 @@ static void nvme_reset_work(struct work_struct *work)
* If only the admin queue is live, keep it for further investigation or
* recovery.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
"failed to mark controller state %d\n", new_state);
"failed to mark controller live state\n");
result = -ENODEV;
goto out;
}
@ -2672,7 +2685,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
*val = readq(to_nvme_dev(ctrl)->bar + off);
*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
return 0;
}
@ -2836,19 +2849,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_reset_prepare(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_dev_disable(dev, false);
/*
* We don't need to check the return value from waiting for the reset
* state as pci_dev device lock is held, making it impossible to race
* with ->remove().
*/
nvme_disable_prepare_reset(dev, false);
nvme_sync_queues(&dev->ctrl);
}
static void nvme_reset_done(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_reset_ctrl_sync(&dev->ctrl);
if (!nvme_try_sched_reset(&dev->ctrl))
flush_work(&dev->ctrl.reset_work);
}
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_dev_disable(dev, true);
nvme_disable_prepare_reset(dev, true);
}
/*
@ -2901,7 +2923,7 @@ static int nvme_resume(struct device *dev)
if (ndev->last_ps == U32_MAX ||
nvme_set_power_state(ctrl, ndev->last_ps) != 0)
nvme_reset_ctrl(ctrl);
return nvme_try_sched_reset(&ndev->ctrl);
return 0;
}
@ -2929,17 +2951,14 @@ static int nvme_suspend(struct device *dev)
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) {
nvme_dev_disable(ndev, true);
return 0;
}
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
return nvme_disable_prepare_reset(ndev, true);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
nvme_sync_queues(ctrl);
if (ctrl->state != NVME_CTRL_LIVE &&
ctrl->state != NVME_CTRL_ADMIN_ONLY)
if (ctrl->state != NVME_CTRL_LIVE)
goto unfreeze;
ret = nvme_get_power_state(ctrl, &ndev->last_ps);
@ -2965,9 +2984,8 @@ static int nvme_suspend(struct device *dev)
* Clearing npss forces a controller reset on resume. The
* correct value will be rediscovered then.
*/
nvme_dev_disable(ndev, true);
ret = nvme_disable_prepare_reset(ndev, true);
ctrl->npss = 0;
ret = 0;
}
unfreeze:
nvme_unfreeze(ctrl);
@ -2977,9 +2995,7 @@ unfreeze:
static int nvme_simple_suspend(struct device *dev)
{
struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
nvme_dev_disable(ndev, true);
return 0;
return nvme_disable_prepare_reset(ndev, true);
}
static int nvme_simple_resume(struct device *dev)
@ -2987,8 +3003,7 @@ static int nvme_simple_resume(struct device *dev)
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);
nvme_reset_ctrl(&ndev->ctrl);
return 0;
return nvme_try_sched_reset(&ndev->ctrl);
}
static const struct dev_pm_ops nvme_dev_pm_ops = {

View File

@ -1701,6 +1701,14 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
rq->tag, nvme_rdma_queue_idx(queue));
/*
* Restart the timer if a controller reset is already scheduled. Any
* timed out commands would be handled before entering the connecting
* state.
*/
if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
return BLK_EH_RESET_TIMER;
if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
/*
* Teardown immediately if controller times out while starting

View File

@ -1386,7 +1386,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
#ifdef CONFIG_NET_RX_BUSY_POLL
queue->sock->sk->sk_ll_usec = 1;
#endif
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
@ -2044,6 +2046,14 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
/*
* Restart the timer if a controller reset is already scheduled. Any
* timed out commands would be handled before entering the connecting
* state.
*/
if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
return BLK_EH_RESET_TIMER;
dev_warn(ctrl->ctrl.device,
"queue %d: timeout request %#x type %d\n",
nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
@ -2126,6 +2136,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
ret = nvme_tcp_map_data(queue, rq);
if (unlikely(ret)) {
nvme_cleanup_cmd(rq);
dev_err(queue->ctrl->ctrl.device,
"Failed to map data (%d)\n", ret);
return ret;

View File

@ -157,8 +157,10 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
iod->sg_table.sgl = iod->first_sgl;
if (sg_alloc_table_chained(&iod->sg_table,
blk_rq_nr_phys_segments(req),
iod->sg_table.sgl, SG_CHUNK_SIZE))
iod->sg_table.sgl, SG_CHUNK_SIZE)) {
nvme_cleanup_cmd(req);
return BLK_STS_RESOURCE;
}
iod->req.sg = iod->sg_table.sgl;
iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);