// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>

#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define NVME_MINORS		(1U << MINORBITS)

struct nvme_ns_info {
	struct nvme_ns_ids ids;
	u32 nsid;
	__le32 anagrpid;
	bool is_shared;
	bool is_readonly;
	bool is_ready;
	bool is_removed;
};

unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned int nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");

static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
		 "max power saving latency for new devices; use PM QOS to change per device");

static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");

static unsigned long apst_primary_timeout_ms = 100;
module_param(apst_primary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_primary_timeout_ms,
	"primary APST timeout in ms");

static unsigned long apst_secondary_timeout_ms = 2000;
module_param(apst_secondary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_timeout_ms,
	"secondary APST timeout in ms");

static unsigned long apst_primary_latency_tol_us = 15000;
module_param(apst_primary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_primary_latency_tol_us,
	"primary APST latency tolerance in us");

static unsigned long apst_secondary_latency_tol_us = 100000;
module_param(apst_secondary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_latency_tol_us,
	"secondary APST latency tolerance in us");

/*
 * nvme_wq - hosts nvme related works that are not reset or delete
 * nvme_reset_wq - hosts nvme reset works
 * nvme_delete_wq - hosts nvme delete works
 *
 * nvme_wq will host works such as scan, aen handling, fw activation,
 * keep-alive, periodic reconnects etc. nvme_reset_wq
 * runs reset works which also flush works hosted on nvme_wq for
 * serialization purposes. nvme_delete_wq hosts controller deletion
 * works which flush reset works for serialization.
 */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);

struct workqueue_struct *nvme_reset_wq;
EXPORT_SYMBOL_GPL(nvme_reset_wq);

struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);

static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);

static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;

static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
static struct class *nvme_ns_chr_class;

static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					   unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
				   struct nvme_command *cmd);

void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	/*
	 * Only queue new scan work when admin and IO queues are both alive
	 */
	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
		queue_work(nvme_wq, &ctrl->scan_work);
}

/*
 * Use this function to proceed with scheduling reset_work for a controller
 * that had previously been set to the resetting state. This is intended for
 * code paths that can't be interrupted by other reset attempts. A hot removal
 * may prevent this from succeeding.
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	if (ctrl->state != NVME_CTRL_RESETTING)
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);

static void nvme_failfast_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, failfast_work);

	if (ctrl->state != NVME_CTRL_CONNECTING)
		return;

	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	dev_info(ctrl->device, "failfast expired\n");
	nvme_kick_requeue_lists(ctrl);
}

static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
{
	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
		return;

	schedule_delayed_work(&ctrl->failfast_work,
			      ctrl->opts->fast_io_fail_tmo * HZ);
}

static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
{
	if (!ctrl->opts)
		return;

	cancel_delayed_work_sync(&ctrl->failfast_work);
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
}

int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);

int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE)
			ret = -ENETRESET;
	}

	return ret;
}

static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
	dev_info(ctrl->device,
		 "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));

	flush_work(&ctrl->reset_work);
	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);
	nvme_uninit_ctrl(ctrl);
}

static void nvme_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_do_delete_ctrl(ctrl);
}

int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		return -EBUSY;
	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);

static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
	/*
	 * Keep a reference until nvme_do_delete_ctrl() completes,
	 * since ->delete_ctrl can free the controller.
	 */
	nvme_get_ctrl(ctrl);
	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		nvme_do_delete_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

static blk_status_t nvme_error_status(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_SUCCESS:
		return BLK_STS_OK;
	case NVME_SC_CAP_EXCEEDED:
		return BLK_STS_NOSPC;
	case NVME_SC_LBA_RANGE:
	case NVME_SC_CMD_INTERRUPTED:
	case NVME_SC_NS_NOT_READY:
		return BLK_STS_TARGET;
	case NVME_SC_BAD_ATTRIBUTES:
	case NVME_SC_ONCS_NOT_SUPPORTED:
	case NVME_SC_INVALID_OPCODE:
	case NVME_SC_INVALID_FIELD:
	case NVME_SC_INVALID_NS:
		return BLK_STS_NOTSUPP;
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_UNWRITTEN_BLOCK:
	case NVME_SC_ACCESS_DENIED:
	case NVME_SC_READ_ONLY:
	case NVME_SC_COMPARE_FAILED:
		return BLK_STS_MEDIUM;
	case NVME_SC_GUARD_CHECK:
	case NVME_SC_APPTAG_CHECK:
	case NVME_SC_REFTAG_CHECK:
	case NVME_SC_INVALID_PI:
		return BLK_STS_PROTECTION;
	case NVME_SC_RESERVATION_CONFLICT:
		return BLK_STS_NEXUS;
	case NVME_SC_HOST_PATH_ERROR:
		return BLK_STS_TRANSPORT;
	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
		return BLK_STS_ZONE_ACTIVE_RESOURCE;
	case NVME_SC_ZONE_TOO_MANY_OPEN:
		return BLK_STS_ZONE_OPEN_RESOURCE;
	default:
		return BLK_STS_IOERR;
	}
}

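/*
 * Requeue a failed command, honoring the Command Retry Delay (CRD) value the
 * controller reported in the completion status, if any.
 */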
static void nvme_retry_req(struct request *req)
{
	unsigned long delay = 0;
	u16 crd;

	/* The mask and shift result must be <= 3 */
	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
	if (crd)
		delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;

	nvme_req(req)->retries++;
	blk_mq_requeue_request(req, false);
	blk_mq_delay_kick_requeue_list(req->q, delay);
}

static void nvme_log_error(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	struct nvme_request *nr = nvme_req(req);

	if (ns) {
		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
		       ns->disk ? ns->disk->disk_name : "?",
		       nvme_get_opcode_str(nr->cmd->common.opcode),
		       nr->cmd->common.opcode,
		       (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
		       (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
		       nvme_get_error_status_str(nr->status),
		       nr->status >> 8 & 7,	/* Status Code Type */
		       nr->status & 0xff,	/* Status Code */
		       nr->status & NVME_SC_MORE ? "MORE " : "",
		       nr->status & NVME_SC_DNR  ? "DNR "  : "");
		return;
	}

	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
			   dev_name(nr->ctrl->device),
			   nvme_get_admin_opcode_str(nr->cmd->common.opcode),
			   nr->cmd->common.opcode,
			   nvme_get_error_status_str(nr->status),
			   nr->status >> 8 & 7,	/* Status Code Type */
			   nr->status & 0xff,	/* Status Code */
			   nr->status & NVME_SC_MORE ? "MORE " : "",
			   nr->status & NVME_SC_DNR  ? "DNR "  : "");
}

enum nvme_disposition {
	COMPLETE,
	RETRY,
	FAILOVER,
	AUTHENTICATE,
};

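/*
 * Decide how a completed request should be handled: completed as-is, retried
 * on the same path, failed over to another path (multipath), or retried after
 * re-authentication.
 */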
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
	if (likely(nvme_req(req)->status == 0))
		return COMPLETE;

	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
		return AUTHENTICATE;

	if (blk_noretry_request(req) ||
	    (nvme_req(req)->status & NVME_SC_DNR) ||
	    nvme_req(req)->retries >= nvme_max_retries)
		return COMPLETE;

	if (req->cmd_flags & REQ_NVME_MPATH) {
		if (nvme_is_path_error(nvme_req(req)->status) ||
		    blk_queue_dying(req->q))
			return FAILOVER;
	} else {
		if (blk_queue_dying(req->q))
			return COMPLETE;
	}

	return RETRY;
}

static inline void nvme_end_req_zoned(struct request *req)
{
	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = nvme_lba_to_sect(req->q->queuedata,
			le64_to_cpu(nvme_req(req)->result.u64));
}

static inline void nvme_end_req(struct request *req)
{
	blk_status_t status = nvme_error_status(nvme_req(req)->status);

	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET)))
		nvme_log_error(req);
	nvme_end_req_zoned(req);
	nvme_trace_bio_complete(req);
	if (req->cmd_flags & REQ_NVME_MPATH)
		nvme_mpath_end_request(req);
	blk_mq_end_request(req, status);
}

void nvme_complete_rq(struct request *req)
{
	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);

	if (ctrl->kas)
		ctrl->comp_seen = true;

	switch (nvme_decide_disposition(req)) {
	case COMPLETE:
		nvme_end_req(req);
		return;
	case RETRY:
		nvme_retry_req(req);
		return;
	case FAILOVER:
		nvme_failover_req(req);
		return;
	case AUTHENTICATE:
#ifdef CONFIG_NVME_AUTH
		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
		nvme_retry_req(req);
#else
		nvme_end_req(req);
#endif
		return;
	}
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);

void nvme_complete_batch_req(struct request *req)
{
	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);
	nvme_end_req_zoned(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);

/*
 * Called to unwind from ->queue_rq on a failed command submission so that the
 * multipathing code gets called to potentially failover to another path.
 * The caller needs to unwind all transport specific resource allocations and
 * must propagate the return value.
 */
blk_status_t nvme_host_path_error(struct request *req)
{
	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
	blk_mq_set_request_complete(req);
	nvme_complete_rq(req);
	return BLK_STS_OK;
}
EXPORT_SYMBOL_GPL(nvme_host_path_error);

bool nvme_cancel_request(struct request *req, void *data)
{
	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
				"Cancelling I/O %d", req->tag);

	/* don't abort one completed or idle request */
	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
		return true;

	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
	blk_mq_complete_request(req);
	return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);

void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
	if (ctrl->tagset) {
		blk_mq_tagset_busy_iter(ctrl->tagset,
				nvme_cancel_request, ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->tagset);
	}
}
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);

void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
{
	if (ctrl->admin_tagset) {
		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
				nvme_cancel_request, ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
	}
}
EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);

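/*
 * Controller state machine: check that the transition from the current state
 * to @new_state is allowed and, if so, perform it under ctrl->lock.  Returns
 * true if the state was changed.
 */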
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	unsigned long flags;
	bool changed = false;

	spin_lock_irqsave(&ctrl->lock, flags);

	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_CONNECTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING_NOIO:
		switch (old_state) {
		case NVME_CTRL_DELETING:
		case NVME_CTRL_DEAD:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	default:
		break;
	}

	if (changed) {
		ctrl->state = new_state;
		wake_up_all(&ctrl->state_wq);
	}

	spin_unlock_irqrestore(&ctrl->lock, flags);
	if (!changed)
		return false;

	if (ctrl->state == NVME_CTRL_LIVE) {
		if (old_state == NVME_CTRL_CONNECTING)
			nvme_stop_failfast_work(ctrl);
		nvme_kick_requeue_lists(ctrl);
	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
		old_state == NVME_CTRL_RESETTING) {
		nvme_start_failfast_work(ctrl);
	}
	return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

/*
 * Returns true for sink states that can't ever transition back to live.
 */
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
	switch (ctrl->state) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	case NVME_CTRL_DELETING:
	case NVME_CTRL_DELETING_NOIO:
	case NVME_CTRL_DEAD:
		return true;
	default:
		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
		return true;
	}
}

/*
 * Waits for the controller state to be resetting, or returns false if it is
 * not possible to ever transition to that state.
 */
bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
	wait_event(ctrl->state_wq,
		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
		   nvme_state_terminal(ctrl));
	return ctrl->state == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);

static void nvme_free_ns_head(struct kref *ref)
{
	struct nvme_ns_head *head =
		container_of(ref, struct nvme_ns_head, ref);

	nvme_mpath_remove_disk(head);
	ida_free(&head->subsys->ns_ida, head->instance);
	cleanup_srcu_struct(&head->srcu);
	nvme_put_subsystem(head->subsys);
	kfree(head);
}

bool nvme_tryget_ns_head(struct nvme_ns_head *head)
{
	return kref_get_unless_zero(&head->ref);
}

void nvme_put_ns_head(struct nvme_ns_head *head)
{
	kref_put(&head->ref, nvme_free_ns_head);
}

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	put_disk(ns->disk);
	nvme_put_ns_head(ns->head);
	nvme_put_ctrl(ns->ctrl);
	kfree(ns);
}

static inline bool nvme_get_ns(struct nvme_ns *ns)
{
	return kref_get_unless_zero(&ns->kref);
}

void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);

static inline void nvme_clear_nvme_request(struct request *req)
{
	nvme_req(req)->status = 0;
	nvme_req(req)->retries = 0;
	nvme_req(req)->flags = 0;
	req->rq_flags |= RQF_DONTPREP;
}

/* initialize a passthrough request */
void nvme_init_request(struct request *req, struct nvme_command *cmd)
{
	if (req->q->queuedata)
		req->timeout = NVME_IO_TIMEOUT;
	else /* no queuedata implies admin queue */
		req->timeout = NVME_ADMIN_TIMEOUT;

	/* passthru commands should let the driver set the SGL flags */
	cmd->common.flags &= ~NVME_CMD_SGL_ALL;

	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	if (req->mq_hctx->type == HCTX_TYPE_POLL)
		req->cmd_flags |= REQ_POLLED;
	nvme_clear_nvme_request(req);
	req->rq_flags |= RQF_QUIET;
	memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
}
EXPORT_SYMBOL_GPL(nvme_init_request);

/*
 * For something we're not in a state to send to the device the default action
 * is to busy it and retry it after the controller state is recovered. However,
 * if the controller is deleting or if anything is marked for failfast or
 * nvme multipath it is immediately failed.
 *
 * Note: commands used to initialize the controller will be marked for failfast.
 * Note: nvme cli/ioctl commands are marked for failfast.
 */
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
		struct request *rq)
{
	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
	    ctrl->state != NVME_CTRL_DELETING &&
	    ctrl->state != NVME_CTRL_DEAD &&
	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
		return BLK_STS_RESOURCE;
	return nvme_host_path_error(rq);
}
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);

bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		bool queue_live)
{
	struct nvme_request *req = nvme_req(rq);

	/*
	 * currently we have a problem sending passthru commands
	 * on the admin_q if the controller is not LIVE because we can't
	 * make sure that they are going out after the admin connect,
	 * controller enable and/or other commands in the initialization
	 * sequence. Until the controller is LIVE, fail with
	 * BLK_STS_RESOURCE so that they will be rescheduled.
	 */
	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
		return false;

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		/*
		 * Only allow commands on a live queue, except for the connect
		 * command, which is required to set the queue live in the
		 * appropriate states.
		 */
		switch (ctrl->state) {
		case NVME_CTRL_CONNECTING:
			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
				return true;
			break;
		default:
			break;
		case NVME_CTRL_DEAD:
			return false;
		}
	}

	return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);

static inline void nvme_setup_flush(struct nvme_ns *ns,
		struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}

static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmnd)
{
	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
	struct nvme_dsm_range *range;
	struct bio *bio;

	/*
	 * Some devices do not consider the DSM 'Number of Ranges' field when
	 * determining how much data to DMA. Always allocate memory for maximum
	 * number of segments to prevent device reading beyond end of buffer.
	 */
	static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;

	range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!range) {
		/*
		 * If we fail to allocate our range, fall back to the
		 * controller discard page. If that's also busy, it's safe to
		 * return busy, as we know we can make progress once that's
		 * freed.
		 */
		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
			return BLK_STS_RESOURCE;

		range = page_address(ns->ctrl->discard_page);
	}

	if (queue_max_discard_segments(req->q) == 1) {
		u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
		u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);

		range[0].cattr = cpu_to_le32(0);
		range[0].nlb = cpu_to_le32(nlb);
		range[0].slba = cpu_to_le64(slba);
		n = 1;
	} else {
		__rq_for_each_bio(bio, req) {
			u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
			u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;

			if (n < segments) {
				range[n].cattr = cpu_to_le32(0);
				range[n].nlb = cpu_to_le32(nlb);
				range[n].slba = cpu_to_le64(slba);
			}
			n++;
		}
	}

	if (WARN_ON_ONCE(n != segments)) {
		if (virt_to_page(range) == ns->ctrl->discard_page)
			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
		else
			kfree(range);
		return BLK_STS_IOERR;
	}

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->dsm.nr = cpu_to_le32(segments - 1);
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	bvec_set_virt(&req->special_vec, range, alloc_size);
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return BLK_STS_OK;
}

static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
		struct request *req)
{
	u32 upper, lower;
	u64 ref48;

	/* both rw and write zeroes share the same reftag format */
	switch (ns->guard_type) {
	case NVME_NVM_NS_16B_GUARD:
		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
		break;
	case NVME_NVM_NS_64B_GUARD:
		ref48 = ext_pi_ref_tag(req);
		lower = lower_32_bits(ref48);
		upper = upper_32_bits(ref48);

		cmnd->rw.reftag = cpu_to_le32(lower);
		cmnd->rw.cdw3 = cpu_to_le32(upper);
		break;
	default:
		break;
	}
}

static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));

	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
		return nvme_setup_discard(ns, req, cmnd);

	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->write_zeroes.slba =
		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
	cmnd->write_zeroes.length =
		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

	if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
		cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);

	if (nvme_ns_has_pi(ns)) {
		cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);

		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			nvme_set_ref_tag(ns, cmnd, req);
			break;
		}
	}

	return BLK_STS_OK;
}

static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd,
		enum nvme_opcode op)
{
	u16 control = 0;
	u32 dsmgmt = 0;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd->rw.opcode = op;
	cmnd->rw.flags = 0;
	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->rw.cdw2 = 0;
	cmnd->rw.cdw3 = 0;
	cmnd->rw.metadata = 0;
	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
	cmnd->rw.reftag = 0;
	cmnd->rw.apptag = 0;
	cmnd->rw.appmask = 0;

	if (ns->ms) {
		/*
		 * If formatted with metadata, the block layer always provides
		 * a metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.
		 * Else we enable the PRACT bit for protection information or
		 * set the namespace capacity to zero to prevent any I/O.
		 */
		if (!blk_integrity_rq(req)) {
			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
				return BLK_STS_NOTSUPP;
			control |= NVME_RW_PRINFO_PRACT;
		}

		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			if (op == nvme_cmd_zone_append)
				control |= NVME_RW_APPEND_PIREMAP;
			nvme_set_ref_tag(ns, cmnd, req);
			break;
		}
	}

	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
	return 0;
}

void nvme_cleanup_cmd(struct request *req)
{
	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

		if (req->special_vec.bv_page == ctrl->discard_page)
			clear_bit_unlock(0, &ctrl->discard_page_busy);
		else
			kfree(bvec_virt(&req->special_vec));
	}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);

blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
{
	struct nvme_command *cmd = nvme_req(req)->cmd;
	blk_status_t ret = BLK_STS_OK;

	if (!(req->rq_flags & RQF_DONTPREP))
		nvme_clear_nvme_request(req);

	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		/* these are setup prior to execution in nvme_init_request() */
		break;
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_RESET:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
		break;
	case REQ_OP_WRITE_ZEROES:
		ret = nvme_setup_write_zeroes(ns, req, cmd);
		break;
	case REQ_OP_DISCARD:
		ret = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_READ:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
		break;
	case REQ_OP_WRITE:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
		break;
	case REQ_OP_ZONE_APPEND:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = nvme_cid(req);
	trace_nvme_setup_cmd(req, cmd);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);

/*
 * Return values:
 * 0:  success
 * >0: nvme controller's cqe status response
 * <0: kernel error in lieu of controller response
 */
int nvme_execute_rq(struct request *rq, bool at_head)
{
	blk_status_t status;

	status = blk_execute_rq(rq, at_head);
	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		return -EINTR;
	if (nvme_req(rq)->status)
		return nvme_req(rq)->status;
	return blk_status_to_errno(status);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		union nvme_result *result, void *buffer, unsigned bufflen,
		int qid, int at_head, blk_mq_req_flags_t flags)
{
	struct request *req;
	int ret;

	if (qid == NVME_QID_ANY)
		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
	else
		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
nvme: fix qid param blk_mq_alloc_request_hctx
Only caller of the __nvme_submit_sync_cmd() with qid value not equal to
NVME_QID_ANY is nvmf_connect_io_queues(), where qid value is alway set
to > 0.
[1] __nvme_submit_sync_cmd() callers with qid parameter from :-
Caller | qid parameter
------------------------------------------------------
* nvme_fc_connect_io_queues() |
nvmf_connect_io_queue() | qid > 0
* nvme_rdma_start_io_queues() |
nvme_rdma_start_queue() |
nvmf_connect_io_queues() | qid > 0
* nvme_tcp_start_io_queues() |
nvme_tcp_start_queue() |
nvmf_connect_io_queues() | qid > 0
* nvme_loop_connect_io_queues() |
nvmf_connect_io_queues() | qid > 0
When qid value of the function parameter __nvme_submit_sync_cmd() is > 0
from above callers, we use blk_mq_alloc_request_hctx(), where we pass
last parameter as 0 if qid functional parameter value is set to 0 with
conditional operators, see 1002 :-
991 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
992 union nvme_result *result, void *buffer, unsigned bufflen,
993 int qid, int at_head, blk_mq_req_flags_t flags)
994 {
995 struct request *req;
996 int ret;
997
998 if (qid == NVME_QID_ANY)
999 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
1000 else
1001 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
1002 qid ? qid - 1 : 0);
1003
But the qid parameter value of __nvme_submit_sync_cmd() will never be 0
from the caller list above (see [1]), and all the other callers of
__nvme_submit_sync_cmd() use NVME_QID_ANY as the qid value :-
1. nvme_submit_sync_cmd()
2. nvme_features()
3. nvme_sec_submit()
4. nvmf_reg_read32()
5. nvmf_reg_read64()
6. nvmf_ref_write32()
7. nvmf_connect_admin_queue()
Remove the conditional operator that passes the qid as 0 in the call to
blk_mq_alloc_request_hctx().
Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2022-06-07 09:16:43 +08:00
|
|
|
qid - 1);
|
2022-03-15 22:53:59 +08:00
|
|
|
|
2015-11-20 16:00:02 +08:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2022-03-15 22:53:59 +08:00
|
|
|
nvme_init_request(req, cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
2015-11-26 16:08:36 +08:00
|
|
|
if (buffer && bufflen) {
|
|
|
|
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2015-11-20 16:00:02 +08:00
|
|
|
}
|
|
|
|
|
2022-01-22 13:05:39 +08:00
|
|
|
ret = nvme_execute_rq(req, at_head);
|
2021-06-11 05:44:37 +08:00
|
|
|
if (result && ret >= 0)
|
2016-11-10 23:32:33 +08:00
|
|
|
*result = nvme_req(req)->result;
|
2015-11-20 16:00:02 +08:00
|
|
|
out:
|
|
|
|
blk_mq_free_request(req);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-06-13 22:45:23 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
|
|
|
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
|
|
|
|
void *buffer, unsigned bufflen)
|
|
|
|
{
|
2022-06-07 09:16:42 +08:00
|
|
|
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
|
2021-06-11 05:44:35 +08:00
|
|
|
NVME_QID_ANY, 0, 0);
|
2015-11-20 16:00:02 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
2020-07-25 01:25:13 +08:00
|
|
|
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
|
|
|
|
{
|
|
|
|
u32 effects = 0;
|
|
|
|
|
|
|
|
if (ns) {
|
2023-01-28 00:56:19 +08:00
|
|
|
effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
|
2020-07-25 01:25:13 +08:00
|
|
|
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
|
2021-03-18 04:33:41 +08:00
|
|
|
dev_warn_once(ctrl->device,
|
2022-12-21 17:12:17 +08:00
|
|
|
"IO command:%02x has unusual effects:%08x\n",
|
2021-03-18 04:33:41 +08:00
|
|
|
opcode, effects);
|
2020-07-25 01:25:13 +08:00
|
|
|
|
2022-12-21 17:12:17 +08:00
|
|
|
/*
|
|
|
|
* NVME_CMD_EFFECTS_CSE_MASK causes a freeze of all I/O queues,
|
|
|
|
* which would deadlock when done on an I/O command. Note that
|
|
|
|
* we already warn about an unusual effect above.
|
|
|
|
*/
|
|
|
|
effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
|
|
|
|
} else {
|
2023-01-28 00:56:19 +08:00
|
|
|
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
|
2022-12-21 17:12:17 +08:00
|
|
|
}
|
2020-07-25 01:25:13 +08:00
|
|
|
|
|
|
|
return effects;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
|
|
|
|
|
2022-12-14 17:13:16 +08:00
|
|
|
u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
|
2020-07-25 01:25:13 +08:00
|
|
|
{
|
|
|
|
u32 effects = nvme_command_effects(ctrl, ns, opcode);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For simplicity, IO to all namespaces is quiesced even if the command
|
|
|
|
* effects say only one namespace is affected.
|
|
|
|
*/
|
2020-09-28 17:10:36 +08:00
|
|
|
if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
|
2020-07-25 01:25:13 +08:00
|
|
|
mutex_lock(&ctrl->scan_lock);
|
|
|
|
mutex_lock(&ctrl->subsys->lock);
|
|
|
|
nvme_mpath_start_freeze(ctrl->subsys);
|
|
|
|
nvme_mpath_wait_freeze(ctrl->subsys);
|
|
|
|
nvme_start_freeze(ctrl);
|
|
|
|
nvme_wait_freeze(ctrl);
|
|
|
|
}
|
|
|
|
return effects;
|
|
|
|
}
|
2022-12-14 17:13:16 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);
|
2020-07-25 01:25:13 +08:00
|
|
|
|
2022-09-20 03:36:46 +08:00
|
|
|
void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
|
|
|
|
struct nvme_command *cmd, int status)
|
2020-07-25 01:25:13 +08:00
|
|
|
{
|
2020-09-28 17:10:36 +08:00
|
|
|
if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
|
2020-07-25 01:25:13 +08:00
|
|
|
nvme_unfreeze(ctrl);
|
|
|
|
nvme_mpath_unfreeze(ctrl->subsys);
|
|
|
|
mutex_unlock(&ctrl->subsys->lock);
|
|
|
|
mutex_unlock(&ctrl->scan_lock);
|
|
|
|
}
|
2022-11-08 22:46:45 +08:00
|
|
|
if (effects & NVME_CMD_EFFECTS_CCC) {
|
|
|
|
dev_info(ctrl->device,
|
|
|
|
"controller capabilities changed, reset may be required to take effect.\n");
|
|
|
|
}
|
2020-07-25 01:25:13 +08:00
|
|
|
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
|
|
|
|
nvme_queue_scan(ctrl);
|
|
|
|
flush_work(&ctrl->scan_work);
|
|
|
|
}
|
2021-09-01 16:23:42 +08:00
|
|
|
|
|
|
|
switch (cmd->common.opcode) {
|
|
|
|
case nvme_admin_set_features:
|
|
|
|
switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
|
|
|
|
case NVME_FEAT_KATO:
|
|
|
|
/*
|
|
|
|
* The keep alive command interval on the host should be
|
|
|
|
* updated when KATO is modified by Set Features
|
|
|
|
* commands.
|
|
|
|
*/
|
|
|
|
if (!status)
|
|
|
|
nvme_update_keep_alive(ctrl, cmd);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2020-07-25 01:25:13 +08:00
|
|
|
}
|
2022-09-20 03:36:46 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
|
2020-07-25 01:25:13 +08:00
|
|
|
|
2021-04-16 19:46:20 +08:00
|
|
|
/*
|
|
|
|
* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
|
|
|
|
*
|
|
|
|
* The host should send Keep Alive commands at half of the Keep Alive Timeout
|
|
|
|
* accounting for transport roundtrip times [..].
|
|
|
|
*/
|
|
|
|
static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
|
2015-11-20 16:00:02 +08:00
|
|
|
{
|
2021-04-16 19:46:20 +08:00
|
|
|
queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
|
2015-11-26 16:08:36 +08:00
|
|
|
}
|
|
|
|
|
2022-09-22 05:19:54 +08:00
|
|
|
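/*
 * Completion handler for the keep-alive request: free the request and, if it
 * succeeded, reschedule the next keep-alive while the controller is still
 * live or connecting.
 */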
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
|
|
|
|
blk_status_t status)
|
2016-06-13 22:45:28 +08:00
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = rq->end_io_data;
|
nvme: validate controller state before rescheduling keep alive
Delete operations are seeing NULL pointer references in call_timer_fn.
Tracking these back, the timer appears to be the keep alive timer.
nvme_keep_alive_work(), which is tied to the timer that is cancelled
by nvme_stop_keep_alive(), simply starts the keep alive io but doesn't
wait for its completion. So nvme_stop_keep_alive() only stops a timer
when it's pending. When a keep alive is in flight, there is no timer
running and nvme_stop_keep_alive() will have no effect on the keep
alive io. Thus, if the io completes successfully, the keep alive timer
will be rescheduled. In the failure case, delete is called, the
controller state is changed, the nvme_stop_keep_alive() is called while
the io is outstanding, and the delete path continues on. The keep
alive happens to successfully complete before the delete paths mark it
as aborted as part of the queue termination, so the timer is restarted.
The delete paths then tear down the controller, and later on the timer
code fires and the timer entry is now corrupt.
Fix by validating the controller state before rescheduling the keep
alive. Testing with the fix has confirmed the condition above was hit.
Signed-off-by: James Smart <jsmart2021@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2018-11-28 09:04:44 +08:00
|
|
|
unsigned long flags;
|
|
|
|
bool startka = false;
|
2016-06-13 22:45:28 +08:00
|
|
|
|
|
|
|
blk_mq_free_request(rq);
|
|
|
|
|
2017-06-03 15:38:04 +08:00
|
|
|
if (status) {
|
2016-06-13 22:45:28 +08:00
|
|
|
dev_err(ctrl->device,
|
2017-06-03 15:38:04 +08:00
|
|
|
"failed nvme_keep_alive_end_io error=%d\n",
|
|
|
|
status);
|
2022-09-22 05:19:54 +08:00
|
|
|
return RQ_END_IO_NONE;
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
2018-11-03 01:28:15 +08:00
|
|
|
ctrl->comp_seen = false;
|
2018-11-28 09:04:44 +08:00
|
|
|
spin_lock_irqsave(&ctrl->lock, flags);
|
|
|
|
if (ctrl->state == NVME_CTRL_LIVE ||
|
|
|
|
ctrl->state == NVME_CTRL_CONNECTING)
|
|
|
|
startka = true;
|
|
|
|
spin_unlock_irqrestore(&ctrl->lock, flags);
|
|
|
|
if (startka)
|
2021-04-16 19:46:20 +08:00
|
|
|
nvme_queue_keep_alive_work(ctrl);
|
2022-09-22 05:19:54 +08:00
|
|
|
return RQ_END_IO_NONE;
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_keep_alive_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
|
|
|
|
struct nvme_ctrl, ka_work);
|
2018-11-03 01:28:15 +08:00
|
|
|
bool comp_seen = ctrl->comp_seen;
|
2021-03-03 20:46:06 +08:00
|
|
|
struct request *rq;
|
2018-11-03 01:28:15 +08:00
|
|
|
|
|
|
|
if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
|
|
|
|
dev_dbg(ctrl->device,
|
|
|
|
"reschedule traffic based keep-alive timer\n");
|
|
|
|
ctrl->comp_seen = false;
|
2021-04-16 19:46:20 +08:00
|
|
|
nvme_queue_keep_alive_work(ctrl);
|
2018-11-03 01:28:15 +08:00
|
|
|
return;
|
|
|
|
}
|
2016-06-13 22:45:28 +08:00
|
|
|
|
2022-03-15 22:53:59 +08:00
|
|
|
rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
|
|
|
|
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
|
2021-03-03 20:46:06 +08:00
|
|
|
if (IS_ERR(rq)) {
|
2016-06-13 22:45:28 +08:00
|
|
|
/* allocation failure, reset the controller */
|
2021-03-03 20:51:47 +08:00
|
|
|
dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
|
2017-06-13 00:21:19 +08:00
|
|
|
nvme_reset_ctrl(ctrl);
|
2016-06-13 22:45:28 +08:00
|
|
|
return;
|
|
|
|
}
|
2022-03-15 22:53:59 +08:00
|
|
|
nvme_init_request(rq, &ctrl->ka_cmd);
|
2021-03-03 20:46:06 +08:00
|
|
|
|
|
|
|
rq->timeout = ctrl->kato * HZ;
|
2022-05-24 20:15:30 +08:00
|
|
|
rq->end_io = nvme_keep_alive_end_io;
|
2021-03-03 20:46:06 +08:00
|
|
|
rq->end_io_data = ctrl;
|
2022-05-24 20:15:30 +08:00
|
|
|
blk_execute_rq_nowait(rq, false);
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
2018-04-12 23:16:05 +08:00
|
|
|
static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
|
2016-06-13 22:45:28 +08:00
|
|
|
{
|
|
|
|
if (unlikely(ctrl->kato == 0))
|
|
|
|
return;
|
|
|
|
|
2021-04-16 19:46:20 +08:00
|
|
|
nvme_queue_keep_alive_work(ctrl);
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (unlikely(ctrl->kato == 0))
|
|
|
|
return;
|
|
|
|
|
|
|
|
cancel_delayed_work_sync(&ctrl->ka_work);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
|
|
|
|
|
2021-09-01 16:23:42 +08:00
|
|
|
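/*
 * A passthrough Set Features (KATO) command changed the keep alive timeout:
 * cdw11 carries the new value in milliseconds, so convert it to seconds and
 * restart the keep-alive work with the updated interval.
 */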
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_command *cmd)
|
|
|
|
{
|
|
|
|
unsigned int new_kato =
|
|
|
|
DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
|
|
|
|
|
|
|
|
dev_info(ctrl->device,
|
|
|
|
"keep alive interval updated from %u ms to %u ms\n",
|
|
|
|
ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
|
|
|
|
|
|
|
|
nvme_stop_keep_alive(ctrl);
|
|
|
|
ctrl->kato = new_kato;
|
|
|
|
nvme_start_keep_alive(ctrl);
|
|
|
|
}
|
|
|
|
|
2020-04-04 16:11:28 +08:00
|
|
|
/*
|
|
|
|
* In NVMe 1.0 the CNS field was just a binary controller or namespace
|
|
|
|
* flag, thus sending any new CNS opcodes has a big chance of not working.
|
|
|
|
* Qemu unfortunately had that bug after reporting a 1.1 version compliance
|
|
|
|
* (but not for any later version).
|
|
|
|
*/
|
|
|
|
static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
|
|
|
|
return ctrl->vs < NVME_VS(1, 2, 0);
|
|
|
|
return ctrl->vs < NVME_VS(1, 1, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-21 03:09:56 +08:00
|
|
|
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
|
2015-11-26 16:08:36 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
|
|
|
|
c.identify.opcode = nvme_admin_identify;
|
2017-01-26 23:17:28 +08:00
|
|
|
c.identify.cns = NVME_ID_CNS_CTRL;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
|
|
|
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
|
|
|
|
if (!*id)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
|
|
|
|
sizeof(struct nvme_id_ctrl));
|
|
|
|
if (error)
|
|
|
|
kfree(*id);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2020-02-20 00:14:31 +08:00
|
|
|
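/*
 * Parse a single namespace identification descriptor (EUI-64, NGUID, UUID or
 * CSI) into @ids.  Returns the descriptor payload length to skip, or -1 if
 * the controller reported a bogus length.
 */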
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
|
2020-06-30 03:06:39 +08:00
|
|
|
struct nvme_ns_id_desc *cur, bool *csi_seen)
|
2020-02-20 00:14:31 +08:00
|
|
|
{
|
|
|
|
const char *warn_str = "ctrl returned bogus length:";
|
|
|
|
void *data = cur;
|
|
|
|
|
|
|
|
switch (cur->nidt) {
|
|
|
|
case NVME_NIDT_EUI64:
|
|
|
|
if (cur->nidl != NVME_NIDT_EUI64_LEN) {
|
|
|
|
dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
|
|
|
|
warn_str, cur->nidl);
|
|
|
|
return -1;
|
|
|
|
}
|
2022-04-11 14:05:27 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
|
|
|
|
return NVME_NIDT_EUI64_LEN;
|
2020-02-20 00:14:31 +08:00
|
|
|
memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
|
|
|
|
return NVME_NIDT_EUI64_LEN;
|
|
|
|
case NVME_NIDT_NGUID:
|
|
|
|
if (cur->nidl != NVME_NIDT_NGUID_LEN) {
|
|
|
|
dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
|
|
|
|
warn_str, cur->nidl);
|
|
|
|
return -1;
|
|
|
|
}
|
2022-04-11 14:05:27 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
|
|
|
|
return NVME_NIDT_NGUID_LEN;
|
2020-02-20 00:14:31 +08:00
|
|
|
memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
|
|
|
|
return NVME_NIDT_NGUID_LEN;
|
|
|
|
case NVME_NIDT_UUID:
|
|
|
|
if (cur->nidl != NVME_NIDT_UUID_LEN) {
|
|
|
|
dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
|
|
|
|
warn_str, cur->nidl);
|
|
|
|
return -1;
|
|
|
|
}
|
2022-04-11 14:05:27 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
|
|
|
|
return NVME_NIDT_UUID_LEN;
|
2020-02-20 00:14:31 +08:00
|
|
|
uuid_copy(&ids->uuid, data + sizeof(*cur));
|
|
|
|
return NVME_NIDT_UUID_LEN;
|
2020-06-30 03:06:39 +08:00
|
|
|
case NVME_NIDT_CSI:
|
|
|
|
if (cur->nidl != NVME_NIDT_CSI_LEN) {
|
|
|
|
dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
|
|
|
|
warn_str, cur->nidl);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
|
|
|
|
*csi_seen = true;
|
|
|
|
return NVME_NIDT_CSI_LEN;
|
2020-02-20 00:14:31 +08:00
|
|
|
default:
|
|
|
|
/* Skip unknown types */
|
|
|
|
return cur->nidl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_ns_info *info)
|
2017-06-07 17:45:34 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
2020-06-30 03:06:39 +08:00
|
|
|
bool csi_seen = false;
|
|
|
|
int status, pos, len;
|
2017-06-07 17:45:34 +08:00
|
|
|
void *data;
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
|
|
|
|
return 0;
|
2020-07-28 19:09:03 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
|
|
|
|
return 0;
|
|
|
|
|
2017-06-07 17:45:34 +08:00
|
|
|
c.identify.opcode = nvme_admin_identify;
|
2022-07-23 00:24:18 +08:00
|
|
|
c.identify.nsid = cpu_to_le32(info->nsid);
|
2017-06-07 17:45:34 +08:00
|
|
|
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
|
|
|
|
|
|
|
|
data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
|
|
|
|
if (!data)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-08-16 22:14:47 +08:00
|
|
|
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
|
2017-06-07 17:45:34 +08:00
|
|
|
NVME_IDENTIFY_DATA_SIZE);
|
2020-03-25 21:19:35 +08:00
|
|
|
if (status) {
|
|
|
|
dev_warn(ctrl->device,
|
2020-11-30 20:47:46 +08:00
|
|
|
"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid, status);
|
2017-06-07 17:45:34 +08:00
|
|
|
goto free_data;
|
2020-03-25 21:19:35 +08:00
|
|
|
}
|
2017-06-07 17:45:34 +08:00
|
|
|
|
|
|
|
for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
|
|
|
|
struct nvme_ns_id_desc *cur = data + pos;
|
|
|
|
|
|
|
|
if (cur->nidl == 0)
|
|
|
|
break;
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
|
2020-02-20 00:14:31 +08:00
|
|
|
if (len < 0)
|
2020-06-30 03:06:39 +08:00
|
|
|
break;
|
2017-06-07 17:45:34 +08:00
|
|
|
|
|
|
|
len += sizeof(*cur);
|
|
|
|
}
|
2020-06-30 03:06:39 +08:00
|
|
|
|
|
|
|
if (nvme_multi_css(ctrl) && !csi_seen) {
|
|
|
|
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid);
|
2020-06-30 03:06:39 +08:00
|
|
|
status = -EINVAL;
|
|
|
|
}
|
|
|
|
|
2017-06-07 17:45:34 +08:00
|
|
|
free_data:
|
|
|
|
kfree(data);
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
2022-07-23 00:24:18 +08:00
|
|
|
struct nvme_id_ns **id)
|
2015-11-26 16:08:36 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
|
2017-01-26 23:17:27 +08:00
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.nsid = cpu_to_le32(nsid);
|
2017-01-26 23:17:28 +08:00
|
|
|
c.identify.cns = NVME_ID_CNS_NS;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-08-03 09:11:42 +08:00
|
|
|
*id = kmalloc(sizeof(**id), GFP_KERNEL);
|
|
|
|
if (!*id)
|
|
|
|
return -ENOMEM;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-08-03 09:11:42 +08:00
|
|
|
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
|
2017-08-16 22:14:47 +08:00
|
|
|
if (error) {
|
2019-04-05 02:57:45 +08:00
|
|
|
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
|
2023-02-22 06:02:25 +08:00
|
|
|
kfree(*id);
|
2017-08-16 22:14:47 +08:00
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
return error;
|
|
|
|
}
|
2022-04-11 14:05:27 +08:00
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
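/*
 * Fill in the namespace info from a regular Identify Namespace data
 * structure; a zero NCAP marks the namespace as removed (not allocated or
 * attached).
 */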
static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_ns_info *info)
|
|
|
|
{
|
|
|
|
struct nvme_ns_ids *ids = &info->ids;
|
|
|
|
struct nvme_id_ns *id;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = nvme_identify_ns(ctrl, info->nsid, &id);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2023-02-22 06:02:25 +08:00
|
|
|
|
|
|
|
if (id->ncap == 0) {
|
|
|
|
/* namespace not allocated or attached */
|
|
|
|
info->is_removed = true;
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
info->anagrpid = id->anagrpid;
|
|
|
|
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
|
2022-07-13 13:40:25 +08:00
|
|
|
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
|
2022-07-23 00:24:18 +08:00
|
|
|
info->is_ready = true;
|
2022-04-11 14:05:27 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
|
|
|
|
dev_info(ctrl->device,
|
|
|
|
"Ignoring bogus Namespace Identifiers\n");
|
|
|
|
} else {
|
|
|
|
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
|
|
|
|
!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
2022-07-23 00:24:18 +08:00
|
|
|
memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
|
2022-04-11 14:05:27 +08:00
|
|
|
if (ctrl->vs >= NVME_VS(1, 2, 0) &&
|
|
|
|
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2022-07-23 00:24:18 +08:00
|
|
|
memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
|
2022-04-11 14:05:27 +08:00
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
kfree(id);
|
2020-09-28 18:33:19 +08:00
|
|
|
return 0;
|
2015-11-26 16:08:36 +08:00
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
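/*
 * Same as above, but based on the command set independent Identify Namespace
 * data structure, which additionally reports whether the namespace is ready.
 */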
static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_ns_info *info)
|
2022-05-16 21:09:21 +08:00
|
|
|
{
|
2022-07-23 00:24:18 +08:00
|
|
|
struct nvme_id_ns_cs_indep *id;
|
2022-05-16 21:09:21 +08:00
|
|
|
struct nvme_command c = {
|
|
|
|
.identify.opcode = nvme_admin_identify,
|
2022-07-23 00:24:18 +08:00
|
|
|
.identify.nsid = cpu_to_le32(info->nsid),
|
2022-05-16 21:09:21 +08:00
|
|
|
.identify.cns = NVME_ID_CNS_NS_CS_INDEP,
|
|
|
|
};
|
|
|
|
int ret;
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
id = kmalloc(sizeof(*id), GFP_KERNEL);
|
|
|
|
if (!id)
|
2022-05-16 21:09:21 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
|
|
|
|
if (!ret) {
|
|
|
|
info->anagrpid = id->anagrpid;
|
|
|
|
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
|
2022-07-13 13:40:25 +08:00
|
|
|
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
|
2022-07-23 00:24:18 +08:00
|
|
|
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
|
2022-05-16 21:09:21 +08:00
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
kfree(id);
|
|
|
|
return ret;
|
2022-05-16 21:09:21 +08:00
|
|
|
}
|
|
|
|
|
2019-05-27 00:29:01 +08:00
|
|
|
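/*
 * Common helper for Get/Set Features: issue the admin command synchronously
 * and pass back the completion result dword via @result if requested.
 */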
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
|
|
|
|
unsigned int dword11, void *buffer, size_t buflen, u32 *result)
|
2015-11-26 16:08:36 +08:00
|
|
|
{
|
2020-02-19 23:59:36 +08:00
|
|
|
union nvme_result res = { 0 };
|
2021-06-17 06:15:52 +08:00
|
|
|
struct nvme_command c = { };
|
2016-02-29 22:59:47 +08:00
|
|
|
int ret;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-05-27 00:29:01 +08:00
|
|
|
c.features.opcode = op;
|
2015-11-26 16:08:36 +08:00
|
|
|
c.features.fid = cpu_to_le32(fid);
|
|
|
|
c.features.dword11 = cpu_to_le32(dword11);
|
|
|
|
|
2016-11-10 23:32:33 +08:00
|
|
|
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
|
2022-06-07 09:16:42 +08:00
|
|
|
buffer, buflen, NVME_QID_ANY, 0, 0);
|
2016-08-24 18:52:12 +08:00
|
|
|
if (ret >= 0 && result)
|
2016-11-10 23:32:33 +08:00
|
|
|
*result = le32_to_cpu(res.u32);
|
2016-02-29 22:59:47 +08:00
|
|
|
return ret;
|
2015-11-26 16:08:36 +08:00
|
|
|
}
|
|
|
|
|
2019-05-27 00:29:01 +08:00
|
|
|
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
|
|
|
|
unsigned int dword11, void *buffer, size_t buflen,
|
|
|
|
u32 *result)
|
|
|
|
{
|
|
|
|
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
|
|
|
|
buflen, result);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_set_features);
|
|
|
|
|
|
|
|
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
|
|
|
|
unsigned int dword11, void *buffer, size_t buflen,
|
|
|
|
u32 *result)
|
|
|
|
{
|
|
|
|
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
|
|
|
|
buflen, result);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_get_features);
|
|
|
|
|
2015-11-26 18:09:06 +08:00
|
|
|
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
|
|
|
|
{
|
|
|
|
u32 q_count = (*count - 1) | ((*count - 1) << 16);
|
|
|
|
u32 result;
|
|
|
|
int status, nr_io_queues;
|
|
|
|
|
2016-09-17 02:16:10 +08:00
|
|
|
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
|
2015-11-26 18:09:06 +08:00
|
|
|
&result);
|
2016-06-07 05:20:50 +08:00
|
|
|
if (status < 0)
|
2015-11-26 18:09:06 +08:00
|
|
|
return status;
|
|
|
|
|
2016-06-07 05:20:50 +08:00
|
|
|
/*
|
|
|
|
* Degraded controllers might return an error when setting the queue
|
|
|
|
* count. We still want to be able to bring them online and offer
|
|
|
|
* access to the admin queue, as that might be the only way to fix them up.
|
|
|
|
*/
|
|
|
|
if (status > 0) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
|
2016-06-07 05:20:50 +08:00
|
|
|
*count = 0;
|
|
|
|
} else {
|
|
|
|
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
|
|
|
|
*count = min(*count, nr_io_queues);
|
|
|
|
}
|
|
|
|
|
2015-11-26 18:09:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
|
2015-11-26 18:09:06 +08:00
|
|
|
|
2018-05-22 17:09:55 +08:00
|
|
|
#define NVME_AEN_SUPPORTED \
|
2019-07-13 02:02:10 +08:00
|
|
|
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
|
|
|
|
NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
|
2018-05-22 17:09:55 +08:00
|
|
|
|
|
|
|
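/*
 * Enable the asynchronous event configurations supported by both the driver
 * and the controller, then queue the work that resubmits the Asynchronous
 * Event Request command.
 */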
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2018-07-03 00:34:38 +08:00
|
|
|
u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
|
2018-05-22 17:09:55 +08:00
|
|
|
int status;
|
|
|
|
|
2018-07-03 00:34:38 +08:00
|
|
|
if (!supported_aens)
|
|
|
|
return;
|
|
|
|
|
|
|
|
status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
|
|
|
|
NULL, 0, &result);
|
2018-05-22 17:09:55 +08:00
|
|
|
if (status)
|
|
|
|
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
|
2018-07-03 00:34:38 +08:00
|
|
|
supported_aens);
|
2019-08-23 02:25:46 +08:00
|
|
|
|
|
|
|
queue_work(nvme_wq, &ctrl->async_event_work);
|
2018-05-22 17:09:55 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 20:36:47 +08:00
|
|
|
static int nvme_ns_open(struct nvme_ns *ns)
|
2020-03-05 19:13:29 +08:00
|
|
|
{
|
|
|
|
|
2017-11-02 19:59:30 +08:00
|
|
|
/* should never be called due to GENHD_FL_HIDDEN */
|
2021-04-07 23:49:29 +08:00
|
|
|
if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
|
2018-01-04 23:56:13 +08:00
|
|
|
goto fail;
|
2021-04-27 14:47:46 +08:00
|
|
|
if (!nvme_get_ns(ns))
|
2018-01-04 23:56:13 +08:00
|
|
|
goto fail;
|
|
|
|
if (!try_module_get(ns->ctrl->ops->module))
|
|
|
|
goto fail_put_ns;
|
|
|
|
|
2017-10-18 19:22:00 +08:00
|
|
|
return 0;
|
2018-01-04 23:56:13 +08:00
|
|
|
|
|
|
|
fail_put_ns:
|
|
|
|
nvme_put_ns(ns);
|
|
|
|
fail:
|
|
|
|
return -ENXIO;
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 20:36:47 +08:00
|
|
|
static void nvme_ns_release(struct nvme_ns *ns)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2018-01-04 23:56:13 +08:00
|
|
|
|
|
|
|
module_put(ns->ctrl->ops->module);
|
|
|
|
nvme_put_ns(ns);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2023-06-08 19:02:55 +08:00
|
|
|
static int nvme_open(struct gendisk *disk, blk_mode_t mode)
|
2021-04-07 20:36:47 +08:00
|
|
|
{
|
2023-06-08 19:02:36 +08:00
|
|
|
return nvme_ns_open(disk->private_data);
|
2021-04-07 20:36:47 +08:00
|
|
|
}
|
|
|
|
|
2023-06-08 19:02:37 +08:00
|
|
|
static void nvme_release(struct gendisk *disk)
|
2021-04-07 20:36:47 +08:00
|
|
|
{
|
|
|
|
nvme_ns_release(disk->private_data);
|
|
|
|
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
|
|
|
/* some standard values */
|
|
|
|
geo->heads = 1 << 6;
|
|
|
|
geo->sectors = 1 << 5;
|
|
|
|
geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
2022-03-04 04:13:12 +08:00
|
|
|
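/*
 * Register a blk-integrity profile that matches the namespace's protection
 * information type and guard tag format (16-bit CRC or 64-bit CRC64).
 */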
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
2020-05-19 22:05:52 +08:00
|
|
|
u32 max_integrity_segments)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2021-06-17 06:15:52 +08:00
|
|
|
struct blk_integrity integrity = { };
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
switch (ns->pi_type) {
|
2015-11-26 17:54:19 +08:00
|
|
|
case NVME_NS_DPS_PI_TYPE3:
|
2022-03-04 04:13:12 +08:00
|
|
|
switch (ns->guard_type) {
|
|
|
|
case NVME_NVM_NS_16B_GUARD:
|
|
|
|
integrity.profile = &t10_pi_type3_crc;
|
|
|
|
integrity.tag_size = sizeof(u16) + sizeof(u32);
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
|
|
|
break;
|
|
|
|
case NVME_NVM_NS_64B_GUARD:
|
|
|
|
integrity.profile = &ext_pi_type3_crc64;
|
|
|
|
integrity.tag_size = sizeof(u16) + 6;
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
integrity.profile = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2015-11-26 17:54:19 +08:00
|
|
|
break;
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
2022-03-04 04:13:12 +08:00
|
|
|
switch (ns->guard_type) {
|
|
|
|
case NVME_NVM_NS_16B_GUARD:
|
|
|
|
integrity.profile = &t10_pi_type1_crc;
|
|
|
|
integrity.tag_size = sizeof(u16);
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
|
|
|
break;
|
|
|
|
case NVME_NVM_NS_64B_GUARD:
|
|
|
|
integrity.profile = &ext_pi_type1_crc64;
|
|
|
|
integrity.tag_size = sizeof(u16);
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
integrity.profile = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2015-11-26 17:54:19 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
integrity.profile = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2022-03-04 04:13:12 +08:00
|
|
|
|
|
|
|
integrity.tuple_size = ns->ms;
|
2017-11-03 02:28:53 +08:00
|
|
|
blk_integrity_register(disk, &integrity);
|
2020-05-19 22:05:52 +08:00
|
|
|
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
#else
|
2022-03-04 04:13:12 +08:00
|
|
|
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
2020-05-19 22:05:52 +08:00
|
|
|
u32 max_integrity_segments)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
|
|
|
|
2019-03-14 01:55:07 +08:00
|
|
|
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2018-05-03 01:06:54 +08:00
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
2019-03-14 01:55:07 +08:00
|
|
|
struct request_queue *queue = disk->queue;
|
2017-11-03 02:28:54 +08:00
|
|
|
u32 size = queue_logical_block_size(queue);
|
|
|
|
|
2023-04-04 04:09:25 +08:00
|
|
|
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
|
|
|
|
ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
if (ctrl->max_discard_sectors == 0) {
|
2022-04-15 12:52:55 +08:00
|
|
|
blk_queue_max_discard_sectors(queue, 0);
|
2018-05-03 01:06:54 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-02-08 21:46:50 +08:00
|
|
|
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
|
|
|
|
NVME_DSM_MAX_RANGES);
|
|
|
|
|
2017-11-03 02:28:54 +08:00
|
|
|
queue->limits.discard_granularity = size;
|
2017-06-28 02:03:06 +08:00
|
|
|
|
2018-05-03 01:06:54 +08:00
|
|
|
/* If discard is already enabled, don't reset queue limits */
|
2022-04-15 12:52:55 +08:00
|
|
|
if (queue->limits.max_discard_sectors)
|
2018-05-03 01:06:54 +08:00
|
|
|
return;
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
|
|
|
|
blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
|
2017-04-06 01:21:13 +08:00
|
|
|
|
|
|
|
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
|
2017-11-03 02:28:54 +08:00
|
|
|
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
|
|
|
|
{
|
|
|
|
return uuid_equal(&a->uuid, &b->uuid) &&
|
|
|
|
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
|
2020-06-30 03:06:39 +08:00
|
|
|
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
|
|
|
|
a->csi == b->csi;
|
2017-11-09 20:50:16 +08:00
|
|
|
}
|
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
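/*
 * Work out the metadata size and protection information layout for the
 * current LBA format.  Controllers with extended LBA formats (ELBAS) need an
 * extra CSI specific Identify Namespace call to discover the guard tag type.
 */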
static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
|
2020-09-25 13:19:13 +08:00
|
|
|
{
|
2022-03-04 04:13:12 +08:00
|
|
|
bool first = id->dps & NVME_NS_DPS_PI_FIRST;
|
|
|
|
unsigned lbaf = nvme_lbaf_index(id->flbas);
|
2020-09-25 13:19:13 +08:00
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
2022-03-04 04:13:12 +08:00
|
|
|
struct nvme_command c = { };
|
|
|
|
struct nvme_id_ns_nvm *nvm;
|
|
|
|
int ret = 0;
|
|
|
|
u32 elbaf;
|
|
|
|
|
|
|
|
ns->pi_size = 0;
|
|
|
|
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
|
|
|
|
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
|
|
|
|
ns->pi_size = sizeof(struct t10_pi_tuple);
|
|
|
|
ns->guard_type = NVME_NVM_NS_16B_GUARD;
|
|
|
|
goto set_pi;
|
|
|
|
}
|
2020-09-25 13:19:13 +08:00
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
|
|
|
|
if (!nvm)
|
|
|
|
return -ENOMEM;
|
2020-09-25 13:19:13 +08:00
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.nsid = cpu_to_le32(ns->head->ns_id);
|
|
|
|
c.identify.cns = NVME_ID_CNS_CS_NS;
|
|
|
|
c.identify.csi = NVME_CSI_NVM;
|
|
|
|
|
|
|
|
ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
|
|
|
|
if (ret)
|
|
|
|
goto free_data;
|
|
|
|
|
|
|
|
elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
|
|
|
|
|
|
|
|
/* no support for storage tag formats right now */
|
|
|
|
if (nvme_elbaf_sts(elbaf))
|
|
|
|
goto free_data;
|
|
|
|
|
|
|
|
ns->guard_type = nvme_elbaf_guard_type(elbaf);
|
|
|
|
switch (ns->guard_type) {
|
|
|
|
case NVME_NVM_NS_64B_GUARD:
|
|
|
|
ns->pi_size = sizeof(struct crc64_pi_tuple);
|
|
|
|
break;
|
|
|
|
case NVME_NVM_NS_16B_GUARD:
|
|
|
|
ns->pi_size = sizeof(struct t10_pi_tuple);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_data:
|
|
|
|
kfree(nvm);
|
|
|
|
set_pi:
|
|
|
|
if (ns->pi_size && (first || ns->ms == ns->pi_size))
|
2020-09-25 13:19:13 +08:00
|
|
|
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
|
|
|
|
else
|
|
|
|
ns->pi_type = 0;
|
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
|
|
|
|
|
|
|
if (nvme_init_ms(ns, id))
|
|
|
|
return;
|
|
|
|
|
2020-09-25 13:19:13 +08:00
|
|
|
ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
|
|
|
|
if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
|
2022-02-16 22:07:15 +08:00
|
|
|
return;
|
|
|
|
|
2020-09-25 13:19:13 +08:00
|
|
|
if (ctrl->ops->flags & NVME_F_FABRICS) {
|
|
|
|
/*
|
|
|
|
* The NVMe over Fabrics specification only supports metadata as
|
|
|
|
* part of the extended data LBA. We rely on HCA/HBA support to
|
|
|
|
* remap the separate metadata buffer from the block layer.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
|
2022-02-16 22:07:15 +08:00
|
|
|
return;
|
2021-12-01 00:14:54 +08:00
|
|
|
|
|
|
|
ns->features |= NVME_NS_EXT_LBAS;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The current fabrics transport drivers support namespace
|
|
|
|
* metadata formats only if nvme_ns_has_pi() returns true.
|
|
|
|
* Suppress support for all other formats so the namespace will
|
|
|
|
* have a 0 capacity and not be usable through the block stack.
|
|
|
|
*
|
|
|
|
* Note, this check will need to be modified if any drivers
|
|
|
|
* gain the ability to use other metadata formats.
|
|
|
|
*/
|
|
|
|
if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
|
|
|
|
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
2020-09-25 13:19:13 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For PCIe controllers, we can't easily remap the separate
|
|
|
|
* metadata buffer from the block layer and thus require a
|
|
|
|
* separate metadata buffer for block layer metadata/PI support.
|
|
|
|
* We allow extended LBAs for the passthrough interface, though.
|
|
|
|
*/
|
|
|
|
if (id->flbas & NVME_NS_FLBAS_META_EXT)
|
|
|
|
ns->features |= NVME_NS_EXT_LBAS;
|
|
|
|
else
|
|
|
|
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-28 18:05:28 +08:00
|
|
|
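/*
 * Apply the controller wide transfer limits (max sectors and segments,
 * virt boundary, DMA alignment, volatile write cache) to a request queue.
 */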
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
|
|
|
|
struct request_queue *q)
|
|
|
|
{
|
2020-10-02 02:54:32 +08:00
|
|
|
bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
|
2020-09-28 18:05:28 +08:00
|
|
|
|
|
|
|
if (ctrl->max_hw_sectors) {
|
|
|
|
u32 max_segments =
|
|
|
|
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
|
|
|
|
|
|
|
|
max_segments = min_not_zero(max_segments, ctrl->max_segments);
|
|
|
|
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
|
|
|
|
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
|
|
|
|
}
|
|
|
|
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
|
2022-05-05 02:43:25 +08:00
|
|
|
blk_queue_dma_alignment(q, 3);
|
2020-09-28 18:05:28 +08:00
|
|
|
blk_queue_write_cache(q, vwc, vwc);
|
|
|
|
}
|
|
|
|
|
2017-11-03 02:28:56 +08:00
|
|
|
static void nvme_update_disk_info(struct gendisk *disk,
|
|
|
|
struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
2019-10-21 11:40:04 +08:00
|
|
|
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
|
2017-12-20 03:24:15 +08:00
|
|
|
unsigned short bs = 1 << ns->lba_shift;
|
2020-05-14 13:56:26 +08:00
|
|
|
u32 atomic_bs, phys_bs, io_opt = 0;
|
2017-11-03 02:28:56 +08:00
|
|
|
|
2020-09-28 18:03:13 +08:00
|
|
|
/*
|
|
|
|
* The block layer can't support LBA sizes larger than the page size
|
|
|
|
* yet, so catch this early and don't allow block I/O.
|
|
|
|
*/
|
2019-03-12 06:02:25 +08:00
|
|
|
if (ns->lba_shift > PAGE_SHIFT) {
|
2020-09-28 18:03:13 +08:00
|
|
|
capacity = 0;
|
2019-03-12 06:02:25 +08:00
|
|
|
bs = (1 << 9);
|
|
|
|
}
|
2020-09-28 18:11:42 +08:00
|
|
|
|
2017-11-03 02:28:56 +08:00
|
|
|
blk_integrity_unregister(disk);
|
|
|
|
|
2020-05-14 13:56:26 +08:00
|
|
|
atomic_bs = phys_bs = bs;
|
2019-06-29 00:53:31 +08:00
|
|
|
if (id->nabo == 0) {
|
|
|
|
/*
|
|
|
|
* Bit 1 indicates whether NAWUPF is defined for this namespace
|
|
|
|
* and whether it should be used instead of AWUPF. If NAWUPF ==
|
|
|
|
* 0 then AWUPF must be used instead.
|
|
|
|
*/
|
2020-04-04 01:53:46 +08:00
|
|
|
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
|
2019-06-29 00:53:31 +08:00
|
|
|
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
|
|
|
|
else
|
|
|
|
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
|
|
|
|
}
|
2020-04-10 00:09:08 +08:00
|
|
|
|
2020-04-04 01:53:46 +08:00
|
|
|
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
|
2019-06-29 00:53:31 +08:00
|
|
|
/* NPWG = Namespace Preferred Write Granularity */
|
2020-04-10 00:09:08 +08:00
|
|
|
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
|
2019-06-29 00:53:31 +08:00
|
|
|
/* NOWS = Namespace Optimal Write Size */
|
2020-04-10 00:09:08 +08:00
|
|
|
io_opt = bs * (1 + le16_to_cpu(id->nows));
|
2019-06-29 00:53:31 +08:00
|
|
|
}
|
|
|
|
|
2017-12-20 03:24:15 +08:00
|
|
|
blk_queue_logical_block_size(disk->queue, bs);
|
2019-06-29 00:53:31 +08:00
|
|
|
/*
|
|
|
|
* Linux filesystems assume writing a single physical block is
|
|
|
|
* an atomic operation. Hence limit the physical block size to the
|
|
|
|
* value of the Atomic Write Unit Power Fail parameter.
|
|
|
|
*/
|
|
|
|
blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
|
|
|
|
blk_queue_io_min(disk->queue, phys_bs);
|
|
|
|
blk_queue_io_opt(disk->queue, io_opt);
|
2017-12-20 03:24:15 +08:00
|
|
|
|
2020-05-19 22:05:50 +08:00
|
|
|
/*
|
|
|
|
* Register a metadata profile for PI, or the plain non-integrity NVMe
|
|
|
|
* metadata masquerading as Type 0 if supported, otherwise reject block
|
|
|
|
* I/O to namespaces with metadata except when the namespace supports
|
|
|
|
* PI, as it can strip/insert in that case.
|
|
|
|
*/
|
|
|
|
if (ns->ms) {
|
|
|
|
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
|
|
|
|
(ns->features & NVME_NS_METADATA_SUPPORTED))
|
2022-03-04 04:13:12 +08:00
|
|
|
nvme_init_integrity(disk, ns,
|
2020-05-19 22:05:52 +08:00
|
|
|
ns->ctrl->max_integrity_segments);
|
2020-05-19 22:05:50 +08:00
|
|
|
else if (!nvme_ns_has_pi(ns))
|
|
|
|
capacity = 0;
|
|
|
|
}
|
|
|
|
|
2020-11-16 22:56:56 +08:00
|
|
|
set_capacity_and_notify(disk, capacity);
|
2019-03-14 01:55:06 +08:00
|
|
|
|
2019-03-14 01:55:07 +08:00
|
|
|
nvme_config_discard(disk, ns);
|
2021-03-25 07:18:05 +08:00
|
|
|
blk_queue_max_write_zeroes_sectors(disk->queue,
|
|
|
|
ns->ctrl->max_zeroes_sectors);
|
2017-11-03 02:28:56 +08:00
|
|
|
}
|
|
|
|
|
2022-07-13 13:40:25 +08:00
|
|
|
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
|
|
|
|
{
|
|
|
|
return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
|
|
|
|
}
|
|
|
|
|
2020-08-28 01:38:57 +08:00
|
|
|
static inline bool nvme_first_scan(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
/* nvme_alloc_ns() scans the disk prior to adding it */
|
2021-08-09 14:40:28 +08:00
|
|
|
return !disk_live(disk);
|
2020-08-28 01:38:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
|
|
|
u32 iob;
|
|
|
|
|
|
|
|
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
|
|
|
|
is_power_of_2(ctrl->max_hw_sectors))
|
|
|
|
iob = ctrl->max_hw_sectors;
|
|
|
|
else
|
|
|
|
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
|
|
|
|
|
|
|
|
if (!iob)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!is_power_of_2(iob)) {
|
|
|
|
if (nvme_first_scan(ns->disk))
|
|
|
|
pr_warn("%s: ignoring unaligned IO boundary:%u\n",
|
|
|
|
ns->disk->disk_name, iob);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (blk_queue_is_zoned(ns->disk->queue)) {
|
|
|
|
if (nvme_first_scan(ns->disk))
|
|
|
|
pr_warn("%s: ignoring zoned namespace IO boundary\n",
|
|
|
|
ns->disk->disk_name);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
blk_queue_chunk_sectors(ns->queue, iob);
|
|
|
|
}
|
|
|
|
|
2022-07-13 02:33:04 +08:00
|
|
|
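/*
 * Update path for namespaces whose command set cannot be handled as a normal
 * block device: keep the queue limits current but hide the gendisk.
 */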
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
|
|
|
|
struct nvme_ns_info *info)
|
|
|
|
{
|
|
|
|
blk_mq_freeze_queue(ns->disk->queue);
|
|
|
|
nvme_set_queue_limits(ns->ctrl, ns->queue);
|
|
|
|
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
|
|
|
|
blk_mq_unfreeze_queue(ns->disk->queue);
|
|
|
|
|
|
|
|
if (nvme_ns_head_multipath(ns->head)) {
|
|
|
|
blk_mq_freeze_queue(ns->head->disk->queue);
|
|
|
|
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
|
|
|
|
nvme_mpath_revalidate_paths(ns);
|
|
|
|
blk_stack_limits(&ns->head->disk->queue->limits,
|
|
|
|
&ns->queue->limits, 0);
|
|
|
|
ns->head->disk->flags |= GENHD_FL_HIDDEN;
|
|
|
|
blk_mq_unfreeze_queue(ns->head->disk->queue);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Hide the block-interface for these devices */
|
|
|
|
ns->disk->flags |= GENHD_FL_HIDDEN;
|
|
|
|
set_bit(NVME_NS_READY, &ns->flags);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
static int nvme_update_ns_info_block(struct nvme_ns *ns,
|
|
|
|
struct nvme_ns_info *info)
|
2016-09-16 20:25:04 +08:00
|
|
|
{
|
2022-07-23 00:24:18 +08:00
|
|
|
struct nvme_id_ns *id;
|
|
|
|
unsigned lbaf;
|
2020-06-30 03:06:41 +08:00
|
|
|
int ret;
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_freeze_queue(ns->disk->queue);
|
2022-07-23 00:24:18 +08:00
|
|
|
lbaf = nvme_lbaf_index(id->flbas);
|
2020-06-30 03:06:41 +08:00
|
|
|
ns->lba_shift = id->lbaf[lbaf].ds;
|
2020-09-28 20:07:56 +08:00
|
|
|
nvme_set_queue_limits(ns->ctrl, ns->queue);
|
2020-04-10 00:09:06 +08:00
|
|
|
|
2022-02-16 22:07:15 +08:00
|
|
|
nvme_configure_metadata(ns, id);
|
2021-01-28 12:47:27 +08:00
|
|
|
nvme_set_chunk_sectors(ns, id);
|
|
|
|
nvme_update_disk_info(ns->disk, ns, id);
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ns->head->ids.csi == NVME_CSI_ZNS) {
|
2020-08-20 20:02:18 +08:00
|
|
|
ret = nvme_update_zone_info(ns, lbaf);
|
2022-07-21 13:56:35 +08:00
|
|
|
if (ret) {
|
|
|
|
blk_mq_unfreeze_queue(ns->disk->queue);
|
|
|
|
goto out;
|
|
|
|
}
|
2020-06-30 03:06:39 +08:00
|
|
|
}
|
|
|
|
|
2022-10-30 23:50:15 +08:00
|
|
|
/*
|
|
|
|
* Only set the DEAC bit if the device guarantees that reads from
|
|
|
|
* deallocated data return zeroes. While the DEAC bit does not
|
|
|
|
* require that, it must be a no-op if reads from deallocated data
|
|
|
|
* do not return zeroes.
|
|
|
|
*/
|
|
|
|
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
|
|
|
|
ns->features |= NVME_NS_DEAC;
|
2022-07-13 13:40:25 +08:00
|
|
|
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
|
2021-08-24 22:57:42 +08:00
|
|
|
set_bit(NVME_NS_READY, &ns->flags);
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_unfreeze_queue(ns->disk->queue);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2020-09-28 18:30:16 +08:00
|
|
|
if (blk_queue_is_zoned(ns->queue)) {
|
|
|
|
ret = nvme_revalidate_zones(ns);
|
2020-10-24 03:16:28 +08:00
|
|
|
if (ret && !nvme_first_scan(ns->disk))
|
2022-07-21 13:56:35 +08:00
|
|
|
goto out;
|
2020-05-19 22:05:50 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 23:49:29 +08:00
|
|
|
if (nvme_ns_head_multipath(ns->head)) {
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_freeze_queue(ns->head->disk->queue);
|
2017-11-02 19:59:30 +08:00
|
|
|
nvme_update_disk_info(ns->head->disk, ns, id);
|
2022-07-13 13:40:25 +08:00
|
|
|
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
|
2021-08-24 22:57:42 +08:00
|
|
|
nvme_mpath_revalidate_paths(ns);
|
2020-07-20 14:12:51 +08:00
|
|
|
blk_stack_limits(&ns->head->disk->queue->limits,
|
|
|
|
&ns->queue->limits, 0);
|
2021-08-09 22:17:41 +08:00
|
|
|
disk_update_readahead(ns->head->disk);
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_unfreeze_queue(ns->head->disk->queue);
|
2018-11-03 02:22:13 +08:00
|
|
|
}
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2022-07-21 13:56:35 +08:00
|
|
|
ret = 0;
|
|
|
|
out:
|
2021-04-07 21:03:16 +08:00
|
|
|
/*
|
|
|
|
* If probing fails due to an unsupported feature, hide the block device,
|
|
|
|
* but still allow other access.
|
|
|
|
*/
|
|
|
|
if (ret == -ENODEV) {
|
|
|
|
ns->disk->flags |= GENHD_FL_HIDDEN;
|
2022-02-16 21:14:58 +08:00
|
|
|
set_bit(NVME_NS_READY, &ns->flags);
|
2021-04-07 21:03:16 +08:00
|
|
|
ret = 0;
|
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
kfree(id);
|
2020-06-30 03:06:41 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
|
|
|
|
{
|
|
|
|
switch (info->ids.csi) {
|
|
|
|
case NVME_CSI_ZNS:
|
|
|
|
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
|
2022-07-13 02:33:04 +08:00
|
|
|
dev_info(ns->ctrl->device,
|
|
|
|
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid);
|
2022-07-13 02:33:04 +08:00
|
|
|
return nvme_update_ns_info_generic(ns, info);
|
2022-07-23 00:24:18 +08:00
|
|
|
}
|
|
|
|
return nvme_update_ns_info_block(ns, info);
|
|
|
|
case NVME_CSI_NVM:
|
|
|
|
return nvme_update_ns_info_block(ns, info);
|
|
|
|
default:
|
2022-07-13 02:33:04 +08:00
|
|
|
dev_info(ns->ctrl->device,
|
|
|
|
"block device for nsid %u not supported (csi %u)\n",
|
|
|
|
info->nsid, info->ids.csi);
|
|
|
|
return nvme_update_ns_info_generic(ns, info);
|
2022-07-23 00:24:18 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
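/* Map a Linux persistent reservation type to the NVMe reservation type value. */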
static char nvme_pr_type(enum pr_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case PR_WRITE_EXCLUSIVE:
|
|
|
|
return 1;
|
|
|
|
case PR_EXCLUSIVE_ACCESS:
|
|
|
|
return 2;
|
|
|
|
case PR_WRITE_EXCLUSIVE_REG_ONLY:
|
|
|
|
return 3;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_REG_ONLY:
|
|
|
|
return 4;
|
|
|
|
case PR_WRITE_EXCLUSIVE_ALL_REGS:
|
|
|
|
return 5;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_ALL_REGS:
|
|
|
|
return 6;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
2022-01-19 15:49:54 +08:00
|
|
|
}
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2021-05-19 15:22:35 +08:00
|
|
|
static int nvme_send_ns_head_pr_command(struct block_device *bdev,
|
|
|
|
struct nvme_command *c, u8 data[16])
|
|
|
|
{
|
|
|
|
struct nvme_ns_head *head = bdev->bd_disk->private_data;
|
|
|
|
int srcu_idx = srcu_read_lock(&head->srcu);
|
|
|
|
struct nvme_ns *ns = nvme_find_path(head);
|
|
|
|
int ret = -EWOULDBLOCK;
|
|
|
|
|
|
|
|
if (ns) {
|
|
|
|
c->common.nsid = cpu_to_le32(ns->head->ns_id);
|
|
|
|
ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
|
|
|
|
}
|
|
|
|
srcu_read_unlock(&head->srcu, srcu_idx);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
|
|
|
|
u8 data[16])
|
|
|
|
{
|
|
|
|
c->common.nsid = cpu_to_le32(ns->head->ns_id);
|
|
|
|
return nvme_submit_sync_cmd(ns->queue, c, data, 16);
|
|
|
|
}
|
|
|
|
|
2022-11-22 11:26:03 +08:00
|
|
|
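/* Translate an NVMe status code into a PR_STS_* value or negative errno. */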
static int nvme_sc_to_pr_err(int nvme_sc)
|
|
|
|
{
|
|
|
|
if (nvme_is_path_error(nvme_sc))
|
|
|
|
return PR_STS_PATH_FAILED;
|
|
|
|
|
|
|
|
switch (nvme_sc) {
|
|
|
|
case NVME_SC_SUCCESS:
|
|
|
|
return PR_STS_SUCCESS;
|
|
|
|
case NVME_SC_RESERVATION_CONFLICT:
|
|
|
|
return PR_STS_RESERVATION_CONFLICT;
|
|
|
|
case NVME_SC_ONCS_NOT_SUPPORTED:
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
case NVME_SC_BAD_ATTRIBUTES:
|
|
|
|
case NVME_SC_INVALID_OPCODE:
|
|
|
|
case NVME_SC_INVALID_FIELD:
|
|
|
|
case NVME_SC_INVALID_NS:
|
|
|
|
return -EINVAL;
|
|
|
|
default:
|
|
|
|
return PR_STS_IOERR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
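/*
 * Build the 16-byte reservation payload (current and new key) and issue the
 * reservation command, going through a live path when called on a multipath
 * head device.
 */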
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
|
|
|
|
u64 key, u64 sa_key, u8 op)
|
|
|
|
{
|
2021-06-17 06:15:52 +08:00
|
|
|
struct nvme_command c = { };
|
2015-11-26 17:54:19 +08:00
|
|
|
u8 data[16] = { 0, };
|
2022-11-22 11:26:03 +08:00
|
|
|
int ret;
|
2015-11-26 17:54:19 +08:00
|
|
|
|
|
|
|
put_unaligned_le64(key, &data[0]);
|
|
|
|
put_unaligned_le64(sa_key, &data[8]);
|
|
|
|
|
|
|
|
c.common.opcode = op;
|
2018-12-13 07:11:37 +08:00
|
|
|
c.common.cdw10 = cpu_to_le32(cdw10);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2021-05-19 15:22:35 +08:00
|
|
|
if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
|
|
|
|
bdev->bd_disk->fops == &nvme_ns_head_ops)
|
2022-11-22 11:26:03 +08:00
|
|
|
ret = nvme_send_ns_head_pr_command(bdev, &c, data);
|
|
|
|
else
|
|
|
|
ret = nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c,
|
|
|
|
data);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return nvme_sc_to_pr_err(ret);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_pr_register(struct block_device *bdev, u64 old,
|
|
|
|
u64 new, unsigned flags)
|
|
|
|
{
|
|
|
|
u32 cdw10;
|
|
|
|
|
|
|
|
if (flags & ~PR_FL_IGNORE_KEY)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
cdw10 = old ? 2 : 0;
|
|
|
|
cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
|
|
|
|
cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
|
|
|
|
return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_pr_reserve(struct block_device *bdev, u64 key,
|
|
|
|
enum pr_type type, unsigned flags)
|
|
|
|
{
|
|
|
|
u32 cdw10;
|
|
|
|
|
|
|
|
if (flags & ~PR_FL_IGNORE_KEY)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
cdw10 = nvme_pr_type(type) << 8;
|
|
|
|
cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
|
|
|
|
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
|
|
|
|
enum pr_type type, bool abort)
|
|
|
|
{
|
2018-05-23 22:56:11 +08:00
|
|
|
u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
|
2021-03-01 10:06:11 +08:00
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_pr_clear(struct block_device *bdev, u64 key)
|
|
|
|
{
|
2022-09-23 12:49:09 +08:00
|
|
|
u32 cdw10 = 1 | (key ? 0 : 1 << 3);
|
2021-03-01 10:06:11 +08:00
|
|
|
|
2022-09-23 12:49:09 +08:00
|
|
|
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
|
|
|
|
{
|
2022-09-23 12:49:09 +08:00
|
|
|
u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
|
2021-03-01 10:06:11 +08:00
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
|
|
|
|
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
const struct pr_ops nvme_pr_ops = {
|
2015-11-26 17:54:19 +08:00
|
|
|
.pr_register = nvme_pr_register,
|
|
|
|
.pr_reserve = nvme_pr_reserve,
|
|
|
|
.pr_release = nvme_pr_release,
|
|
|
|
.pr_preempt = nvme_pr_preempt,
|
|
|
|
.pr_clear = nvme_pr_clear,
|
|
|
|
};
|
|
|
|
|
2017-02-04 03:50:32 +08:00
|
|
|
#ifdef CONFIG_BLK_SED_OPAL
|
2022-11-08 22:48:27 +08:00
|
|
|
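/* Security Send/Receive callback used by the SED Opal core. */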
static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
|
2017-02-17 20:59:39 +08:00
|
|
|
bool send)
|
2017-02-04 03:50:32 +08:00
|
|
|
{
|
2017-02-17 20:59:39 +08:00
|
|
|
struct nvme_ctrl *ctrl = data;
|
2021-06-17 06:15:52 +08:00
|
|
|
struct nvme_command cmd = { };
|
2017-02-04 03:50:32 +08:00
|
|
|
|
|
|
|
if (send)
|
|
|
|
cmd.common.opcode = nvme_admin_security_send;
|
|
|
|
else
|
|
|
|
cmd.common.opcode = nvme_admin_security_recv;
|
|
|
|
cmd.common.nsid = 0;
|
2018-12-13 07:11:37 +08:00
|
|
|
cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
|
|
|
|
cmd.common.cdw11 = cpu_to_le32(len);
|
2017-02-04 03:50:32 +08:00
|
|
|
|
2022-06-07 09:16:42 +08:00
|
|
|
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
|
2021-06-11 05:44:35 +08:00
|
|
|
NVME_QID_ANY, 1, 0);
|
2017-02-04 03:50:32 +08:00
|
|
|
}
|
2022-11-08 22:48:27 +08:00
|
|
|
|
|
|
|
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
|
|
|
|
{
|
|
|
|
if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
|
|
|
|
if (!ctrl->opal_dev)
|
|
|
|
ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
|
|
|
|
else if (was_suspended)
|
|
|
|
opal_unlock_from_suspend(ctrl->opal_dev);
|
|
|
|
} else {
|
|
|
|
free_opal_dev(ctrl->opal_dev);
|
|
|
|
ctrl->opal_dev = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
|
|
|
|
{
|
|
|
|
}
|
2017-02-04 03:50:32 +08:00
|
|
|
#endif /* CONFIG_BLK_SED_OPAL */
|
|
|
|
|
2021-05-19 15:17:06 +08:00
|
|
|
#ifdef CONFIG_BLK_DEV_ZONED
|
|
|
|
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
|
|
|
|
unsigned int nr_zones, report_zones_cb cb, void *data)
|
|
|
|
{
|
|
|
|
return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
|
|
|
|
data);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#define nvme_report_zones NULL
|
|
|
|
#endif /* CONFIG_BLK_DEV_ZONED */
|
|
|
|
|
2020-12-01 20:56:09 +08:00
|
|
|
static const struct block_device_operations nvme_bdev_ops = {
|
2015-11-26 17:54:19 +08:00
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.ioctl = nvme_ioctl,
|
2022-07-21 11:57:35 +08:00
|
|
|
.compat_ioctl = blkdev_compat_ptr_ioctl,
|
2015-11-26 17:54:19 +08:00
|
|
|
.open = nvme_open,
|
|
|
|
.release = nvme_release,
|
|
|
|
.getgeo = nvme_getgeo,
|
2020-06-30 03:06:41 +08:00
|
|
|
.report_zones = nvme_report_zones,
|
2015-11-26 17:54:19 +08:00
|
|
|
.pr_ops = &nvme_pr_ops,
|
|
|
|
};
|
|
|
|
|
2022-11-16 15:54:26 +08:00
|
|
|
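/*
 * Poll CSTS until (csts & mask) == val or the timeout (in seconds) expires.
 * An all ones read is treated as a dead or unreachable controller.
 */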
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
|
|
|
|
u32 timeout, const char *op)
|
2015-11-28 22:03:49 +08:00
|
|
|
{
|
2022-11-16 15:54:26 +08:00
|
|
|
unsigned long timeout_jiffies = jiffies + timeout * HZ;
|
|
|
|
u32 csts;
|
2015-11-28 22:03:49 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
|
2016-10-12 01:31:58 +08:00
|
|
|
if (csts == ~0)
|
|
|
|
return -ENODEV;
|
2022-11-16 15:54:26 +08:00
|
|
|
if ((csts & mask) == val)
|
2015-11-28 22:03:49 +08:00
|
|
|
break;
|
|
|
|
|
2020-02-29 10:52:28 +08:00
|
|
|
usleep_range(1000, 2000);
|
2015-11-28 22:03:49 +08:00
|
|
|
if (fatal_signal_pending(current))
|
|
|
|
return -EINTR;
|
2022-05-16 21:09:21 +08:00
|
|
|
if (time_after(jiffies, timeout_jiffies)) {
|
2016-02-10 23:51:15 +08:00
|
|
|
dev_err(ctrl->device,
|
2020-02-28 00:45:26 +08:00
|
|
|
"Device not ready; aborting %s, CSTS=0x%x\n",
|
2022-11-16 15:54:26 +08:00
|
|
|
op, csts);
|
2015-11-28 22:03:49 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-11-08 18:20:12 +08:00
|
|
|
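/*
 * Clear CC.EN, or request a normal shutdown when @shutdown is set, and wait
 * for the controller to reach the corresponding reset or shutdown-complete
 * state.
 */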
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
|
2015-11-28 22:03:49 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
|
2022-11-08 18:20:12 +08:00
|
|
|
if (shutdown)
|
|
|
|
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
|
|
|
|
else
|
|
|
|
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
|
2015-11-28 22:03:49 +08:00
|
|
|
|
|
|
|
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2016-06-15 05:22:41 +08:00
|
|
|
|
2022-11-08 18:20:12 +08:00
|
|
|
if (shutdown) {
|
|
|
|
return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
|
|
|
|
NVME_CSTS_SHST_CMPLT,
|
|
|
|
ctrl->shutdown_timeout, "shutdown");
|
|
|
|
}
|
nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time too
Commit 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter
readiness") introduced a quirk to adapters that cannot read the bit
NVME_CSTS_RDY right after register NVME_REG_CC is set; these adapters
need a delay or else the action of reading the bit NVME_CSTS_RDY could
somehow corrupt adapter's registers state and it never recovers.
When this quirk was added, we checked ctrl->tagset in order to avoid
quirking in probe time, supposing we would never require such delay
during probe. Well, it was too optimistic; we in fact need this quirk
at probe time in some cases, like after a kexec.
In some experiments, after abnormal shutdown of machine (aka power cord
unplug), we booted into our bootloader in Power, which is a Linux kernel,
and kexec'ed into another distro. If this kexec is too quick, we end up
reaching the probe of NVMe adapter in that distro when adapter is in
bad state (not fully initialized on our bootloader). What happens next
is that nvme_wait_ready() is unable to complete, except if the quirk is
enabled.
So, this patch removes the original ctrl->tagset verification in order
to enable the quirk even on probe time.
Fixes: 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness")
Reported-by: Andrew Byrne <byrneadw@ie.ibm.com>
Reported-by: Jaime A. H. Gomez <jahgomez@mx1.ibm.com>
Reported-by: Zachary D. Myers <zdmyers@us.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Acked-by: Jeffrey Lien <Jeff.Lien@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2016-12-29 08:13:15 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
|
2016-06-15 05:22:41 +08:00
|
|
|
msleep(NVME_QUIRK_DELAY_AMOUNT);
|
2022-11-16 15:54:26 +08:00
|
|
|
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
|
|
|
|
(NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
|
2015-11-28 22:03:49 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
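
/*
 * Hedged note (editor's addition, describing the code above): a shutdown
 * disable requests a normal shutdown via CC.SHN and waits for CSTS.SHST to
 * report completion within ctrl->shutdown_timeout, while a plain disable
 * clears CC.EN and waits for CSTS.RDY to fall back to 0.
 */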

int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned dev_page_min;
	u32 timeout;
	int ret;

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (ret) {
		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;

	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
		dev_err(ctrl->device,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
		return -ENODEV;
	}

	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
		ctrl->ctrl_config = NVME_CC_CSS_CSI;
	else
		ctrl->ctrl_config = NVME_CC_CSS_NVM;

	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
		u32 crto;

		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
		if (ret) {
			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
				ret);
			return ret;
		}

		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
			ctrl->ctrl_config |= NVME_CC_CRIME;
			timeout = NVME_CRTO_CRIMT(crto);
		} else {
			timeout = NVME_CRTO_CRWMT(crto);
		}
	} else {
		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
	}

	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	/* Flush write to device (required if transport is PCI) */
	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
	if (ret)
		return ret;

	ctrl->ctrl_config |= NVME_CC_ENABLE;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
			       (timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
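
/*
 * Worked example (editor's addition, values assumed): with
 * NVME_CTRL_PAGE_SHIFT = 12 and a device reporting CAP.MPSMIN = 0, the
 * minimum device page size is 4 KiB and CC.MPS is programmed to 0; the
 * NVME_CC_IOSQES / NVME_CC_IOCQES constants select 2^6 = 64-byte SQ entries
 * and 2^4 = 16-byte CQ entries before CC.EN is finally set.
 */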

static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
	__le64 ts;
	int ret;

	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
		return 0;

	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
			NULL);
	if (ret)
		dev_warn_once(ctrl->device,
			"could not set timestamp (%d)\n", ret);
	return ret;
}
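
/*
 * Hedged note (editor's addition): the Timestamp feature payload is defined
 * as milliseconds since the Unix epoch, which matches the
 * ktime_to_ms(ktime_get_real()) value written above.
 */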

static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
{
	struct nvme_feat_host_behavior *host;
	u8 acre = 0, lbafee = 0;
	int ret;

	/* Don't bother enabling the feature if retry delay is not reported */
	if (ctrl->crdt[0])
		acre = NVME_ENABLE_ACRE;
	if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
		lbafee = NVME_ENABLE_LBAFEE;

	if (!acre && !lbafee)
		return 0;

	host = kzalloc(sizeof(*host), GFP_KERNEL);
	if (!host)
		return 0;

	host->acre = acre;
	host->lbafee = lbafee;
	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
				host, sizeof(*host), NULL);
	kfree(host);
	return ret;
}
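
/*
 * Hedged note (editor's addition): ACRE enables Advanced Command Retry
 * using the delay times advertised in the controller's CRDT fields, and
 * LBAFEE opts in to the extended LBA formats advertised via CTRATT.ELBAS;
 * both bits live in the Host Behavior Support feature written above.
 */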

/*
 * The function checks whether the given total (exlat + enlat) latency of
 * a power state allows the latter to be used as an APST transition target.
 * It does so by comparing the latency to the primary and secondary latency
 * tolerances defined by module params. If there's a match, the corresponding
 * timeout value is returned and the matching tolerance index (1 or 2) is
 * reported.
 */
static bool nvme_apst_get_transition_time(u64 total_latency,
		u64 *transition_time, unsigned *last_index)
{
	if (total_latency <= apst_primary_latency_tol_us) {
		if (*last_index == 1)
			return false;
		*last_index = 1;
		*transition_time = apst_primary_timeout_ms;
		return true;
	}
	if (apst_secondary_timeout_ms &&
		total_latency <= apst_secondary_latency_tol_us) {
		if (*last_index <= 2)
			return false;
		*last_index = 2;
		*transition_time = apst_secondary_timeout_ms;
		return true;
	}
	return false;
}
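
/*
 * Worked example (editor's addition, using the documented module parameter
 * defaults of 100 ms / 15000 us and 2000 ms / 100000 us): the deepest
 * qualifying non-operational state with total latency <= 100000 us (say
 * 60000 us) becomes the secondary target with a 2000 ms idle timeout, a
 * shallower state with total latency <= 15000 us (say 8000 us) becomes the
 * primary target with a 100 ms timeout, and states above 100000 us are not
 * used as APST targets at all.
 */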

/*
 * APST (Autonomous Power State Transition) lets us program a table of power
 * state transitions that the controller will perform automatically.
 *
 * Depending on module params, one of the two supported techniques will be used:
 *
 * - If the parameters provide explicit timeouts and tolerances, they will be
 *   used to build a table with up to 2 non-operational states to transition to.
 *   The default parameter values were selected based on the values used by
 *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
 *   regeneration of the APST table in the event of switching between external
 *   and battery power, the timeouts and tolerances reflect a compromise
 *   between values used by Microsoft for AC and battery scenarios.
 *
 * - If not, we'll configure the table with a simple heuristic: we are willing
 *   to spend at most 2% of the time transitioning between power states.
 *   Therefore, when running in any given state, we will enter the next
 *   lower-power non-operational state after waiting 50 * (enlat + exlat)
 *   microseconds, as long as that state's exit latency is under the requested
 *   maximum latency.
 *
 * We will not autonomously enter any non-operational state for which the total
 * latency exceeds ps_max_latency_us.
 *
 * Users can set ps_max_latency_us to zero to turn off APST.
 */
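
/*
 * Worked example (editor's addition): under the legacy heuristic, a
 * non-operational state reporting enlat = 5000 us and exlat = 10000 us is
 * entered after 50 * (5000 + 10000) us = 750 ms of idle time, keeping the
 * time spent transitioning at roughly 2%. Writing 0 to the controller's
 * power/pm_qos_latency_tolerance_us sysfs attribute disables APST entirely.
 */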

static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
	struct nvme_feat_auto_pst *table;
	unsigned apste = 0;
	u64 max_lat_us = 0;
	__le64 target = 0;
	int max_ps = -1;
	int state;
	int ret;
	unsigned last_lt_index = UINT_MAX;

	/*
	 * If APST isn't supported or if we haven't been initialized yet,
	 * then don't do anything.
	 */
	if (!ctrl->apsta)
		return 0;

	if (ctrl->npss > 31) {
		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
		return 0;
	}

	table = kzalloc(sizeof(*table), GFP_KERNEL);
	if (!table)
		return 0;

	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
		/* Turn off APST. */
		dev_dbg(ctrl->device, "APST disabled\n");
		goto done;
	}

	/*
	 * Walk through all states from lowest- to highest-power.
	 * According to the spec, lower-numbered states use more power. NPSS,
	 * despite the name, is the index of the lowest-power state, not the
	 * number of states.
	 */
	for (state = (int)ctrl->npss; state >= 0; state--) {
		u64 total_latency_us, exit_latency_us, transition_ms;

		if (target)
			table->entries[state] = target;

		/*
		 * Don't allow transitions to the deepest state if it's quirked
		 * off.
		 */
		if (state == ctrl->npss &&
		    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
			continue;

		/*
		 * Is this state a useful non-operational state for higher-power
		 * states to autonomously transition to?
		 */
		if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
			continue;

		exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
		if (exit_latency_us > ctrl->ps_max_latency_us)
			continue;

		total_latency_us = exit_latency_us +
			le32_to_cpu(ctrl->psd[state].entry_lat);

/*
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows reverting to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS); for others it is significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed a moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2 ms -> ~53 ms on Intel 760P and ~0.6 ms -> ~10 ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
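/*
 * Illustrative sketch only, not the driver's actual helper: one way the two
 * predefined timeouts and latency tolerances described above could map a
 * state's total latency to an idle time prior to transition (ITPT).  The
 * function name and the hard-coded values (100 ms / 2000 ms, 15000 us /
 * 100000 us) are hypothetical stand-ins for the module parameters listed in
 * the commit message; setting the primary pair to 0 would instead fall back
 * to the legacy formula shown further below.
 */
static bool example_apst_pick_timeout(u64 total_latency_us, u64 *itpt_ms)
{
	if (total_latency_us <= 15000) {	/* primary latency tolerance */
		*itpt_ms = 100;			/* primary timeout */
		return true;
	}
	if (total_latency_us <= 100000) {	/* secondary latency tolerance */
		*itpt_ms = 2000;		/* secondary timeout */
		return true;
	}
	return false;				/* state too slow to be used */
}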
2021-04-28 17:27:36 +08:00
|
|
|
* This state is good. It can be used as the APST idle target
|
|
|
|
* for higher power states.
|
2021-04-09 14:47:44 +08:00
|
|
|
*/
|
2021-04-28 17:27:36 +08:00
|
|
|
if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
|
|
|
|
if (!nvme_apst_get_transition_time(total_latency_us,
|
|
|
|
&transition_ms, &last_lt_index))
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
transition_ms = total_latency_us + 19;
|
|
|
|
do_div(transition_ms, 20);
|
|
|
|
if (transition_ms > (1 << 24) - 1)
|
|
|
|
transition_ms = (1 << 24) - 1;
|
|
|
|
}
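/*
 * Worked example for the legacy branch above: a state with a total latency
 * of 10,000 us gives transition_ms = (10000 + 19) / 20 = 500, i.e. an idle
 * timeout of 500 ms, roughly 50 times the state's total latency.
 */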
|
2021-04-09 14:47:44 +08:00
|
|
|
|
|
|
|
target = cpu_to_le64((state << 3) | (transition_ms << 8));
|
|
|
|
if (max_ps == -1)
|
|
|
|
max_ps = state;
|
|
|
|
if (total_latency_us > max_lat_us)
|
|
|
|
max_lat_us = total_latency_us;
|
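/*
 * Layout assumed for the entry built above (per the NVMe APST data
 * structure): bits 07:03 carry the idle transition power state and bits
 * 31:08 the idle time prior to transition in milliseconds.  For example,
 * state 4 with a 2000 ms idle time encodes as
 * (4 << 3) | (2000 << 8) == 0x0007d020.
 */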
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
if (max_ps == -1)
|
|
|
|
dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
|
|
|
|
else
|
|
|
|
dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
|
|
|
|
max_ps, max_lat_us, (int)sizeof(*table), table);
|
|
|
|
apste = 1;
|
|
|
|
|
|
|
|
done:
|
2017-02-08 02:08:45 +08:00
|
|
|
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
|
|
|
|
table, sizeof(*table), NULL);
|
|
|
|
if (ret)
|
|
|
|
dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
|
|
|
|
kfree(table);
|
2017-08-10 17:23:31 +08:00
|
|
|
return ret;
|
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_set_latency_tolerance(struct device *dev, s32 val)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
u64 latency;
|
|
|
|
|
|
|
|
switch (val) {
|
|
|
|
case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
|
|
|
|
case PM_QOS_LATENCY_ANY:
|
|
|
|
latency = U64_MAX;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
latency = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctrl->ps_max_latency_us != latency) {
|
|
|
|
ctrl->ps_max_latency_us = latency;
|
2021-04-09 17:46:12 +08:00
|
|
|
if (ctrl->state == NVME_CTRL_LIVE)
|
|
|
|
nvme_configure_apst(ctrl);
|
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
}
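/*
 * Usage note, following the behavior described in the APST commit message:
 * writing 0 to power/pm_qos_latency_tolerance_us in sysfs reaches this
 * callback with val == 0, so ps_max_latency_us becomes 0 and the subsequent
 * nvme_configure_apst() call disables APST; writing "auto" (where dev_pm_qos
 * supports it) maps to PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT and removes
 * the latency cap entirely.
 */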
|
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
struct nvme_core_quirk_entry {
|
|
|
|
/*
|
|
|
|
* NVMe model and firmware strings are padded with spaces. For
|
|
|
|
* simplicity, strings in the quirk table are padded with NULLs
|
|
|
|
* instead.
|
|
|
|
*/
|
|
|
|
u16 vid;
|
|
|
|
const char *mn;
|
|
|
|
const char *fr;
|
|
|
|
unsigned long quirks;
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct nvme_core_quirk_entry core_quirks[] = {
|
2017-02-08 02:08:45 +08:00
|
|
|
{
|
2017-04-21 04:37:56 +08:00
|
|
|
/*
|
|
|
|
* This Toshiba device seems to die using any APST states. See:
|
|
|
|
* https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
|
|
|
|
*/
|
|
|
|
.vid = 0x1179,
|
|
|
|
.mn = "THNSF5256GPUK TOSHIBA",
|
2017-02-08 02:08:45 +08:00
|
|
|
.quirks = NVME_QUIRK_NO_APST,
|
2019-08-17 04:16:19 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This LiteON CL1-3D*-Q11 firmware version has a race
|
|
|
|
* condition associated with actions related to suspend to idle.
|
|
|
|
* LiteON has resolved the problem in future firmware.
|
|
|
|
*/
|
|
|
|
.vid = 0x14a4,
|
|
|
|
.fr = "22301111",
|
|
|
|
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
|
2021-11-06 10:08:57 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This Kioxia CD6-V Series / HPE PE8030 device times out and
|
|
|
|
* aborts I/O during any load, but more easily reproducible
|
|
|
|
* with discards (fstrim).
|
|
|
|
*
|
|
|
|
* The device is left in a state where it is also not possible
|
|
|
|
* to use "nvme set-feature" to disable APST, but booting with
|
|
|
|
* nvme_core.default_ps_max_latency_us=0 works.
|
|
|
|
*/
|
|
|
|
.vid = 0x1e0f,
|
|
|
|
.mn = "KCD6XVUL6T40",
|
|
|
|
.quirks = NVME_QUIRK_NO_APST,
|
2022-06-17 16:29:42 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The external Samsung X5 SSD fails initialization without a
|
|
|
|
* delay before checking if it is ready and has a whole set of
|
|
|
|
* other problems. To make this even more interesting, it
|
|
|
|
* shares the PCI ID with internal Samsung 970 Evo Plus that
|
|
|
|
* does not need or want these quirks.
|
|
|
|
*/
|
|
|
|
.vid = 0x144d,
|
|
|
|
.mn = "Samsung Portable SSD X5",
|
|
|
|
.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
|
|
|
|
NVME_QUIRK_NO_DEEPEST_PS |
|
|
|
|
NVME_QUIRK_IGNORE_DEV_SUBNQN,
|
2017-04-21 04:37:56 +08:00
|
|
|
}
|
2017-02-23 04:32:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* match is null-terminated but idstr is space-padded. */
|
|
|
|
static bool string_matches(const char *idstr, const char *match, size_t len)
|
|
|
|
{
|
|
|
|
size_t matchlen;
|
|
|
|
|
|
|
|
if (!match)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
matchlen = strlen(match);
|
|
|
|
WARN_ON_ONCE(matchlen > len);
|
|
|
|
|
|
|
|
if (memcmp(idstr, match, matchlen))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (; matchlen < len; matchlen++)
|
|
|
|
if (idstr[matchlen] != ' ')
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
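/*
 * Illustration with hypothetical identify data: if id->mn holds
 * "THNSF5256GPUK TOSHIBA" followed by spaces up to the end of the field,
 * string_matches(id->mn, "THNSF5256GPUK TOSHIBA", sizeof(id->mn)) returns
 * true, while a match string of just "THNSF5256GPUK" returns false because
 * the remaining bytes (" TOSHIBA") are not all spaces.
 */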
|
|
|
|
|
|
|
|
static bool quirk_matches(const struct nvme_id_ctrl *id,
|
|
|
|
const struct nvme_core_quirk_entry *q)
|
|
|
|
{
|
|
|
|
return q->vid == le16_to_cpu(id->vid) &&
|
|
|
|
string_matches(id->mn, q->mn, sizeof(id->mn)) &&
|
|
|
|
string_matches(id->fr, q->fr, sizeof(id->fr));
|
|
|
|
}
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_id_ctrl *id)
|
2017-06-26 18:39:02 +08:00
|
|
|
{
|
|
|
|
size_t nqnlen;
|
|
|
|
int off;
|
|
|
|
|
2019-01-09 01:20:51 +08:00
|
|
|
if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
|
|
|
|
nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
|
|
|
|
if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
|
2022-08-19 05:00:52 +08:00
|
|
|
strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
|
2019-01-09 01:20:51 +08:00
|
|
|
return;
|
|
|
|
}
|
2017-06-26 18:39:02 +08:00
|
|
|
|
2019-01-09 01:20:51 +08:00
|
|
|
if (ctrl->vs >= NVME_VS(1, 2, 1))
|
|
|
|
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
|
|
|
|
}
|
2017-06-26 18:39:02 +08:00
|
|
|
|
2022-08-11 15:40:24 +08:00
|
|
|
/*
|
|
|
|
* Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
|
|
|
|
* Base Specification 2.0. It is slightly different from the format
|
|
|
|
* specified there due to historic reasons, and we can't change it now.
|
|
|
|
*/
|
2017-11-09 20:48:55 +08:00
|
|
|
off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
|
2019-01-09 00:37:43 +08:00
|
|
|
"nqn.2014.08.org.nvmexpress:%04x%04x",
|
2017-06-26 18:39:02 +08:00
|
|
|
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
|
2017-11-09 20:48:55 +08:00
|
|
|
memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
|
2017-06-26 18:39:02 +08:00
|
|
|
off += sizeof(id->sn);
|
2017-11-09 20:48:55 +08:00
|
|
|
memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
|
2017-06-26 18:39:02 +08:00
|
|
|
off += sizeof(id->mn);
|
2017-11-09 20:48:55 +08:00
|
|
|
memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
|
|
|
|
}
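/*
 * Example with hypothetical identify data: vid = 0x1179 and ssvid = 0x1179
 * produce the prefix "nqn.2014.08.org.nvmexpress:11791179", after which the
 * serial number and model number fields are appended verbatim, including
 * their space padding, and the rest of the buffer is zeroed by the memset
 * above.
 */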
|
|
|
|
|
2019-07-19 07:53:50 +08:00
|
|
|
static void nvme_release_subsystem(struct device *dev)
|
2017-11-09 20:48:55 +08:00
|
|
|
{
|
2019-07-19 07:53:50 +08:00
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
2019-09-06 00:33:54 +08:00
|
|
|
if (subsys->instance >= 0)
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_instance_ida, subsys->instance);
|
2017-11-09 20:48:55 +08:00
|
|
|
kfree(subsys);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_destroy_subsystem(struct kref *ref)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(ref, struct nvme_subsystem, ref);
|
|
|
|
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
list_del(&subsys->entry);
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
ida_destroy(&subsys->ns_ida);
|
2017-11-09 20:48:55 +08:00
|
|
|
device_del(&subsys->dev);
|
|
|
|
put_device(&subsys->dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_put_subsystem(struct nvme_subsystem *subsys)
|
|
|
|
{
|
|
|
|
kref_put(&subsys->ref, nvme_destroy_subsystem);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys;
|
|
|
|
|
|
|
|
lockdep_assert_held(&nvme_subsystems_lock);
|
|
|
|
|
2019-09-04 05:20:37 +08:00
|
|
|
/*
|
|
|
|
* Fail matches for discovery subsystems. This results
|
|
|
|
* in each discovery controller being bound to a unique subsystem.
|
|
|
|
* This avoids issues with validating controller values
|
|
|
|
* that can only be true when there is a single unique subsystem.
|
|
|
|
* There may be multiple and completely independent entities
|
|
|
|
* that provide discovery controllers.
|
|
|
|
*/
|
|
|
|
if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
|
|
|
|
return NULL;
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
list_for_each_entry(subsys, &nvme_subsystems, entry) {
|
|
|
|
if (strcmp(subsys->subnqn, subsysnqn))
|
|
|
|
continue;
|
|
|
|
if (!kref_get_unless_zero(&subsys->ref))
|
|
|
|
continue;
|
|
|
|
return subsys;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2017-11-10 17:58:23 +08:00
|
|
|
#define SUBSYS_ATTR_RO(_name, _mode, _show) \
|
|
|
|
struct device_attribute subsys_attr_##_name = \
|
|
|
|
__ATTR(_name, _mode, _show, NULL)
|
|
|
|
|
|
|
|
static ssize_t nvme_subsys_show_nqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", subsys->subnqn);
|
2017-11-10 17:58:23 +08:00
|
|
|
}
|
|
|
|
static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
|
|
|
|
|
2021-09-22 14:35:23 +08:00
|
|
|
static ssize_t nvme_subsys_show_type(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
|
|
|
switch (subsys->subtype) {
|
|
|
|
case NVME_NQN_DISC:
|
|
|
|
return sysfs_emit(buf, "discovery\n");
|
|
|
|
case NVME_NQN_NVME:
|
|
|
|
return sysfs_emit(buf, "nvm\n");
|
|
|
|
default:
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
|
|
|
|
|
2017-11-10 17:58:23 +08:00
|
|
|
#define nvme_subsys_show_str_function(field) \
|
|
|
|
static ssize_t subsys_##field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_subsystem *subsys = \
|
|
|
|
container_of(dev, struct nvme_subsystem, dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%.*s\n", \
|
|
|
|
(int)sizeof(subsys->field), subsys->field); \
|
2017-11-10 17:58:23 +08:00
|
|
|
} \
|
|
|
|
static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
|
|
|
|
|
|
|
|
nvme_subsys_show_str_function(model);
|
|
|
|
nvme_subsys_show_str_function(serial);
|
|
|
|
nvme_subsys_show_str_function(firmware_rev);
|
|
|
|
|
|
|
|
static struct attribute *nvme_subsys_attrs[] = {
|
|
|
|
&subsys_attr_model.attr,
|
|
|
|
&subsys_attr_serial.attr,
|
|
|
|
&subsys_attr_firmware_rev.attr,
|
|
|
|
&subsys_attr_subsysnqn.attr,
|
2021-09-22 14:35:23 +08:00
|
|
|
&subsys_attr_subsystype.attr,
|
2019-02-18 18:43:26 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
&subsys_attr_iopolicy.attr,
|
|
|
|
#endif
|
2017-11-10 17:58:23 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2021-01-09 07:41:47 +08:00
|
|
|
static const struct attribute_group nvme_subsys_attrs_group = {
|
2017-11-10 17:58:23 +08:00
|
|
|
.attrs = nvme_subsys_attrs,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct attribute_group *nvme_subsys_attrs_groups[] = {
|
|
|
|
&nvme_subsys_attrs_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
return ctrl->opts && ctrl->opts->discovery_nqn;
|
|
|
|
}
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
|
|
|
|
struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
2018-01-04 23:56:14 +08:00
|
|
|
{
|
2019-05-09 15:01:26 +08:00
|
|
|
struct nvme_ctrl *tmp;
|
2018-01-04 23:56:14 +08:00
|
|
|
|
2019-05-08 15:48:27 +08:00
|
|
|
lockdep_assert_held(&nvme_subsystems_lock);
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
|
2020-03-10 22:39:10 +08:00
|
|
|
if (nvme_state_terminal(tmp))
|
2019-05-09 15:01:26 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
if (tmp->cntlid == ctrl->cntlid) {
|
|
|
|
dev_err(ctrl->device,
|
2021-11-30 00:24:34 +08:00
|
|
|
"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
|
|
|
|
ctrl->cntlid, dev_name(tmp->device),
|
|
|
|
subsys->subnqn);
|
2019-05-09 15:01:26 +08:00
|
|
|
return false;
|
|
|
|
}
|
2018-01-04 23:56:14 +08:00
|
|
|
|
2020-04-04 01:53:46 +08:00
|
|
|
if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
|
2021-01-14 08:00:22 +08:00
|
|
|
nvme_discovery_ctrl(ctrl))
|
2019-05-09 15:01:26 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Subsystem does not support multiple controllers\n");
|
|
|
|
return false;
|
2018-01-04 23:56:14 +08:00
|
|
|
}
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
return true;
|
2018-01-04 23:56:14 +08:00
|
|
|
}
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys, *found;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
|
|
|
|
if (!subsys)
|
|
|
|
return -ENOMEM;
|
2019-09-06 00:33:54 +08:00
|
|
|
|
|
|
|
subsys->instance = -1;
|
2017-11-09 20:48:55 +08:00
|
|
|
mutex_init(&subsys->lock);
|
|
|
|
kref_init(&subsys->ref);
|
|
|
|
INIT_LIST_HEAD(&subsys->ctrls);
|
2017-11-09 20:50:43 +08:00
|
|
|
INIT_LIST_HEAD(&subsys->nsheads);
|
2017-11-09 20:48:55 +08:00
|
|
|
nvme_init_subnqn(subsys, ctrl, id);
|
|
|
|
memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
|
|
|
|
memcpy(subsys->model, id->mn, sizeof(subsys->model));
|
|
|
|
subsys->vendor_id = le16_to_cpu(id->vid);
|
|
|
|
subsys->cmic = id->cmic;
|
2021-09-22 14:35:23 +08:00
|
|
|
|
|
|
|
/* Versions prior to 1.4 don't necessarily report a valid type */
|
|
|
|
if (id->cntrltype == NVME_CTRL_DISC ||
|
|
|
|
!strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
|
|
|
|
subsys->subtype = NVME_NQN_DISC;
|
|
|
|
else
|
|
|
|
subsys->subtype = NVME_NQN_NVME;
|
|
|
|
|
2021-09-22 14:35:24 +08:00
|
|
|
if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Subsystem %s is not a discovery controller",
|
|
|
|
subsys->subnqn);
|
|
|
|
kfree(subsys);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2019-06-29 00:53:31 +08:00
|
|
|
subsys->awupf = le16_to_cpu(id->awupf);
|
2021-12-20 20:51:45 +08:00
|
|
|
nvme_mpath_default_iopolicy(subsys);
|
2017-11-09 20:48:55 +08:00
|
|
|
|
|
|
|
subsys->dev.class = nvme_subsys_class;
|
|
|
|
subsys->dev.release = nvme_release_subsystem;
|
2017-11-10 17:58:23 +08:00
|
|
|
subsys->dev.groups = nvme_subsys_attrs_groups;
|
2019-09-06 00:33:54 +08:00
|
|
|
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
|
2017-11-09 20:48:55 +08:00
|
|
|
device_initialize(&subsys->dev);
|
|
|
|
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
found = __nvme_find_get_subsystem(subsys->subnqn);
|
|
|
|
if (found) {
|
2019-07-19 07:53:50 +08:00
|
|
|
put_device(&subsys->dev);
|
2017-11-09 20:48:55 +08:00
|
|
|
subsys = found;
|
2019-05-08 15:48:27 +08:00
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
if (!nvme_validate_cntlid(subsys, ctrl, id)) {
|
2017-11-09 20:48:55 +08:00
|
|
|
ret = -EINVAL;
|
2019-05-08 15:48:27 +08:00
|
|
|
goto out_put_subsystem;
|
2017-11-09 20:48:55 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = device_add(&subsys->dev);
|
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"failed to register subsystem device.\n");
|
2019-08-01 07:35:34 +08:00
|
|
|
put_device(&subsys->dev);
|
2017-11-09 20:48:55 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
ida_init(&subsys->ns_ida);
|
2017-11-09 20:48:55 +08:00
|
|
|
list_add_tail(&subsys->entry, &nvme_subsystems);
|
|
|
|
}
|
|
|
|
|
2019-09-23 22:18:36 +08:00
|
|
|
ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
|
|
|
|
dev_name(ctrl->device));
|
|
|
|
if (ret) {
|
2017-11-09 20:48:55 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"failed to create sysfs link from subsystem.\n");
|
2019-05-08 15:48:27 +08:00
|
|
|
goto out_put_subsystem;
|
2017-11-09 20:48:55 +08:00
|
|
|
}
|
|
|
|
|
2019-09-06 00:33:54 +08:00
|
|
|
if (!found)
|
|
|
|
subsys->instance = ctrl->instance;
|
2019-05-08 15:48:27 +08:00
|
|
|
ctrl->subsys = subsys;
|
2017-11-09 20:48:55 +08:00
|
|
|
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
|
2019-05-08 15:48:27 +08:00
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
2017-11-09 20:48:55 +08:00
|
|
|
return 0;
|
|
|
|
|
2019-05-08 15:48:27 +08:00
|
|
|
out_put_subsystem:
|
|
|
|
nvme_put_subsystem(subsys);
|
2017-11-09 20:48:55 +08:00
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
return ret;
|
2017-06-26 18:39:02 +08:00
|
|
|
}
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
|
2018-06-06 20:39:00 +08:00
|
|
|
void *log, size_t size, u64 offset)
|
2017-11-08 01:28:31 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
2020-04-04 00:24:01 +08:00
|
|
|
u32 dwlen = nvme_bytes_to_numd(size);
|
2018-02-26 20:55:40 +08:00
|
|
|
|
|
|
|
c.get_log_page.opcode = nvme_admin_get_log_page;
|
2018-06-06 20:39:00 +08:00
|
|
|
c.get_log_page.nsid = cpu_to_le32(nsid);
|
2018-02-26 20:55:40 +08:00
|
|
|
c.get_log_page.lid = log_page;
|
2018-06-06 20:39:00 +08:00
|
|
|
c.get_log_page.lsp = lsp;
|
2018-02-26 20:55:40 +08:00
|
|
|
c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
|
|
|
|
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
|
2018-04-12 23:16:03 +08:00
|
|
|
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
|
|
|
|
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
|
2020-06-30 03:06:40 +08:00
|
|
|
c.get_log_page.csi = csi;
|
2017-11-08 01:28:31 +08:00
|
|
|
|
|
|
|
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
|
|
|
|
}
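/*
 * Worked example, assuming nvme_bytes_to_numd() returns the zero-based dword
 * count: a 4096-byte log gives dwlen = 4096 / 4 - 1 = 1023 (0x3ff), so the
 * command above carries numdl = 0x03ff and numdu = 0x0000.
 */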
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
|
|
|
|
struct nvme_effects_log **log)
|
2017-11-08 01:28:32 +08:00
|
|
|
{
|
2020-11-14 02:28:30 +08:00
|
|
|
struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
|
2017-11-08 01:28:32 +08:00
|
|
|
int ret;
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
if (cel)
|
|
|
|
goto out;
|
2017-11-08 01:28:32 +08:00
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
cel = kzalloc(sizeof(*cel), GFP_KERNEL);
|
|
|
|
if (!cel)
|
|
|
|
return -ENOMEM;
|
2017-11-08 01:28:32 +08:00
|
|
|
|
2020-09-23 03:49:38 +08:00
|
|
|
ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
|
2020-11-14 02:28:30 +08:00
|
|
|
cel, sizeof(*cel), 0);
|
2017-11-08 01:28:32 +08:00
|
|
|
if (ret) {
|
2020-06-30 03:06:40 +08:00
|
|
|
kfree(cel);
|
|
|
|
return ret;
|
2017-11-08 01:28:32 +08:00
|
|
|
}
|
2020-06-30 03:06:40 +08:00
|
|
|
|
2020-11-14 02:28:30 +08:00
|
|
|
xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
|
2020-06-30 03:06:40 +08:00
|
|
|
out:
|
2020-11-14 02:28:30 +08:00
|
|
|
*log = cel;
|
2020-06-30 03:06:40 +08:00
|
|
|
return 0;
|
2017-06-26 18:39:02 +08:00
|
|
|
}
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
|
2015-11-28 22:37:52 +08:00
|
|
|
{
|
2021-04-03 00:58:20 +08:00
|
|
|
u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2021-04-03 00:58:20 +08:00
|
|
|
if (check_shl_overflow(1U, units + page_shift - 9, &val))
|
|
|
|
return UINT_MAX;
|
|
|
|
return val;
|
2021-03-25 07:18:05 +08:00
|
|
|
}
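/*
 * Worked example: with CAP.MPSMIN = 0 (a 4 KiB minimum page size) the
 * page_shift above is 12, so a field value of 5 yields
 * 1 << (5 + 12 - 9) = 256 sectors of 512 bytes, i.e. a 128 KiB limit.
 */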
|
|
|
|
|
|
|
|
static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
struct nvme_id_ctrl_nvm *id;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
|
|
|
|
ctrl->max_discard_sectors = UINT_MAX;
|
|
|
|
ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
|
|
|
|
} else {
|
|
|
|
ctrl->max_discard_sectors = 0;
|
|
|
|
ctrl->max_discard_segments = 0;
|
2015-11-28 22:40:19 +08:00
|
|
|
}
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
/*
|
|
|
|
* Even though the NVMe spec explicitly states that MDTS is not applicable
|
|
|
|
* to the write-zeroes, we are cautious and limit the size to the
|
|
|
|
* controller's max_hw_sectors value, which is based on the MDTS field
|
|
|
|
* and possibly other limiting factors.
|
|
|
|
*/
|
|
|
|
if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
|
|
|
|
!(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
|
|
|
|
ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
|
|
|
|
else
|
|
|
|
ctrl->max_zeroes_sectors = 0;
|
|
|
|
|
2023-03-16 19:50:09 +08:00
|
|
|
if (ctrl->subsys->subtype != NVME_NQN_NVME ||
|
|
|
|
nvme_ctrl_limited_cns(ctrl))
|
2021-03-25 07:18:05 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
|
|
|
if (!id)
|
2022-11-03 01:17:08 +08:00
|
|
|
return -ENOMEM;
|
2021-03-25 07:18:05 +08:00
|
|
|
|
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.cns = NVME_ID_CNS_CS_CTRL;
|
|
|
|
c.identify.csi = NVME_CSI_NVM;
|
|
|
|
|
|
|
|
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
|
|
|
|
if (ret)
|
|
|
|
goto free_data;
|
|
|
|
|
|
|
|
if (id->dmrl)
|
|
|
|
ctrl->max_discard_segments = id->dmrl;
|
2022-04-29 12:52:43 +08:00
|
|
|
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
|
2021-03-25 07:18:05 +08:00
|
|
|
if (id->wzsl)
|
|
|
|
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
|
|
|
|
|
|
|
|
free_data:
|
|
|
|
kfree(id);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2023-01-28 00:56:19 +08:00
|
|
|
static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_effects_log *log = ctrl->effects;
|
|
|
|
|
|
|
|
log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
|
|
|
|
NVME_CMD_EFFECTS_NCC |
|
|
|
|
NVME_CMD_EFFECTS_CSE_MASK);
|
|
|
|
log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
|
|
|
|
NVME_CMD_EFFECTS_CSE_MASK);
|
|
|
|
|
2023-01-28 00:56:20 +08:00
|
|
|
/*
|
|
|
|
* The spec says the result of a security receive command depends on
|
|
|
|
* the previous security send command. As such, many vendors log this
|
|
|
|
* command as one to be submitted only when no other commands to the same
|
|
|
|
* namespace are outstanding. The intention is to tell the host to
|
|
|
|
* prevent mixing security send and receive.
|
|
|
|
*
|
|
|
|
* This driver can only enforce such exclusive access against IO
|
|
|
|
* queues, though. We are not readily able to enforce such a rule for
|
|
|
|
* two commands to the admin queue, which is the only queue that
|
|
|
|
* matters for this command.
|
|
|
|
*
|
|
|
|
* Rather than blindly freezing the IO queues for this effect that
|
|
|
|
* doesn't even apply to IO, mask it off.
|
|
|
|
*/
|
2023-02-24 23:34:24 +08:00
|
|
|
log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
|
2023-01-28 00:56:20 +08:00
|
|
|
|
2023-01-28 00:56:19 +08:00
|
|
|
log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
|
|
|
|
log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
|
|
|
|
log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (ctrl->effects)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
|
|
|
|
ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!ctrl->effects) {
|
|
|
|
ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
|
|
|
|
if (!ctrl->effects)
|
|
|
|
return -ENOMEM;
|
|
|
|
xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
nvme_init_known_nvm_effects(ctrl);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-03-01 10:06:05 +08:00
|
|
|
static int nvme_init_identify(struct nvme_ctrl *ctrl)
|
2015-11-28 22:37:52 +08:00
|
|
|
{
|
|
|
|
struct nvme_id_ctrl *id;
|
2016-06-07 05:20:48 +08:00
|
|
|
u32 max_hw_sectors;
|
2017-06-27 04:39:54 +08:00
|
|
|
bool prev_apst_enabled;
|
2021-03-25 07:18:05 +08:00
|
|
|
int ret;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2015-11-28 22:37:52 +08:00
|
|
|
ret = nvme_identify_ctrl(ctrl, &id);
|
|
|
|
if (ret) {
|
2016-02-10 23:51:15 +08:00
|
|
|
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
|
2015-11-28 22:37:52 +08:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2019-08-14 22:26:10 +08:00
|
|
|
if (!(ctrl->ops->flags & NVME_F_FABRICS))
|
|
|
|
ctrl->cntlid = le16_to_cpu(id->cntlid);
|
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
if (!ctrl->identified) {
|
2021-03-01 10:06:05 +08:00
|
|
|
unsigned int i;
|
2017-11-09 20:48:55 +08:00
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
/*
|
|
|
|
* Check for quirks. Quirk can depend on firmware version,
|
|
|
|
* so, in principle, the set of quirks present can change
|
|
|
|
* across a reset. As a possible future enhancement, we
|
|
|
|
* could re-scan for quirks every time we reinitialize
|
|
|
|
* the device, but we'd have to make sure that the driver
|
|
|
|
* behaves intelligently if the quirks change.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
|
|
|
|
if (quirk_matches(id, &core_quirks[i]))
|
|
|
|
ctrl->quirks |= core_quirks[i].quirks;
|
|
|
|
}
|
2022-12-01 20:52:34 +08:00
|
|
|
|
|
|
|
ret = nvme_init_subsystem(ctrl, id);
|
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
2023-01-28 00:56:19 +08:00
|
|
|
|
|
|
|
ret = nvme_init_effects(ctrl, id);
|
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
2017-02-23 04:32:36 +08:00
|
|
|
}
|
2022-09-20 03:45:08 +08:00
|
|
|
memcpy(ctrl->subsys->firmware_rev, id->fr,
|
|
|
|
sizeof(ctrl->subsys->firmware_rev));
|
2017-02-23 04:32:36 +08:00
|
|
|
|
2017-04-22 07:19:24 +08:00
|
|
|
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
|
2017-04-22 07:19:24 +08:00
|
|
|
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
|
|
|
|
}
|
|
|
|
|
2018-11-28 00:40:57 +08:00
|
|
|
ctrl->crdt[0] = le16_to_cpu(id->crdt1);
|
|
|
|
ctrl->crdt[1] = le16_to_cpu(id->crdt2);
|
|
|
|
ctrl->crdt[2] = le16_to_cpu(id->crdt3);
|
|
|
|
|
2017-02-17 20:59:40 +08:00
|
|
|
ctrl->oacs = le16_to_cpu(id->oacs);
|
2019-02-25 19:00:04 +08:00
|
|
|
ctrl->oncs = le16_to_cpu(id->oncs);
|
2019-05-21 01:13:04 +08:00
|
|
|
ctrl->mtfa = le16_to_cpu(id->mtfa);
|
2018-05-22 17:09:55 +08:00
|
|
|
ctrl->oaes = le32_to_cpu(id->oaes);
|
2019-11-06 22:35:18 +08:00
|
|
|
ctrl->wctemp = le16_to_cpu(id->wctemp);
|
|
|
|
ctrl->cctemp = le16_to_cpu(id->cctemp);
|
|
|
|
|
2015-11-20 16:36:44 +08:00
|
|
|
atomic_set(&ctrl->abort_limit, id->acl + 1);
|
2015-11-28 22:37:52 +08:00
|
|
|
ctrl->vwc = id->vwc;
|
|
|
|
if (id->mdts)
|
2021-03-25 07:18:05 +08:00
|
|
|
max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
|
2015-11-28 22:37:52 +08:00
|
|
|
else
|
2016-06-07 05:20:48 +08:00
|
|
|
max_hw_sectors = UINT_MAX;
|
|
|
|
ctrl->max_hw_sectors =
|
|
|
|
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2016-03-03 01:07:11 +08:00
|
|
|
nvme_set_queue_limits(ctrl, ctrl->admin_q);
|
2016-06-13 22:45:26 +08:00
|
|
|
ctrl->sgls = le32_to_cpu(id->sgls);
|
2016-06-13 22:45:28 +08:00
|
|
|
ctrl->kas = le16_to_cpu(id->kas);
|
2018-05-14 14:48:54 +08:00
|
|
|
ctrl->max_namespaces = le32_to_cpu(id->mnan);
|
2018-11-03 01:28:14 +08:00
|
|
|
ctrl->ctratt = le32_to_cpu(id->ctratt);
|
2016-06-13 22:45:26 +08:00
|
|
|
|
2022-02-09 03:33:46 +08:00
|
|
|
ctrl->cntrltype = id->cntrltype;
|
|
|
|
ctrl->dctype = id->dctype;
|
|
|
|
|
2017-08-26 07:14:50 +08:00
|
|
|
if (id->rtd3e) {
|
|
|
|
/* us -> s */
|
2020-06-24 14:49:58 +08:00
|
|
|
u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
|
2017-08-26 07:14:50 +08:00
|
|
|
|
|
|
|
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
|
|
|
|
shutdown_timeout, 60);
|
|
|
|
|
|
|
|
if (ctrl->shutdown_timeout != shutdown_timeout)
|
2017-12-31 21:33:27 +08:00
|
|
|
dev_info(ctrl->device,
|
2017-08-26 07:14:50 +08:00
|
|
|
"Shutdown timeout set to %u seconds\n",
|
|
|
|
ctrl->shutdown_timeout);
|
|
|
|
} else
|
|
|
|
ctrl->shutdown_timeout = shutdown_timeout;
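/*
 * Worked example with a hypothetical value: an RTD3E of 90,000,000 us is
 * 90 seconds, which the clamp above caps at 60, so (with a module default
 * below 60) the controller would log "Shutdown timeout set to 60 seconds".
 */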
|
|
|
|
|
2017-02-08 02:08:45 +08:00
|
|
|
ctrl->npss = id->npss;
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apsta = id->apsta;
|
|
|
|
prev_apst_enabled = ctrl->apst_enabled;
|
2017-04-22 07:19:24 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_NO_APST) {
|
|
|
|
if (force_apst && id->apsta) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = true;
|
2017-04-22 07:19:24 +08:00
|
|
|
} else {
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = false;
|
2017-04-22 07:19:24 +08:00
|
|
|
}
|
|
|
|
} else {
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = id->apsta;
|
2017-04-22 07:19:24 +08:00
|
|
|
}
|
2017-02-08 02:08:45 +08:00
|
|
|
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
|
|
|
|
|
2017-05-20 21:14:44 +08:00
|
|
|
if (ctrl->ops->flags & NVME_F_FABRICS) {
|
2016-06-13 22:45:26 +08:00
|
|
|
ctrl->icdoff = le16_to_cpu(id->icdoff);
|
|
|
|
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
|
|
|
|
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
|
|
|
|
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In fabrics we need to verify the cntlid matches the
|
|
|
|
* admin connect
|
|
|
|
*/
|
2017-08-10 17:23:31 +08:00
|
|
|
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
|
2019-11-22 01:58:10 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Mismatching cntlid: Connect %u vs Identify "
|
|
|
|
"%u, rejecting\n",
|
|
|
|
ctrl->cntlid, le16_to_cpu(id->cntlid));
|
2016-06-13 22:45:26 +08:00
|
|
|
ret = -EINVAL;
|
2017-08-10 17:23:31 +08:00
|
|
|
goto out_free;
|
|
|
|
}
|
2016-06-13 22:45:28 +08:00
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_err(ctrl->device,
|
2016-06-13 22:45:28 +08:00
|
|
|
"keep-alive support is mandatory for fabrics\n");
|
|
|
|
ret = -EINVAL;
|
2017-08-10 17:23:31 +08:00
|
|
|
goto out_free;
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
2016-06-13 22:45:26 +08:00
|
|
|
} else {
|
2017-05-12 23:16:10 +08:00
|
|
|
ctrl->hmpre = le32_to_cpu(id->hmpre);
|
|
|
|
ctrl->hmmin = le32_to_cpu(id->hmmin);
|
2017-09-12 00:09:28 +08:00
|
|
|
ctrl->hmminds = le32_to_cpu(id->hmminds);
|
|
|
|
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
|
2016-06-13 22:45:26 +08:00
|
|
|
}
|
2016-03-03 01:07:11 +08:00
|
|
|
|
2021-04-29 20:18:53 +08:00
|
|
|
ret = nvme_mpath_init_identify(ctrl, id);
|
2018-05-14 14:48:54 +08:00
|
|
|
if (ret < 0)
|
2021-03-01 10:06:05 +08:00
|
|
|
goto out_free;
|
2018-05-14 14:48:54 +08:00
|
|
|
|
2017-06-27 04:39:54 +08:00
|
|
|
if (ctrl->apst_enabled && !prev_apst_enabled)
|
2017-02-08 02:08:45 +08:00
|
|
|
dev_pm_qos_expose_latency_tolerance(ctrl->device);
|
2017-06-27 04:39:54 +08:00
|
|
|
else if (!ctrl->apst_enabled && prev_apst_enabled)
|
2017-02-08 02:08:45 +08:00
|
|
|
dev_pm_qos_hide_latency_tolerance(ctrl->device);
|
|
|
|
|
2021-03-01 10:06:05 +08:00
|
|
|
out_free:
|
|
|
|
kfree(id);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the cached copies of the Identify data and various controller
|
|
|
|
* register in our nvme_ctrl structure. This should be called as soon as
|
|
|
|
* the admin queue is fully up and running.
|
|
|
|
*/
|
2022-11-08 22:48:27 +08:00
|
|
|
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
|
2021-03-01 10:06:05 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
|
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
|
|
|
|
|
|
|
if (ctrl->vs >= NVME_VS(1, 1, 0))
|
|
|
|
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
|
|
|
|
|
|
|
|
ret = nvme_init_identify(ctrl);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2017-08-10 17:23:31 +08:00
|
|
|
ret = nvme_configure_apst(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2021-04-11 04:16:21 +08:00
|
|
|
|
2017-08-16 15:51:29 +08:00
|
|
|
ret = nvme_configure_timestamp(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2017-08-10 17:23:31 +08:00
|
|
|
|
2022-03-04 04:13:12 +08:00
|
|
|
ret = nvme_configure_host_options(ctrl);
|
2018-11-28 00:40:57 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2022-11-08 22:48:27 +08:00
|
|
|
nvme_configure_opal(ctrl, was_suspended);
|
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
|
2022-10-18 22:55:55 +08:00
|
|
|
/*
|
|
|
|
* Do not return errors unless we are in a controller reset;
|
|
|
|
* the controller works perfectly fine without hwmon.
|
|
|
|
*/
|
2020-09-17 23:50:25 +08:00
|
|
|
ret = nvme_hwmon_init(ctrl);
|
2022-10-18 22:55:55 +08:00
|
|
|
if (ret == -EINTR)
|
2020-09-17 23:50:25 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2019-11-06 22:35:18 +08:00
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
ctrl->identified = true;
|
2017-02-08 02:08:45 +08:00
|
|
|
|
2017-08-10 17:23:31 +08:00
|
|
|
return 0;
|
2015-11-28 22:37:52 +08:00
|
|
|
}
|
2021-03-01 10:06:04 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
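/*
 * Usage sketch (illustrative only, not part of the driver): a transport
 * calls nvme_init_ctrl_finish() once its admin queue can service commands,
 * typically right after nvme_enable_ctrl() succeeds during probe or reset:
 *
 *	ret = nvme_enable_ctrl(ctrl);
 *	if (ret)
 *		return ret;
 *	...				(transport-specific admin queue setup)
 *	ret = nvme_init_ctrl_finish(ctrl, false);
 *
 * The second argument only matters when coming back from a suspend, where it
 * is used above when reconfiguring OPAL.
 */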
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
static int nvme_dev_open(struct inode *inode, struct file *file)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2017-10-18 22:59:25 +08:00
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2018-01-06 08:01:58 +08:00
|
|
|
switch (ctrl->state) {
|
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
break;
|
|
|
|
default:
|
2017-10-18 22:59:25 +08:00
|
|
|
return -EWOULDBLOCK;
|
2018-01-06 08:01:58 +08:00
|
|
|
}
|
|
|
|
|
2020-09-16 11:53:25 +08:00
|
|
|
nvme_get_ctrl(ctrl);
|
2020-10-07 07:36:47 +08:00
|
|
|
if (!try_module_get(ctrl->ops->module)) {
|
|
|
|
nvme_put_ctrl(ctrl);
|
2020-09-16 11:53:25 +08:00
|
|
|
return -EINVAL;
|
2020-10-07 07:36:47 +08:00
|
|
|
}
|
2020-09-16 11:53:25 +08:00
|
|
|
|
2017-10-18 22:59:25 +08:00
|
|
|
file->private_data = ctrl;
|
2015-11-28 22:40:19 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-16 11:53:25 +08:00
|
|
|
static int nvme_dev_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
|
|
|
|
|
|
|
|
module_put(ctrl->ops->module);
|
|
|
|
nvme_put_ctrl(ctrl);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
static const struct file_operations nvme_dev_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = nvme_dev_open,
|
2020-09-16 11:53:25 +08:00
|
|
|
.release = nvme_dev_release,
|
2015-11-28 22:40:19 +08:00
|
|
|
.unlocked_ioctl = nvme_dev_ioctl,
|
2018-09-12 03:59:08 +08:00
|
|
|
.compat_ioctl = compat_ptr_ioctl,
|
2022-05-20 17:06:30 +08:00
|
|
|
.uring_cmd = nvme_dev_uring_cmd,
|
2015-11-28 22:40:19 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static ssize_t nvme_sysfs_reset(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
int ret;
|
|
|
|
|
2017-06-15 21:41:08 +08:00
|
|
|
ret = nvme_reset_ctrl_sync(ctrl);
|
2015-11-28 22:40:19 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return count;
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
2015-11-28 22:40:19 +08:00
|
|
|
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
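/*
 * Userspace example (illustrative, controller name assumed): any write to
 * this attribute triggers a synchronous controller reset, e.g.
 *
 *	echo 1 > /sys/class/nvme/nvme0/reset_controller
 */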
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2016-04-30 05:45:18 +08:00
|
|
|
static ssize_t nvme_sysfs_rescan(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
nvme_queue_scan(ctrl);
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
|
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
|
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
|
2020-12-01 20:56:09 +08:00
|
|
|
if (disk->fops == &nvme_bdev_ops)
|
2017-11-09 20:51:03 +08:00
|
|
|
return nvme_get_ns_from_dev(dev)->head;
|
|
|
|
else
|
|
|
|
return disk->private_data;
|
|
|
|
}
|
|
|
|
|
2016-02-19 00:57:48 +08:00
|
|
|
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2016-02-19 00:57:48 +08:00
|
|
|
{
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_head *head = dev_to_ns_head(dev);
|
|
|
|
struct nvme_ns_ids *ids = &head->ids;
|
|
|
|
struct nvme_subsystem *subsys = head->subsys;
|
2017-11-09 20:48:55 +08:00
|
|
|
int serial_len = sizeof(subsys->serial);
|
|
|
|
int model_len = sizeof(subsys->model);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!uuid_is_null(&ids->uuid))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
|
2017-07-12 21:38:56 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
|
|
|
|
subsys->serial[serial_len - 1] == '\0'))
|
2016-02-19 00:57:48 +08:00
|
|
|
serial_len--;
|
2017-11-09 20:48:55 +08:00
|
|
|
while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
|
|
|
|
subsys->model[model_len - 1] == '\0'))
|
2016-02-19 00:57:48 +08:00
|
|
|
model_len--;
|
|
|
|
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
|
2017-11-09 20:48:55 +08:00
|
|
|
serial_len, subsys->serial, model_len, subsys->model,
|
2017-11-09 20:51:03 +08:00
|
|
|
head->ns_id);
|
2016-02-19 00:57:48 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(wwid);
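/*
 * Note on the output format: wwid_show() above prefers "uuid.<uuid>", then
 * "eui.<nguid>" or "eui.<eui64>", and only falls back to the composite
 * "nvme.<vendor id>-<serial>-<model>-<nsid>" form when the namespace
 * reported no unique identifier.
 */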
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-06-07 17:45:35 +08:00
|
|
|
static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2017-06-07 17:45:35 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
|
2015-12-23 01:10:45 +08:00
|
|
|
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
|
2017-06-07 17:45:35 +08:00
|
|
|
|
|
|
|
/* For backward compatibility expose the NGUID to userspace if
|
|
|
|
* we have no UUID set
|
|
|
|
*/
|
2017-11-09 20:50:16 +08:00
|
|
|
if (uuid_is_null(&ids->uuid)) {
|
2022-06-07 23:55:55 +08:00
|
|
|
dev_warn_ratelimited(dev,
|
|
|
|
"No UUID available providing old NGUID\n");
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", ids->nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
}
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", &ids->uuid);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(uuid);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(eui);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(nsid);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static struct attribute *nvme_ns_id_attrs[] = {
|
2016-02-19 00:57:48 +08:00
|
|
|
&dev_attr_wwid.attr,
|
2015-12-23 01:10:45 +08:00
|
|
|
&dev_attr_uuid.attr,
|
2017-06-07 17:45:35 +08:00
|
|
|
&dev_attr_nguid.attr,
|
2015-12-23 01:10:45 +08:00
|
|
|
&dev_attr_eui.attr,
|
|
|
|
&dev_attr_nsid.attr,
|
2018-05-14 14:48:54 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
&dev_attr_ana_grpid.attr,
|
|
|
|
&dev_attr_ana_state.attr,
|
|
|
|
#endif
|
2015-12-23 01:10:45 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
|
2015-12-23 01:10:45 +08:00
|
|
|
struct attribute *a, int n)
|
|
|
|
{
|
|
|
|
struct device *dev = container_of(kobj, struct device, kobj);
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
if (a == &dev_attr_uuid.attr) {
|
2017-09-29 03:33:23 +08:00
|
|
|
if (uuid_is_null(&ids->uuid) &&
|
2017-11-09 20:50:16 +08:00
|
|
|
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2017-06-07 17:45:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (a == &dev_attr_nguid.attr) {
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2015-12-23 01:10:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (a == &dev_attr_eui.attr) {
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
2015-12-23 01:10:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2018-05-14 14:48:54 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
|
2020-12-01 20:56:09 +08:00
|
|
|
if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
|
2018-05-14 14:48:54 +08:00
|
|
|
return 0;
|
|
|
|
if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2015-12-23 01:10:45 +08:00
|
|
|
return a->mode;
|
|
|
|
}
|
|
|
|
|
2018-10-09 05:28:39 +08:00
|
|
|
static const struct attribute_group nvme_ns_id_attr_group = {
|
2017-11-09 20:51:03 +08:00
|
|
|
.attrs = nvme_ns_id_attrs,
|
|
|
|
.is_visible = nvme_ns_id_attrs_are_visible,
|
2015-12-23 01:10:45 +08:00
|
|
|
};
|
|
|
|
|
2018-09-28 14:17:20 +08:00
|
|
|
const struct attribute_group *nvme_ns_id_attr_groups[] = {
|
|
|
|
&nvme_ns_id_attr_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2016-02-27 05:24:19 +08:00
|
|
|
#define nvme_show_str_function(field) \
|
2016-01-13 06:09:31 +08:00
|
|
|
static ssize_t field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%.*s\n", \
|
2017-11-09 20:48:55 +08:00
|
|
|
(int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
|
2016-01-13 06:09:31 +08:00
|
|
|
} \
|
|
|
|
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
nvme_show_str_function(model);
|
|
|
|
nvme_show_str_function(serial);
|
|
|
|
nvme_show_str_function(firmware_rev);
|
|
|
|
|
2016-02-27 05:24:19 +08:00
|
|
|
#define nvme_show_int_function(field) \
|
|
|
|
static ssize_t field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->field); \
|
2016-02-27 05:24:19 +08:00
|
|
|
} \
|
|
|
|
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
|
|
|
|
|
|
|
|
nvme_show_int_function(cntlid);
|
2018-11-16 16:22:29 +08:00
|
|
|
nvme_show_int_function(numa_node);
|
2019-09-25 05:22:08 +08:00
|
|
|
nvme_show_int_function(queue_count);
|
|
|
|
nvme_show_int_function(sqsize);
|
2021-04-16 19:46:21 +08:00
|
|
|
nvme_show_int_function(kato);
|
2016-01-13 06:09:31 +08:00
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_delete(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (device_remove_file_self(dev, attr))
|
2017-10-29 16:44:29 +08:00
|
|
|
nvme_delete_ctrl_sync(ctrl);
|
2016-06-13 22:45:24 +08:00
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
|
|
|
|
|
|
|
|
static ssize_t nvme_sysfs_show_transport(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->ops->name);
|
2016-06-13 22:45:24 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
|
|
|
|
|
2016-11-28 07:47:40 +08:00
|
|
|
static ssize_t nvme_sysfs_show_state(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
static const char *const state_name[] = {
|
|
|
|
[NVME_CTRL_NEW] = "new",
|
|
|
|
[NVME_CTRL_LIVE] = "live",
|
|
|
|
[NVME_CTRL_RESETTING] = "resetting",
|
2018-02-01 00:31:24 +08:00
|
|
|
[NVME_CTRL_CONNECTING] = "connecting",
|
2016-11-28 07:47:40 +08:00
|
|
|
[NVME_CTRL_DELETING] = "deleting",
|
2020-07-23 07:32:19 +08:00
|
|
|
[NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
|
2016-11-28 07:47:40 +08:00
|
|
|
[NVME_CTRL_DEAD] = "dead",
|
|
|
|
};
|
|
|
|
|
|
|
|
if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
|
|
|
|
state_name[ctrl->state])
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
|
2016-11-28 07:47:40 +08:00
|
|
|
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "unknown state\n");
|
2016-11-28 07:47:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
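/*
 * Userspace example (illustrative, controller name assumed):
 *
 *	cat /sys/class/nvme/nvme0/state
 *
 * prints one of the strings above, typically "live" once initialization has
 * completed, or "unknown state" for any state not named in the table.
 */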
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
|
2016-06-13 22:45:24 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
|
|
|
|
|
2020-02-08 09:13:53 +08:00
|
|
|
static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
|
2020-02-08 09:13:53 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
|
|
|
|
|
2020-02-08 09:13:54 +08:00
|
|
|
static ssize_t nvme_sysfs_show_hostid(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
|
2020-02-08 09:13:54 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_show_address(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
|
|
|
|
|
2020-07-05 15:57:55 +08:00
|
|
|
static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
|
|
|
|
if (ctrl->opts->max_reconnects == -1)
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n",
|
|
|
|
opts->max_reconnects * opts->reconnect_delay);
|
2020-07-05 15:57:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
int ctrl_loss_tmo, err;
|
|
|
|
|
|
|
|
err = kstrtoint(buf, 10, &ctrl_loss_tmo);
|
|
|
|
if (err)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2021-04-01 17:54:11 +08:00
|
|
|
if (ctrl_loss_tmo < 0)
|
2020-07-05 15:57:55 +08:00
|
|
|
opts->max_reconnects = -1;
|
|
|
|
else
|
|
|
|
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
|
|
|
|
opts->reconnect_delay);
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
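/*
 * Worked example (numbers are illustrative): with reconnect_delay = 10
 * seconds, writing 600 to ctrl_loss_tmo stores max_reconnects =
 * DIV_ROUND_UP(600, 10) = 60, and reading the attribute back reports
 * 60 * 10 = 600.  Writing a negative value sets max_reconnects = -1, which
 * reads back as "off".
 */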
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->opts->reconnect_delay == -1)
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
|
2020-07-05 15:57:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
unsigned int v;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = kstrtou32(buf, 10, &v);
|
2020-07-14 18:57:32 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2020-07-05 15:57:55 +08:00
|
|
|
|
|
|
|
ctrl->opts->reconnect_delay = v;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
|
|
|
|
|
2021-04-01 17:54:12 +08:00
|
|
|
static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->opts->fast_io_fail_tmo == -1)
|
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
int fast_io_fail_tmo, err;
|
|
|
|
|
|
|
|
err = kstrtoint(buf, 10, &fast_io_fail_tmo);
|
|
|
|
if (err)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (fast_io_fail_tmo < 0)
|
|
|
|
opts->fast_io_fail_tmo = -1;
|
|
|
|
else
|
|
|
|
opts->fast_io_fail_tmo = fast_io_fail_tmo;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
|
|
|
|
|
2022-02-09 03:33:46 +08:00
|
|
|
static ssize_t cntrltype_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
static const char * const type[] = {
|
|
|
|
[NVME_CTRL_IO] = "io\n",
|
|
|
|
[NVME_CTRL_DISC] = "discovery\n",
|
|
|
|
[NVME_CTRL_ADMIN] = "admin\n",
|
|
|
|
};
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
|
|
|
|
return sysfs_emit(buf, type[ctrl->cntrltype]);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR_RO(cntrltype);
|
|
|
|
|
|
|
|
static ssize_t dctype_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
static const char * const type[] = {
|
|
|
|
[NVME_DCTYPE_NOT_REPORTED] = "none\n",
|
|
|
|
[NVME_DCTYPE_DDC] = "ddc\n",
|
|
|
|
[NVME_DCTYPE_CDC] = "cdc\n",
|
|
|
|
};
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
|
|
|
|
return sysfs_emit(buf, type[ctrl->dctype]);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR_RO(dctype);
|
|
|
|
|
2022-06-27 17:52:02 +08:00
|
|
|
#ifdef CONFIG_NVME_AUTH
|
|
|
|
static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
|
|
|
|
if (!opts->dhchap_secret)
|
|
|
|
return sysfs_emit(buf, "none\n");
|
|
|
|
return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
char *dhchap_secret;
|
|
|
|
|
|
|
|
if (!ctrl->opts->dhchap_secret)
|
|
|
|
return -EINVAL;
|
|
|
|
if (count < 7)
|
|
|
|
return -EINVAL;
|
|
|
|
if (memcmp(buf, "DHHC-1:", 7))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
|
|
|
|
if (!dhchap_secret)
|
|
|
|
return -ENOMEM;
|
|
|
|
memcpy(dhchap_secret, buf, count);
|
|
|
|
nvme_auth_stop(ctrl);
|
|
|
|
if (strcmp(dhchap_secret, opts->dhchap_secret)) {
|
2022-11-13 19:24:11 +08:00
|
|
|
struct nvme_dhchap_key *key, *host_key;
|
2022-06-27 17:52:02 +08:00
|
|
|
int ret;
|
|
|
|
|
2022-11-13 19:24:11 +08:00
|
|
|
ret = nvme_auth_generate_key(dhchap_secret, &key);
|
2022-06-27 17:52:02 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
kfree(opts->dhchap_secret);
|
|
|
|
opts->dhchap_secret = dhchap_secret;
|
2022-11-13 19:24:11 +08:00
|
|
|
host_key = ctrl->host_key;
|
2022-11-13 19:24:20 +08:00
|
|
|
mutex_lock(&ctrl->dhchap_auth_mutex);
|
2022-11-13 19:24:11 +08:00
|
|
|
ctrl->host_key = key;
|
2022-11-13 19:24:20 +08:00
|
|
|
mutex_unlock(&ctrl->dhchap_auth_mutex);
|
2022-11-13 19:24:11 +08:00
|
|
|
nvme_auth_free_key(host_key);
|
2022-06-27 17:52:02 +08:00
|
|
|
}
|
|
|
|
/* Start re-authentication */
|
|
|
|
dev_info(ctrl->device, "re-authenticating controller\n");
|
|
|
|
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
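/*
 * Userspace example (illustrative, controller name and key assumed): the
 * store handler only replaces a secret on a controller that already has
 * one, and the value must carry the "DHHC-1:" prefix checked above, e.g.
 *
 *	echo "DHHC-1:00:<base64-encoded key>:" > /sys/class/nvme/nvme0/dhchap_secret
 *
 * A successful write swaps in the new host key and queues dhchap_auth_work
 * to re-authenticate the connection.
 */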
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
|
|
|
|
if (!opts->dhchap_ctrl_secret)
|
|
|
|
return sysfs_emit(buf, "none\n");
|
|
|
|
return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
char *dhchap_secret;
|
|
|
|
|
|
|
|
if (!ctrl->opts->dhchap_ctrl_secret)
|
|
|
|
return -EINVAL;
|
|
|
|
if (count < 7)
|
|
|
|
return -EINVAL;
|
|
|
|
if (memcmp(buf, "DHHC-1:", 7))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
|
|
|
|
if (!dhchap_secret)
|
|
|
|
return -ENOMEM;
|
|
|
|
memcpy(dhchap_secret, buf, count);
|
|
|
|
nvme_auth_stop(ctrl);
|
|
|
|
if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
|
2022-11-13 19:24:11 +08:00
|
|
|
struct nvme_dhchap_key *key, *ctrl_key;
|
2022-06-27 17:52:02 +08:00
|
|
|
int ret;
|
|
|
|
|
2022-11-13 19:24:11 +08:00
|
|
|
ret = nvme_auth_generate_key(dhchap_secret, &key);
|
2022-06-27 17:52:02 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
kfree(opts->dhchap_ctrl_secret);
|
|
|
|
opts->dhchap_ctrl_secret = dhchap_secret;
|
2022-11-13 19:24:11 +08:00
|
|
|
ctrl_key = ctrl->ctrl_key;
|
2022-11-13 19:24:20 +08:00
|
|
|
mutex_lock(&ctrl->dhchap_auth_mutex);
|
2022-11-13 19:24:11 +08:00
|
|
|
ctrl->ctrl_key = key;
|
2022-11-13 19:24:20 +08:00
|
|
|
mutex_unlock(&ctrl->dhchap_auth_mutex);
|
2022-11-13 19:24:11 +08:00
|
|
|
nvme_auth_free_key(ctrl_key);
|
2022-06-27 17:52:02 +08:00
|
|
|
}
|
|
|
|
/* Start re-authentication */
|
|
|
|
dev_info(ctrl->device, "re-authenticating controller\n");
|
|
|
|
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
|
|
|
|
#endif
|
|
|
|
|
2016-01-13 06:09:31 +08:00
|
|
|
static struct attribute *nvme_dev_attrs[] = {
|
|
|
|
&dev_attr_reset_controller.attr,
|
2016-04-30 05:45:18 +08:00
|
|
|
&dev_attr_rescan_controller.attr,
|
2016-01-13 06:09:31 +08:00
|
|
|
&dev_attr_model.attr,
|
|
|
|
&dev_attr_serial.attr,
|
|
|
|
&dev_attr_firmware_rev.attr,
|
2016-02-27 05:24:19 +08:00
|
|
|
&dev_attr_cntlid.attr,
|
2016-06-13 22:45:24 +08:00
|
|
|
&dev_attr_delete_controller.attr,
|
|
|
|
&dev_attr_transport.attr,
|
|
|
|
&dev_attr_subsysnqn.attr,
|
|
|
|
&dev_attr_address.attr,
|
2016-11-28 07:47:40 +08:00
|
|
|
&dev_attr_state.attr,
|
2018-11-16 16:22:29 +08:00
|
|
|
&dev_attr_numa_node.attr,
|
2019-09-25 05:22:08 +08:00
|
|
|
&dev_attr_queue_count.attr,
|
|
|
|
&dev_attr_sqsize.attr,
|
2020-02-08 09:13:53 +08:00
|
|
|
&dev_attr_hostnqn.attr,
|
2020-02-08 09:13:54 +08:00
|
|
|
&dev_attr_hostid.attr,
|
2020-07-05 15:57:55 +08:00
|
|
|
&dev_attr_ctrl_loss_tmo.attr,
|
|
|
|
&dev_attr_reconnect_delay.attr,
|
2021-04-01 17:54:12 +08:00
|
|
|
&dev_attr_fast_io_fail_tmo.attr,
|
2021-04-16 19:46:21 +08:00
|
|
|
&dev_attr_kato.attr,
|
2022-02-09 03:33:46 +08:00
|
|
|
&dev_attr_cntrltype.attr,
|
|
|
|
&dev_attr_dctype.attr,
|
2022-06-27 17:52:02 +08:00
|
|
|
#ifdef CONFIG_NVME_AUTH
|
|
|
|
&dev_attr_dhchap_secret.attr,
|
|
|
|
&dev_attr_dhchap_ctrl_secret.attr,
|
|
|
|
#endif
|
2016-01-13 06:09:31 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
|
|
|
|
struct attribute *a, int n)
|
|
|
|
{
|
|
|
|
struct device *dev = container_of(kobj, struct device, kobj);
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2017-06-26 18:39:03 +08:00
|
|
|
if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
|
|
|
|
return 0;
|
|
|
|
if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
|
|
|
|
return 0;
|
2020-02-08 09:13:53 +08:00
|
|
|
if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2020-02-08 09:13:54 +08:00
|
|
|
if (a == &dev_attr_hostid.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2020-08-25 06:47:25 +08:00
|
|
|
if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
|
|
|
|
return 0;
|
|
|
|
if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2021-04-14 16:46:45 +08:00
|
|
|
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2022-06-27 17:52:02 +08:00
|
|
|
#ifdef CONFIG_NVME_AUTH
|
|
|
|
if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
|
|
|
|
return 0;
|
|
|
|
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
|
|
|
|
return 0;
|
|
|
|
#endif
|
2016-06-13 22:45:24 +08:00
|
|
|
|
|
|
|
return a->mode;
|
|
|
|
}
|
|
|
|
|
2022-10-27 17:34:13 +08:00
|
|
|
const struct attribute_group nvme_dev_attrs_group = {
|
2016-06-13 22:45:24 +08:00
|
|
|
.attrs = nvme_dev_attrs,
|
|
|
|
.is_visible = nvme_dev_attrs_are_visible,
|
2016-01-13 06:09:31 +08:00
|
|
|
};
|
2022-10-27 17:34:13 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
|
2016-01-13 06:09:31 +08:00
|
|
|
|
|
|
|
static const struct attribute_group *nvme_dev_attr_groups[] = {
|
|
|
|
&nvme_dev_attrs_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2022-03-14 19:05:45 +08:00
|
|
|
static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
|
2017-11-09 20:50:43 +08:00
|
|
|
unsigned nsid)
|
|
|
|
{
|
|
|
|
struct nvme_ns_head *h;
|
|
|
|
|
2022-03-14 19:05:45 +08:00
|
|
|
lockdep_assert_held(&ctrl->subsys->lock);
|
2017-11-09 20:50:43 +08:00
|
|
|
|
2022-03-14 19:05:45 +08:00
|
|
|
list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
|
|
|
|
/*
|
|
|
|
* Private namespaces can share NSIDs under some conditions.
|
|
|
|
* In that case we can't use the same ns_head for namespaces
|
|
|
|
* with the same NSID.
|
|
|
|
*/
|
|
|
|
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
|
2021-09-02 17:20:02 +08:00
|
|
|
continue;
|
|
|
|
if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
|
2017-11-09 20:50:43 +08:00
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-02-24 17:57:15 +08:00
|
|
|
static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
|
|
|
|
struct nvme_ns_ids *ids)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
2022-02-24 18:32:58 +08:00
|
|
|
bool has_uuid = !uuid_is_null(&ids->uuid);
|
|
|
|
bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
|
|
|
|
bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
|
2017-11-09 20:50:43 +08:00
|
|
|
struct nvme_ns_head *h;
|
|
|
|
|
|
|
|
lockdep_assert_held(&subsys->lock);
|
|
|
|
|
|
|
|
list_for_each_entry(h, &subsys->nsheads, entry) {
|
2022-02-24 18:32:58 +08:00
|
|
|
if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
|
|
|
|
return -EINVAL;
|
|
|
|
if (has_nguid &&
|
|
|
|
memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
if (has_eui64 &&
|
|
|
|
memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
|
2017-11-09 20:50:43 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-10-13 23:04:19 +08:00
|
|
|
static void nvme_cdev_rel(struct device *dev)
|
|
|
|
{
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
|
2021-10-13 23:04:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
|
|
|
|
{
|
|
|
|
cdev_device_del(cdev, cdev_device);
|
2021-10-13 23:04:19 +08:00
|
|
|
put_device(cdev_device);
|
2021-04-21 15:45:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
|
|
|
|
const struct file_operations *fops, struct module *owner)
|
|
|
|
{
|
|
|
|
int minor, ret;
|
|
|
|
|
2022-02-14 17:07:27 +08:00
|
|
|
minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
|
2021-04-21 15:45:04 +08:00
|
|
|
if (minor < 0)
|
|
|
|
return minor;
|
|
|
|
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
|
|
|
|
cdev_device->class = nvme_ns_chr_class;
|
2021-10-13 23:04:19 +08:00
|
|
|
cdev_device->release = nvme_cdev_rel;
|
2021-04-21 15:45:04 +08:00
|
|
|
device_initialize(cdev_device);
|
|
|
|
cdev_init(cdev, fops);
|
|
|
|
cdev->owner = owner;
|
|
|
|
ret = cdev_device_add(cdev, cdev_device);
|
2021-10-13 23:04:19 +08:00
|
|
|
if (ret)
|
2021-05-21 15:32:39 +08:00
|
|
|
put_device(cdev_device);
|
2021-10-13 23:04:19 +08:00
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_ns_chr_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_ns_chr_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations nvme_ns_chr_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = nvme_ns_chr_open,
|
|
|
|
.release = nvme_ns_chr_release,
|
|
|
|
.unlocked_ioctl = nvme_ns_chr_ioctl,
|
|
|
|
.compat_ioctl = compat_ptr_ioctl,
|
2022-05-11 13:47:48 +08:00
|
|
|
.uring_cmd = nvme_ns_chr_uring_cmd,
|
2022-08-24 00:14:43 +08:00
|
|
|
.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
|
2021-04-21 15:45:04 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static int nvme_add_ns_cdev(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ns->cdev_device.parent = ns->ctrl->device;
|
|
|
|
ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
|
|
|
|
ns->ctrl->instance, ns->head->instance);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2021-10-13 23:04:19 +08:00
|
|
|
|
|
|
|
return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
|
|
|
|
ns->ctrl->ops->module);
|
2021-04-21 15:45:04 +08:00
|
|
|
}
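/*
 * The resulting character device appears as /dev/ng<ctrl instance>n<head
 * instance>, e.g. /dev/ng0n1 (numbers illustrative), providing a
 * per-namespace node even when no block device is exposed for it.
 */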
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
2022-07-23 00:24:18 +08:00
|
|
|
struct nvme_ns_info *info)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns_head *head;
|
2018-09-11 15:51:29 +08:00
|
|
|
size_t size = sizeof(*head);
|
2017-11-09 20:50:43 +08:00
|
|
|
int ret = -ENOMEM;
|
|
|
|
|
2018-09-11 15:51:29 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
size += num_possible_nodes() * sizeof(struct nvme_ns *);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
head = kzalloc(size, GFP_KERNEL);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (!head)
|
|
|
|
goto out;
|
2022-02-14 17:07:27 +08:00
|
|
|
ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_head;
|
|
|
|
head->instance = ret;
|
|
|
|
INIT_LIST_HEAD(&head->list);
|
2018-04-12 23:16:12 +08:00
|
|
|
ret = init_srcu_struct(&head->srcu);
|
|
|
|
if (ret)
|
|
|
|
goto out_ida_remove;
|
2017-11-09 20:50:43 +08:00
|
|
|
head->subsys = ctrl->subsys;
|
2022-07-23 00:24:18 +08:00
|
|
|
head->ns_id = info->nsid;
|
|
|
|
head->ids = info->ids;
|
|
|
|
head->shared = info->is_shared;
|
2017-11-09 20:50:43 +08:00
|
|
|
kref_init(&head->ref);
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
if (head->ids.csi) {
|
|
|
|
ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
|
|
|
|
if (ret)
|
|
|
|
goto out_cleanup_srcu;
|
|
|
|
} else
|
|
|
|
head->effects = ctrl->effects;
|
|
|
|
|
2017-11-02 19:59:30 +08:00
|
|
|
ret = nvme_mpath_alloc_disk(ctrl, head);
|
|
|
|
if (ret)
|
|
|
|
goto out_cleanup_srcu;
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
list_add_tail(&head->entry, &ctrl->subsys->nsheads);
|
2018-05-04 16:01:57 +08:00
|
|
|
|
|
|
|
kref_get(&ctrl->subsys->ref);
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
return head;
|
|
|
|
out_cleanup_srcu:
|
|
|
|
cleanup_srcu_struct(&head->srcu);
|
2018-04-12 23:16:12 +08:00
|
|
|
out_ida_remove:
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&ctrl->subsys->ns_ida, head->instance);
|
2017-11-09 20:50:43 +08:00
|
|
|
out_free_head:
|
|
|
|
kfree(head);
|
|
|
|
out:
|
2019-08-03 09:16:12 +08:00
|
|
|
if (ret > 0)
|
|
|
|
ret = blk_status_to_errno(nvme_error_status(ret));
|
2017-11-09 20:50:43 +08:00
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2022-02-25 00:48:32 +08:00
|
|
|
static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
|
|
|
|
struct nvme_ns_ids *ids)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *s;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that this check is racy as we try to avoid holding the global
|
|
|
|
* lock over the whole ns_head creation. But it is only intended as
|
|
|
|
* a sanity check anyway.
|
|
|
|
*/
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
list_for_each_entry(s, &nvme_subsystems, entry) {
|
|
|
|
if (s == this)
|
|
|
|
continue;
|
|
|
|
mutex_lock(&s->lock);
|
|
|
|
ret = nvme_subsys_check_duplicate_ids(s, ids);
|
|
|
|
mutex_unlock(&s->lock);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
|
|
|
struct nvme_ns_head *head = NULL;
|
2022-02-25 00:48:32 +08:00
|
|
|
int ret;
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
|
2022-02-25 00:48:32 +08:00
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device,
|
2022-07-23 00:24:18 +08:00
|
|
|
"globally duplicate IDs for nsid %d\n", info->nsid);
|
2022-06-07 23:30:29 +08:00
|
|
|
nvme_print_device_info(ctrl);
|
2022-02-25 00:48:32 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
|
|
|
|
mutex_lock(&ctrl->subsys->lock);
|
2022-07-23 00:24:18 +08:00
|
|
|
head = nvme_find_ns_head(ctrl, info->nsid);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (!head) {
|
2022-07-23 00:24:18 +08:00
|
|
|
ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
|
2022-02-25 00:46:50 +08:00
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device,
|
2022-02-25 00:48:32 +08:00
|
|
|
"duplicate IDs in subsystem for nsid %d\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid);
|
2022-02-25 00:46:50 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
head = nvme_alloc_ns_head(ctrl, info);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (IS_ERR(head)) {
|
|
|
|
ret = PTR_ERR(head);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
} else {
|
2020-04-22 15:59:08 +08:00
|
|
|
ret = -EINVAL;
|
2022-07-23 00:24:18 +08:00
|
|
|
if (!info->is_shared || !head->shared) {
|
2020-04-10 00:09:01 +08:00
|
|
|
dev_err(ctrl->device,
|
2022-07-23 00:24:18 +08:00
|
|
|
"Duplicate unshared namespace %d\n",
|
|
|
|
info->nsid);
|
2020-04-22 15:59:08 +08:00
|
|
|
goto out_put_ns_head;
|
2020-04-10 00:09:01 +08:00
|
|
|
}
|
2022-07-23 00:24:18 +08:00
|
|
|
if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
|
2017-11-09 20:50:43 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"IDs don't match for shared namespace %d\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid);
|
2020-04-22 15:59:08 +08:00
|
|
|
goto out_put_ns_head;
|
2017-11-09 20:50:43 +08:00
|
|
|
}
|
2022-03-15 20:27:07 +08:00
|
|
|
|
|
|
|
if (!multipath && !list_empty(&head->list)) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"Found shared namespace %d, but multipathing not supported.\n",
|
2022-07-23 00:24:18 +08:00
|
|
|
info->nsid);
|
2022-03-15 20:27:07 +08:00
|
|
|
dev_warn_once(ctrl->device,
|
|
|
|
"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
}
|
|
|
|
|
2021-01-28 11:33:51 +08:00
|
|
|
list_add_tail_rcu(&ns->siblings, &head->list);
|
2017-11-09 20:50:43 +08:00
|
|
|
ns->head = head;
|
2020-04-22 15:59:08 +08:00
|
|
|
mutex_unlock(&ctrl->subsys->lock);
|
|
|
|
return 0;
|
2017-11-09 20:50:43 +08:00
|
|
|
|
2020-04-22 15:59:08 +08:00
|
|
|
out_put_ns_head:
|
|
|
|
nvme_put_ns_head(head);
|
2017-11-09 20:50:43 +08:00
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&ctrl->subsys->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-07-25 01:25:16 +08:00
|
|
|
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
2016-07-14 01:45:02 +08:00
|
|
|
struct nvme_ns *ns, *ret = NULL;
|
2015-12-24 22:27:00 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2015-11-28 22:39:07 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list) {
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ns->head->ns_id == nsid) {
|
2021-04-27 14:47:46 +08:00
|
|
|
if (!nvme_get_ns(ns))
|
2017-10-18 19:20:01 +08:00
|
|
|
continue;
|
2016-07-14 01:45:02 +08:00
|
|
|
ret = ns;
|
|
|
|
break;
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ns->head->ns_id > nsid)
|
2015-11-28 22:39:07 +08:00
|
|
|
break;
|
|
|
|
}
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2016-07-14 01:45:02 +08:00
|
|
|
return ret;
|
2015-11-28 22:39:07 +08:00
|
|
|
}
|
2020-07-25 01:25:16 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2021-09-14 14:38:20 +08:00
|
|
|
/*
|
|
|
|
* Add the namespace to the controller list while keeping the list ordered.
|
|
|
|
*/
|
|
|
|
static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
struct nvme_ns *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
|
|
|
|
if (tmp->head->ns_id < ns->head->ns_id) {
|
|
|
|
list_add(&ns->list, &tmp->list);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
list_add(&ns->list, &ns->ctrl->namespaces);
|
|
|
|
}
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
struct gendisk *disk;
|
2021-04-07 18:46:46 +08:00
|
|
|
int node = ctrl->numa_node;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
|
|
|
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
|
|
|
|
if (!ns)
|
2022-07-23 00:24:18 +08:00
|
|
|
return;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2021-08-16 21:19:02 +08:00
|
|
|
disk = blk_mq_alloc_disk(ctrl->tagset, ns);
|
|
|
|
if (IS_ERR(disk))
|
2017-11-09 20:50:43 +08:00
|
|
|
goto out_free_ns;
|
2021-08-16 21:19:02 +08:00
|
|
|
disk->fops = &nvme_bdev_ops;
|
|
|
|
disk->private_data = ns;
|
|
|
|
|
|
|
|
ns->disk = disk;
|
|
|
|
ns->queue = disk->queue;
|
2018-10-05 05:27:44 +08:00
|
|
|
|
2019-07-12 01:04:47 +08:00
|
|
|
if (ctrl->opts && ctrl->opts->data_digest)
|
2020-09-24 14:51:38 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
|
2019-07-04 15:59:18 +08:00
|
|
|
|
2018-03-08 09:10:10 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
|
2022-07-09 00:51:00 +08:00
|
|
|
if (ctrl->ops->supports_pci_p2pdma &&
|
|
|
|
ctrl->ops->supports_pci_p2pdma(ctrl))
|
2018-10-05 05:27:44 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
|
|
|
|
|
2015-11-28 22:39:07 +08:00
|
|
|
ns->ctrl = ctrl;
|
|
|
|
kref_init(&ns->kref);
|
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
if (nvme_init_ns_head(ns, info))
|
2021-08-16 21:19:02 +08:00
|
|
|
goto out_cleanup_disk;
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2021-04-07 18:46:46 +08:00
|
|
|
/*
|
2022-03-15 19:58:06 +08:00
|
|
|
* If multipathing is enabled, the device name for all disks, and not
|
|
|
|
* just those that represent shared namespaces, needs to be based on the
|
|
|
|
* subsystem instance. Using the controller instance for private
|
|
|
|
* namespaces could lead to naming collisions between shared and private
|
|
|
|
* namespaces if they don't use a common numbering scheme.
|
|
|
|
*
|
|
|
|
* If multipathing is not enabled, disk names must use the controller
|
|
|
|
* instance as shared namespaces will show up as multiple block
|
|
|
|
* devices.
|
2021-04-07 18:46:46 +08:00
|
|
|
*/
|
2022-03-15 19:58:06 +08:00
|
|
|
if (ns->head->disk) {
|
|
|
|
sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
|
|
|
|
ctrl->instance, ns->head->instance);
|
|
|
|
disk->flags |= GENHD_FL_HIDDEN;
|
|
|
|
} else if (multipath) {
|
|
|
|
sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
|
|
|
|
ns->head->instance);
|
|
|
|
} else {
|
2021-04-07 18:46:46 +08:00
|
|
|
sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
|
|
|
|
ns->head->instance);
|
2022-03-15 19:58:06 +08:00
|
|
|
}
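/*
 * For illustration (instance numbers are made up): a shared namespace on a
 * multipath-enabled subsystem gets a hidden per-path node such as
 * nvme0c1n2, a private namespace on the same subsystem is named nvme0n2
 * after the subsystem instance, and without multipathing the controller
 * instance is used instead, e.g. nvme1n2.
 */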
|
2016-11-29 05:38:53 +08:00
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
if (nvme_update_ns_info(ns, info))
|
2021-08-16 21:19:02 +08:00
|
|
|
goto out_unlink_ns;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
2021-09-14 14:38:20 +08:00
|
|
|
nvme_ns_add_to_ctrl_list(ns);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2017-10-18 19:25:42 +08:00
|
|
|
nvme_get_ctrl(ctrl);
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2021-08-31 05:25:33 +08:00
|
|
|
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
|
|
|
|
goto out_cleanup_ns_from_list;
|
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
if (!nvme_ns_head_multipath(ns->head))
|
|
|
|
nvme_add_ns_cdev(ns);
|
2017-11-02 19:59:30 +08:00
|
|
|
|
2022-07-23 00:24:18 +08:00
|
|
|
nvme_mpath_add_disk(ns, info->anagrpid);
|
2019-06-20 14:49:02 +08:00
|
|
|
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
|
2018-05-14 14:48:54 +08:00
|
|
|
|
2019-11-28 01:17:43 +08:00
|
|
|
return;
|
2021-08-16 21:19:02 +08:00
|
|
|
|
2021-08-31 05:25:33 +08:00
|
|
|
out_cleanup_ns_from_list:
|
|
|
|
nvme_put_ctrl(ctrl);
|
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
|
|
|
list_del_init(&ns->list);
|
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2017-11-09 20:50:43 +08:00
|
|
|
out_unlink_ns:
|
|
|
|
mutex_lock(&ctrl->subsys->lock);
|
|
|
|
list_del_rcu(&ns->siblings);
|
2020-04-10 00:08:59 +08:00
|
|
|
if (list_empty(&ns->head->list))
|
|
|
|
list_del_init(&ns->head->entry);
|
2017-11-09 20:50:43 +08:00
|
|
|
mutex_unlock(&ctrl->subsys->lock);
|
2019-03-14 01:54:57 +08:00
|
|
|
nvme_put_ns_head(ns->head);
|
2021-08-16 21:19:02 +08:00
|
|
|
out_cleanup_disk:
|
2022-06-19 14:05:52 +08:00
|
|
|
put_disk(disk);
|
2015-11-28 22:39:07 +08:00
|
|
|
out_free_ns:
|
|
|
|
kfree(ns);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_ns_remove(struct nvme_ns *ns)
|
|
|
|
{
|
2021-07-16 19:30:35 +08:00
|
|
|
bool last_path = false;
|
|
|
|
|
2016-02-25 00:15:54 +08:00
|
|
|
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
|
|
|
|
return;
|
2015-12-24 22:27:00 +08:00
|
|
|
|
2021-08-24 22:57:42 +08:00
|
|
|
clear_bit(NVME_NS_READY, &ns->flags);
|
2020-09-28 19:59:06 +08:00
|
|
|
set_capacity(ns->disk, 0);
|
2019-06-20 14:49:02 +08:00
|
|
|
nvme_fault_inject_fini(&ns->fault_inject);
|
2019-06-20 14:48:10 +08:00
|
|
|
|
2022-03-22 06:43:04 +08:00
|
|
|
/*
|
|
|
|
* Ensure that !NVME_NS_READY is seen by other threads to prevent
|
|
|
|
* this ns going back into current_path.
|
|
|
|
*/
|
|
|
|
synchronize_srcu(&ns->head->srcu);
|
|
|
|
|
|
|
|
/* wait for concurrent submissions */
|
|
|
|
if (nvme_mpath_clear_current_path(ns))
|
|
|
|
synchronize_srcu(&ns->head->srcu);
|
|
|
|
|
2019-06-20 14:48:10 +08:00
|
|
|
mutex_lock(&ns->ctrl->subsys->lock);
|
|
|
|
list_del_rcu(&ns->siblings);
|
2021-09-02 17:20:02 +08:00
|
|
|
if (list_empty(&ns->head->list)) {
|
|
|
|
list_del_init(&ns->head->entry);
|
|
|
|
last_path = true;
|
|
|
|
}
|
2019-06-20 14:48:10 +08:00
|
|
|
mutex_unlock(&ns->ctrl->subsys->lock);
|
2020-04-10 00:08:59 +08:00
|
|
|
|
2021-09-01 17:25:24 +08:00
|
|
|
/* guarantee not available in head->list */
|
2022-11-19 07:27:56 +08:00
|
|
|
synchronize_srcu(&ns->head->srcu);
|
2021-09-01 17:25:24 +08:00
|
|
|
|
2021-08-09 14:40:23 +08:00
|
|
|
if (!nvme_ns_head_multipath(ns->head))
|
|
|
|
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
|
|
|
|
del_gendisk(ns->disk);
|
2016-07-14 01:45:02 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ns->ctrl->namespaces_rwsem);
|
2015-11-28 22:39:07 +08:00
|
|
|
list_del_init(&ns->list);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ns->ctrl->namespaces_rwsem);
|
2016-07-14 01:45:02 +08:00
|
|
|
|
2021-07-16 19:30:35 +08:00
|
|
|
if (last_path)
|
|
|
|
nvme_mpath_shutdown_disk(ns->head);
|
2015-11-28 22:39:07 +08:00
|
|
|
nvme_put_ns(ns);
|
|
|
|
}

static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
{
	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);

	if (ns) {
		nvme_ns_remove(ns);
		nvme_put_ns(ns);
	}
}

static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;

	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
		dev_err(ns->ctrl->device,
			"identifiers changed for nsid %d\n", ns->head->ns_id);
		goto out;
	}

	ret = nvme_update_ns_info(ns, info);
out:
	/*
	 * Only remove the namespace if we got a fatal error back from the
	 * device, otherwise ignore the error and just move on.
	 *
	 * TODO: we should probably schedule a delayed retry here.
	 */
	if (ret > 0 && (ret & NVME_SC_DNR))
		nvme_ns_remove(ns);
}
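
/*
 * Note on the check above: nvme_update_ns_info() propagates either a
 * negative errno (e.g. an allocation or transport failure) or a positive
 * NVMe status code, so "ret > 0 && (ret & NVME_SC_DNR)" removes the
 * namespace only when the controller itself reported a failure marked
 * Do Not Retry; transient errors leave the namespace in place so a later
 * rescan can revalidate it.
 */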

static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns_info info = { .nsid = nsid };
	struct nvme_ns *ns;
	int ret;

	if (nvme_identify_ns_descs(ctrl, &info))
		return;

	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
		dev_warn(ctrl->device,
			"command set not reported for nsid: %d\n", nsid);
		return;
	}

	/*
	 * If available, try to use the Command Set Independent Identify
	 * Namespace data structure to find all the generic information that is
	 * needed to set up a namespace.  If not, fall back to the legacy
	 * version.
	 */
	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
	    (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
		ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
	else
		ret = nvme_ns_info_from_identify(ctrl, &info);

	if (info.is_removed)
		nvme_ns_remove_by_nsid(ctrl, nsid);

	/*
	 * Ignore the namespace if it is not ready. We will get an AEN once it
	 * becomes ready and restart the scan.
	 */
	if (ret || !info.is_ready)
		return;

	ns = nvme_find_get_ns(ctrl, nsid);
	if (ns) {
		nvme_validate_ns(ns, &info);
		nvme_put_ns(ns);
	} else {
		nvme_alloc_ns(ctrl, &info);
	}
}

static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					unsigned nsid)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(rm_list);

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->head->ns_id > nsid)
			list_move_tail(&ns->list, &rm_list);
	}
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &rm_list, list)
		nvme_ns_remove(ns);
}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
	__le32 *ns_list;
	u32 prev = 0;
	int ret = 0, i;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (;;) {
		struct nvme_command cmd = {
			.identify.opcode	= nvme_admin_identify,
			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
			.identify.nsid		= cpu_to_le32(prev),
		};

		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
					    NVME_IDENTIFY_DATA_SIZE);
		if (ret) {
			dev_warn(ctrl->device,
				"Identify NS List failed (status=0x%x)\n", ret);
			goto free;
		}

		for (i = 0; i < nr_entries; i++) {
			u32 nsid = le32_to_cpu(ns_list[i]);

			if (!nsid)	/* end of the list? */
				goto out;
			nvme_scan_ns(ctrl, nsid);
			while (++prev < nsid)
				nvme_ns_remove_by_nsid(ctrl, prev);
		}
	}
 out:
	nvme_remove_invalid_namespaces(ctrl, prev);
 free:
	kfree(ns_list);
	return ret;
}

static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u32 nn, i;

	if (nvme_identify_ctrl(ctrl, &id))
		return;
	nn = le32_to_cpu(id->nn);
	kfree(id);

	for (i = 1; i <= nn; i++)
		nvme_scan_ns(ctrl, i);

	nvme_remove_invalid_namespaces(ctrl, nn);
}

static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
{
	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
	__le32 *log;
	int error;

	log = kzalloc(log_size, GFP_KERNEL);
	if (!log)
		return;

	/*
	 * We need to read the log to clear the AEN, but we don't want to rely
	 * on it for the changed namespace information as userspace could have
	 * raced with us in reading the log page, which could cause us to miss
	 * updates.
	 */
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
			NVME_CSI_NVM, log, log_size, 0);
	if (error)
		dev_warn(ctrl->device,
			"reading changed ns log failed: %d\n", error);

	kfree(log);
}

static void nvme_scan_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, scan_work);
	int ret;

	/* No tagset on a live ctrl means IO queues could not be created */
	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
		return;

	/*
	 * Identify controller limits can change at controller reset due to
	 * new firmware download; even though it is not common we cannot ignore
	 * such a scenario. The controller's non-mdts limits are reported in
	 * units of logical blocks, which depend on the format of the attached
	 * namespace. Hence re-read the limits at the time of ns allocation.
	 */
	ret = nvme_init_non_mdts_limits(ctrl);
	if (ret < 0) {
		dev_warn(ctrl->device,
			"reading non-mdts-limits failed: %d\n", ret);
		return;
	}

	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
		dev_info(ctrl->device, "rescanning namespaces.\n");
		nvme_clear_changed_ns_log(ctrl);
	}

	mutex_lock(&ctrl->scan_lock);
	if (nvme_ctrl_limited_cns(ctrl)) {
		nvme_scan_ns_sequential(ctrl);
	} else {
		/*
		 * Fall back to sequential scan if DNR is set to handle broken
		 * devices which should support Identify NS List (as per the VS
		 * they report) but don't actually support it.
		 */
		ret = nvme_scan_ns_list(ctrl);
		if (ret > 0 && ret & NVME_SC_DNR)
			nvme_scan_ns_sequential(ctrl);
	}
	mutex_unlock(&ctrl->scan_lock);
}
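
/*
 * Overview of the scan path implemented above, for orientation:
 *
 *   nvme_scan_work()
 *     -> nvme_scan_ns_list()           preferred: Identify Active NS ID list
 *        or nvme_scan_ns_sequential()  fallback for limited/broken devices
 *       -> nvme_scan_ns(ctrl, nsid)
 *         -> nvme_validate_ns()        known nsid: revalidate identifiers
 *            or nvme_alloc_ns()        new nsid: allocate ns and gendisk
 *
 * Namespaces that drop out of the reported list are cleaned up through
 * nvme_ns_remove_by_nsid() and nvme_remove_invalid_namespaces().
 */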

/*
 * This function iterates the namespace list unlocked to allow recovery from
 * controller failure. It is up to the caller to ensure the namespace list is
 * not modified by scan work while this function is executing.
 */
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(ns_list);

	/*
	 * make sure to requeue I/O to all namespaces as these
	 * might result from the scan itself and must complete
	 * for the scan_work to make progress
	 */
	nvme_mpath_clear_ctrl_paths(ctrl);

	/* prevent racing with ns scanning */
	flush_work(&ctrl->scan_work);

	/*
	 * The dead state indicates that the controller was not gracefully
	 * disconnected. In that case, we won't be able to flush any data while
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (ctrl->state == NVME_CTRL_DEAD) {
		nvme_mark_namespaces_dead(ctrl);
		nvme_unquiesce_io_queues(ctrl);
	}

	/* this is a no-op when called from the controller reset handler */
	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);

	down_write(&ctrl->namespaces_rwsem);
	list_splice_init(&ctrl->namespaces, &ns_list);
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);

static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
	const struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
	if (ret)
		return ret;

	if (opts) {
		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
				opts->trsvcid ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
				opts->host_traddr ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
				opts->host_iface ?: "none");
	}
	return ret;
}

static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
{
	char *envp[2] = { envdata, NULL };

	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
}

static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
	char *envp[2] = { NULL, NULL };
	u32 aen_result = ctrl->aen_result;

	ctrl->aen_result = 0;
	if (!aen_result)
		return;

	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
	if (!envp[0])
		return;
	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
	kfree(envp[0]);
}

static void nvme_async_event_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, async_event_work);

	nvme_aen_uevent(ctrl);

	/*
	 * The transport drivers must guarantee AER submission here is safe by
	 * flushing ctrl async_event_work after changing the controller state
	 * from LIVE and before freeing the admin queue.
	 */
	if (ctrl->state == NVME_CTRL_LIVE)
		ctrl->ops->submit_async_event(ctrl);
}

static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
{
	u32 csts;

	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
		return false;

	if (csts == ~0)
		return false;

	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
}

static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
	struct nvme_fw_slot_info_log *log;

	log = kmalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return;

	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
			log, sizeof(*log), 0))
		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
	kfree(log);
}

static void nvme_fw_act_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work,
				struct nvme_ctrl, fw_act_work);
	unsigned long fw_act_timeout;

	if (ctrl->mtfa)
		fw_act_timeout = jiffies +
				msecs_to_jiffies(ctrl->mtfa * 100);
	else
		fw_act_timeout = jiffies +
				msecs_to_jiffies(admin_timeout * 1000);

	nvme_quiesce_io_queues(ctrl);
	while (nvme_ctrl_pp_status(ctrl)) {
		if (time_after(jiffies, fw_act_timeout)) {
			dev_warn(ctrl->device,
				"Fw activation timeout, reset controller\n");
			nvme_try_sched_reset(ctrl);
			return;
		}
		msleep(100);
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
		return;

	nvme_unquiesce_io_queues(ctrl);
	/* read FW slot information to clear the AER */
	nvme_get_fw_slot_info(ctrl);

	queue_work(nvme_wq, &ctrl->async_event_work);
}

static u32 nvme_aer_type(u32 result)
{
	return result & 0x7;
}

static u32 nvme_aer_subtype(u32 result)
{
	return (result & 0xff00) >> 8;
}

static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
	u32 aer_notice_type = nvme_aer_subtype(result);
	bool requeue = true;

	switch (aer_notice_type) {
	case NVME_AER_NOTICE_NS_CHANGED:
		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
		nvme_queue_scan(ctrl);
		break;
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		/*
		 * We are (ab)using the RESETTING state to prevent subsequent
		 * recovery actions from interfering with the controller's
		 * firmware activation.
		 */
		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
			nvme_auth_stop(ctrl);
			requeue = false;
			queue_work(nvme_wq, &ctrl->fw_act_work);
		}
		break;
#ifdef CONFIG_NVME_MULTIPATH
	case NVME_AER_NOTICE_ANA:
		if (!ctrl->ana_log_buf)
			break;
		queue_work(nvme_wq, &ctrl->ana_work);
		break;
#endif
	case NVME_AER_NOTICE_DISC_CHANGED:
		ctrl->aen_result = result;
		break;
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
	return requeue;
}

static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
	dev_warn(ctrl->device, "resetting controller due to AER\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		volatile union nvme_result *res)
{
	u32 result = le32_to_cpu(res->u32);
	u32 aer_type = nvme_aer_type(result);
	u32 aer_subtype = nvme_aer_subtype(result);
	bool requeue = true;

	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
		return;

	trace_nvme_async_event(ctrl, result);
	switch (aer_type) {
	case NVME_AER_NOTICE:
		requeue = nvme_handle_aen_notice(ctrl, result);
		break;
	case NVME_AER_ERROR:
		/*
		 * For a persistent internal error, don't run async_event_work
		 * to submit a new AER. The controller reset will do it.
		 */
		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
			nvme_handle_aer_persistent_error(ctrl);
			return;
		}
		fallthrough;
	case NVME_AER_SMART:
	case NVME_AER_CSS:
	case NVME_AER_VS:
		ctrl->aen_result = result;
		break;
	default:
		break;
	}

	if (requeue)
		queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
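
/*
 * The masks in nvme_aer_type()/nvme_aer_subtype() above follow the
 * Asynchronous Event Request completion dword layout from the NVMe
 * specification:
 *
 *   bits  2:0   Asynchronous Event Type (e.g. NVME_AER_NOTICE)
 *   bits 15:8   Asynchronous Event Information (the notice subtype)
 *   bits 23:16  Log Page Identifier
 *
 * As a worked example, a result of 0x00040002 decodes to type 0x2 (Notice),
 * information 0x00 (Namespace Attribute Changed) and log page 0x04 (Changed
 * Namespace List), which takes the NVME_AER_NOTICE_NS_CHANGED branch above.
 */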

int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int cmd_size)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
	if (ctrl->ops->flags & NVME_F_FABRICS)
		set->reserved_tags = NVMF_RESERVED_TAGS;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_NO_SCHED;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = 1;
	set->timeout = NVME_ADMIN_TIMEOUT;
	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	ctrl->admin_q = blk_mq_init_queue(set);
	if (IS_ERR(ctrl->admin_q)) {
		ret = PTR_ERR(ctrl->admin_q);
		goto out_free_tagset;
	}

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->fabrics_q = blk_mq_init_queue(set);
		if (IS_ERR(ctrl->fabrics_q)) {
			ret = PTR_ERR(ctrl->fabrics_q);
			goto out_cleanup_admin_q;
		}
	}

	ctrl->admin_tagset = set;
	return 0;

out_cleanup_admin_q:
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
out_free_tagset:
	blk_mq_free_tag_set(set);
	ctrl->admin_q = NULL;
	ctrl->fabrics_q = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);

void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->fabrics_q);
		blk_put_queue(ctrl->fabrics_q);
	}
	blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);

int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int nr_maps,
		unsigned int cmd_size)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
	/*
	 * Some Apple controllers require tags to be unique across admin and
	 * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
	 */
	if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
		set->reserved_tags = NVME_AQ_DEPTH;
	else if (ctrl->ops->flags & NVME_F_FABRICS)
		set->reserved_tags = NVMF_RESERVED_TAGS;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = ctrl->queue_count - 1;
	set->timeout = NVME_IO_TIMEOUT;
	set->nr_maps = nr_maps;
	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->connect_q = blk_mq_init_queue(set);
		if (IS_ERR(ctrl->connect_q)) {
			ret = PTR_ERR(ctrl->connect_q);
			goto out_free_tag_set;
		}
		blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
				   ctrl->connect_q);
	}

	ctrl->tagset = set;
	return 0;

out_free_tag_set:
	blk_mq_free_tag_set(set);
	ctrl->connect_q = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
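
/*
 * Illustrative usage sketch (hypothetical transport code, not taken from an
 * in-tree driver): a fabrics transport typically pairs the tag-set helpers as
 *
 *	ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
 *				       &xport_admin_mq_ops,
 *				       sizeof(struct xport_request));
 *	...
 *	ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
 *				    &xport_mq_ops, nr_maps,
 *				    sizeof(struct xport_request));
 *
 * and tears them down in reverse order with nvme_remove_io_tag_set() and
 * nvme_remove_admin_tag_set().  The xport_* names and request size are
 * placeholders; real transports supply their own blk_mq_ops and per-request
 * data.
 */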

void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->connect_q);
		blk_put_queue(ctrl->connect_q);
	}
	blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);

void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_mpath_stop(ctrl);
	nvme_auth_stop(ctrl);
	nvme_stop_keep_alive(ctrl);
	nvme_stop_failfast_work(ctrl);
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fw_act_work);
	if (ctrl->ops->stop_ctrl)
		ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);

void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_start_keep_alive(ctrl);

	nvme_enable_aen(ctrl);

	/*
	 * Persistent discovery controllers need to send an indication to
	 * userspace to re-read the discovery log page and learn about possible
	 * changes that were missed. We identify persistent discovery
	 * controllers by checking that they started once before, hence they
	 * are reconnecting back.
	 */
	if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
	    nvme_discovery_ctrl(ctrl))
		nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");

	if (ctrl->queue_count > 1) {
		nvme_queue_scan(ctrl);
		nvme_unquiesce_io_queues(ctrl);
		nvme_mpath_update(ctrl);
	}

	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
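
/*
 * Rough lifecycle sketch: a transport normally calls nvme_init_ctrl() from
 * its probe path, sets up the admin queue and I/O queues, and then calls
 * nvme_start_ctrl() once the controller reaches the LIVE state; teardown
 * runs the other way via nvme_stop_ctrl(), nvme_remove_namespaces() and
 * finally nvme_uninit_ctrl().  The exact ordering and error handling vary
 * per transport.
 */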

void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_hwmon_exit(ctrl);
	nvme_fault_inject_fini(&ctrl->fault_inject);
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
	nvme_put_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);

static void nvme_free_cels(struct nvme_ctrl *ctrl)
{
	struct nvme_effects_log *cel;
	unsigned long i;

	xa_for_each(&ctrl->cels, i, cel) {
		xa_erase(&ctrl->cels, i);
		kfree(cel);
	}

	xa_destroy(&ctrl->cels);
}

static void nvme_free_ctrl(struct device *dev)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvme_subsystem *subsys = ctrl->subsys;

	if (!subsys || ctrl->instance != subsys->instance)
		ida_free(&nvme_instance_ida, ctrl->instance);

	nvme_free_cels(ctrl);
	nvme_mpath_uninit(ctrl);
	nvme_auth_stop(ctrl);
	nvme_auth_free(ctrl);
	__free_page(ctrl->discard_page);
	free_opal_dev(ctrl->opal_dev);

	if (subsys) {
		mutex_lock(&nvme_subsystems_lock);
		list_del(&ctrl->subsys_entry);
		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
		mutex_unlock(&nvme_subsystems_lock);
	}

	ctrl->ops->free_ctrl(ctrl);

	if (subsys)
		nvme_put_subsystem(subsys);
}

/*
 * Initialize an NVMe controller structure.  This needs to be called during
 * the earliest initialization so that we have the initialized structure
 * around during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	init_rwsem(&ctrl->namespaces_rwsem);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	ctrl->numa_node = NUMA_NO_NODE;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
			PAGE_SIZE);
	ctrl->discard_page = alloc_page(GFP_KERNEL);
	if (!ctrl->discard_page) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
	if (ret < 0)
		goto out;
	ctrl->instance = ret;

	device_initialize(&ctrl->ctrl_device);
	ctrl->device = &ctrl->ctrl_device;
	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
			ctrl->instance);
	ctrl->device->class = nvme_class;
	ctrl->device->parent = ctrl->dev;
	if (ops->dev_attr_groups)
		ctrl->device->groups = ops->dev_attr_groups;
	else
		ctrl->device->groups = nvme_dev_attr_groups;
	ctrl->device->release = nvme_free_ctrl;
	dev_set_drvdata(ctrl->device, ctrl);
	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
	if (ret)
		goto out_release_instance;

	nvme_get_ctrl(ctrl);
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;

	/*
	 * Initialize latency tolerance controls.  The sysfs files won't
	 * be visible to userspace unless the device actually supports APST.
	 */
	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
		min(default_ps_max_latency_us, (unsigned long)S32_MAX));

	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
	nvme_mpath_init_ctrl(ctrl);
	ret = nvme_auth_init_ctrl(ctrl);
	if (ret)
		goto out_free_cdev;

	return 0;
out_free_cdev:
	cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
	nvme_put_ctrl(ctrl);
	kfree_const(ctrl->device->kobj.name);
out_release_instance:
	ida_free(&nvme_instance_ida, ctrl->instance);
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
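
/*
 * Note on reference counting: nvme_init_ctrl() takes an extra reference via
 * nvme_get_ctrl(), and ctrl->device->release is wired to nvme_free_ctrl()
 * above, so the controller is only torn down once the last nvme_put_ctrl()
 * drops the device reference count to zero.
 */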

/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mark_disk_dead(ns->disk);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unfreeze_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);

int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
		if (timeout <= 0)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);

void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_freeze_queue_wait(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);

void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_freeze_queue_start(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
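
/*
 * The freeze helpers above are intended to be used as a sequence:
 * nvme_start_freeze() marks every namespace queue as freezing,
 * nvme_wait_freeze() or nvme_wait_freeze_timeout() waits for outstanding
 * requests to drain, and nvme_unfreeze() re-enables the queues once the
 * controller has been reconfigured.
 */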

void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
	if (!ctrl->tagset)
		return;
	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_quiesce_tagset(ctrl->tagset);
	else
		blk_mq_wait_quiesce_done(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);

void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
	if (!ctrl->tagset)
		return;
	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_unquiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);

void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
{
	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
		blk_mq_quiesce_queue(ctrl->admin_q);
	else
		blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
}
EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);

void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
{
	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
		blk_mq_unquiesce_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
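
/*
 * Quiescing differs from freezing above: quiesce only stops blk-mq from
 * dispatching new requests to the driver (requests already dispatched may
 * still complete), while freeze additionally waits for all outstanding
 * requests to finish.  The NVME_CTRL_STOPPED and NVME_CTRL_ADMIN_Q_STOPPED
 * flags keep the quiesce/unquiesce calls balanced across callers.
 */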

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_sync_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);

void nvme_sync_queues(struct nvme_ctrl *ctrl)
{
	nvme_sync_io_queues(ctrl);
	if (ctrl->admin_q)
		blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);

struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
{
	if (file->f_op != &nvme_dev_fops)
		return NULL;
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);

/*
 * Check we didn't inadvertently grow the command structure sizes:
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
			NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
}

static int __init nvme_core_init(void)
{
	int result = -ENOMEM;

	_nvme_check_size();

	nvme_wq = alloc_workqueue("nvme-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_wq)
		goto out;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
			NVME_MINORS, "nvme");
	if (result < 0)
		goto destroy_delete_wq;

	nvme_class = class_create("nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}
	nvme_class->dev_uevent = nvme_class_uevent;

	nvme_subsys_class = class_create("nvme-subsystem");
	if (IS_ERR(nvme_subsys_class)) {
		result = PTR_ERR(nvme_subsys_class);
		goto destroy_class;
	}

	result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
				     "nvme-generic");
	if (result < 0)
		goto destroy_subsys_class;

	nvme_ns_chr_class = class_create("nvme-generic");
	if (IS_ERR(nvme_ns_chr_class)) {
		result = PTR_ERR(nvme_ns_chr_class);
		goto unregister_generic_ns;
	}

	result = nvme_init_auth();
	if (result)
		goto destroy_ns_chr;
	return 0;

destroy_ns_chr:
	class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
	class_destroy(nvme_subsys_class);
destroy_class:
	class_destroy(nvme_class);
unregister_chrdev:
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
	destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
out:
	return result;
}

static void __exit nvme_core_exit(void)
{
	nvme_exit_auth();
	class_destroy(nvme_ns_chr_class);
	class_destroy(nvme_subsys_class);
	class_destroy(nvme_class);
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);
	ida_destroy(&nvme_ns_chr_minor_ida);
	ida_destroy(&nvme_instance_ida);
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);