2019-02-18 16:36:29 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2015-11-26 16:08:36 +08:00
|
|
|
/*
|
|
|
|
* NVM Express device driver
|
|
|
|
* Copyright (c) 2011-2014, Intel Corporation.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/blk-mq.h>
|
2021-09-20 20:33:27 +08:00
|
|
|
#include <linux/blk-integrity.h>
|
2020-03-28 13:09:09 +08:00
|
|
|
#include <linux/compat.h>
|
2015-11-28 22:03:49 +08:00
|
|
|
#include <linux/delay.h>
|
2015-11-26 16:08:36 +08:00
|
|
|
#include <linux/errno.h>
|
2015-11-26 17:54:19 +08:00
|
|
|
#include <linux/hdreg.h>
|
2015-11-26 16:08:36 +08:00
|
|
|
#include <linux/kernel.h>
|
2015-11-28 22:39:07 +08:00
|
|
|
#include <linux/module.h>
|
2019-07-04 15:59:18 +08:00
|
|
|
#include <linux/backing-dev.h>
|
2015-11-26 16:08:36 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/types.h>
|
2015-11-26 17:54:19 +08:00
|
|
|
#include <linux/pr.h>
|
|
|
|
#include <linux/ptrace.h>
|
|
|
|
#include <linux/nvme_ioctl.h>
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
#include <linux/pm_qos.h>
|
2015-11-26 17:54:19 +08:00
|
|
|
#include <asm/unaligned.h>
|
2015-11-26 16:08:36 +08:00
|
|
|
|
|
|
|
#include "nvme.h"
|
2016-06-13 22:45:28 +08:00
|
|
|
#include "fabrics.h"
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-07-24 21:47:55 +08:00
|
|
|
#define CREATE_TRACE_POINTS
|
|
|
|
#include "trace.h"
|
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
#define NVME_MINORS (1U << MINORBITS)
|
|
|
|
|
2017-09-07 08:23:56 +08:00
|
|
|
unsigned int admin_timeout = 60;
|
|
|
|
module_param(admin_timeout, uint, 0644);
|
2016-02-11 02:03:30 +08:00
|
|
|
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(admin_timeout);
|
2016-02-11 02:03:30 +08:00
|
|
|
|
2017-09-07 08:23:56 +08:00
|
|
|
unsigned int nvme_io_timeout = 30;
|
|
|
|
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
|
2016-02-11 02:03:30 +08:00
|
|
|
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_io_timeout);
|
2016-02-11 02:03:30 +08:00
|
|
|
|
2017-06-13 00:30:51 +08:00
|
|
|
static unsigned char shutdown_timeout = 5;
|
2016-02-11 02:03:30 +08:00
|
|
|
module_param(shutdown_timeout, byte, 0644);
|
|
|
|
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
|
|
|
|
|
2017-04-06 01:18:11 +08:00
|
|
|
static u8 nvme_max_retries = 5;
|
|
|
|
module_param_named(max_retries, nvme_max_retries, byte, 0644);
|
2016-07-13 07:20:31 +08:00
|
|
|
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2017-06-07 15:25:43 +08:00
|
|
|
static unsigned long default_ps_max_latency_us = 100000;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
module_param(default_ps_max_latency_us, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(default_ps_max_latency_us,
|
|
|
|
"max power saving latency for new devices; use PM QOS to change per device");
|
|
|
|
|
2017-04-22 07:19:24 +08:00
|
|
|
static bool force_apst;
|
|
|
|
module_param(force_apst, bool, 0644);
|
|
|
|
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
|
|
|
|
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows to revert to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
static unsigned long apst_primary_timeout_ms = 100;
|
|
|
|
module_param(apst_primary_timeout_ms, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(apst_primary_timeout_ms,
|
|
|
|
"primary APST timeout in ms");
|
|
|
|
|
|
|
|
static unsigned long apst_secondary_timeout_ms = 2000;
|
|
|
|
module_param(apst_secondary_timeout_ms, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(apst_secondary_timeout_ms,
|
|
|
|
"secondary APST timeout in ms");
|
|
|
|
|
|
|
|
static unsigned long apst_primary_latency_tol_us = 15000;
|
|
|
|
module_param(apst_primary_latency_tol_us, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(apst_primary_latency_tol_us,
|
|
|
|
"primary APST latency tolerance in us");
|
|
|
|
|
|
|
|
static unsigned long apst_secondary_latency_tol_us = 100000;
|
|
|
|
module_param(apst_secondary_latency_tol_us, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(apst_secondary_latency_tol_us,
|
|
|
|
"secondary APST latency tolerance in us");
|
|
|
|
|
2017-06-28 02:03:06 +08:00
|
|
|
static bool streams;
|
|
|
|
module_param(streams, bool, 0644);
|
|
|
|
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
|
|
|
|
|
2018-01-14 18:39:02 +08:00
|
|
|
/*
|
|
|
|
* nvme_wq - hosts nvme related works that are not reset or delete
|
|
|
|
* nvme_reset_wq - hosts nvme reset works
|
|
|
|
* nvme_delete_wq - hosts nvme delete works
|
|
|
|
*
|
2020-02-11 08:01:45 +08:00
|
|
|
* nvme_wq will host works such as scan, aen handling, fw activation,
|
|
|
|
* keep-alive, periodic reconnects etc. nvme_reset_wq
|
2018-01-14 18:39:02 +08:00
|
|
|
* runs reset works which also flush works hosted on nvme_wq for
|
|
|
|
* serialization purposes. nvme_delete_wq hosts controller deletion
|
|
|
|
* works which flush reset works for serialization.
|
|
|
|
*/
|
2017-06-08 02:31:55 +08:00
|
|
|
struct workqueue_struct *nvme_wq;
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_wq);
|
|
|
|
|
2018-01-14 18:39:02 +08:00
|
|
|
struct workqueue_struct *nvme_reset_wq;
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_reset_wq);
|
|
|
|
|
|
|
|
struct workqueue_struct *nvme_delete_wq;
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_delete_wq);
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
static LIST_HEAD(nvme_subsystems);
|
|
|
|
static DEFINE_MUTEX(nvme_subsystems_lock);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2017-10-18 19:10:01 +08:00
|
|
|
static DEFINE_IDA(nvme_instance_ida);
|
2020-12-01 20:56:08 +08:00
|
|
|
static dev_t nvme_ctrl_base_chr_devt;
|
2015-11-28 22:40:19 +08:00
|
|
|
static struct class *nvme_class;
|
2017-11-09 20:48:55 +08:00
|
|
|
static struct class *nvme_subsys_class;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
static DEFINE_IDA(nvme_ns_chr_minor_ida);
|
|
|
|
static dev_t nvme_ns_chr_devt;
|
|
|
|
static struct class *nvme_ns_chr_class;
|
|
|
|
|
2018-05-04 16:01:57 +08:00
|
|
|
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
|
2018-06-30 03:03:28 +08:00
|
|
|
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
|
|
|
|
unsigned nsid);
|
2021-09-01 16:23:42 +08:00
|
|
|
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_command *cmd);
|
2018-06-30 03:03:28 +08:00
|
|
|
|
2021-04-10 14:42:03 +08:00
|
|
|
void nvme_queue_scan(struct nvme_ctrl *ctrl)
|
2018-05-26 00:15:47 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Only new queue scan work when admin and IO queues are both alive
|
|
|
|
*/
|
2019-09-03 23:22:24 +08:00
|
|
|
if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
|
2018-05-26 00:15:47 +08:00
|
|
|
queue_work(nvme_wq, &ctrl->scan_work);
|
|
|
|
}
|
|
|
|
|
2019-09-07 01:23:08 +08:00
|
|
|
/*
|
|
|
|
* Use this function to proceed with scheduling reset_work for a controller
|
|
|
|
* that had previously been set to the resetting state. This is intended for
|
|
|
|
* code paths that can't be interrupted by other reset attempts. A hot removal
|
|
|
|
* may prevent this from succeeding.
|
|
|
|
*/
|
2019-09-05 00:06:11 +08:00
|
|
|
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
|
2019-09-07 01:23:08 +08:00
|
|
|
{
|
|
|
|
if (ctrl->state != NVME_CTRL_RESETTING)
|
|
|
|
return -EBUSY;
|
|
|
|
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
|
|
|
|
return -EBUSY;
|
|
|
|
return 0;
|
|
|
|
}
|
2019-09-05 00:06:11 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
|
2019-09-07 01:23:08 +08:00
|
|
|
|
2020-11-25 02:34:59 +08:00
|
|
|
static void nvme_failfast_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
|
|
|
|
struct nvme_ctrl, failfast_work);
|
|
|
|
|
|
|
|
if (ctrl->state != NVME_CTRL_CONNECTING)
|
|
|
|
return;
|
|
|
|
|
|
|
|
set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
|
|
|
|
dev_info(ctrl->device, "failfast expired\n");
|
|
|
|
nvme_kick_requeue_lists(ctrl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
schedule_delayed_work(&ctrl->failfast_work,
|
|
|
|
ctrl->opts->fast_io_fail_tmo * HZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (!ctrl->opts)
|
|
|
|
return;
|
|
|
|
|
|
|
|
cancel_delayed_work_sync(&ctrl->failfast_work);
|
|
|
|
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-06-15 21:41:08 +08:00
|
|
|
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
|
|
|
|
return -EBUSY;
|
2018-01-14 18:39:02 +08:00
|
|
|
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
|
2017-06-15 21:41:08 +08:00
|
|
|
return -EBUSY;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
|
|
|
|
|
2021-04-10 14:42:03 +08:00
|
|
|
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
|
2017-06-15 21:41:08 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = nvme_reset_ctrl(ctrl);
|
2018-01-17 19:01:14 +08:00
|
|
|
if (!ret) {
|
2017-06-15 21:41:08 +08:00
|
|
|
flush_work(&ctrl->reset_work);
|
2019-09-03 23:22:24 +08:00
|
|
|
if (ctrl->state != NVME_CTRL_LIVE)
|
2018-01-17 19:01:14 +08:00
|
|
|
ret = -ENETRESET;
|
|
|
|
}
|
|
|
|
|
2017-06-15 21:41:08 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-02-15 06:50:56 +08:00
|
|
|
static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
|
2017-10-29 16:44:29 +08:00
|
|
|
{
|
2018-03-11 23:46:06 +08:00
|
|
|
dev_info(ctrl->device,
|
2021-09-22 14:35:25 +08:00
|
|
|
"Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
|
2018-03-11 23:46:06 +08:00
|
|
|
|
2017-10-29 20:21:02 +08:00
|
|
|
flush_work(&ctrl->reset_work);
|
2017-10-29 16:44:31 +08:00
|
|
|
nvme_stop_ctrl(ctrl);
|
|
|
|
nvme_remove_namespaces(ctrl);
|
2017-10-29 16:44:29 +08:00
|
|
|
ctrl->ops->delete_ctrl(ctrl);
|
2017-10-29 16:44:31 +08:00
|
|
|
nvme_uninit_ctrl(ctrl);
|
2017-10-29 16:44:29 +08:00
|
|
|
}
|
|
|
|
|
2019-02-15 06:50:56 +08:00
|
|
|
static void nvme_delete_ctrl_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(work, struct nvme_ctrl, delete_work);
|
|
|
|
|
|
|
|
nvme_do_delete_ctrl(ctrl);
|
|
|
|
}
|
|
|
|
|
2017-10-29 16:44:29 +08:00
|
|
|
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
|
|
|
|
return -EBUSY;
|
2018-01-14 18:39:02 +08:00
|
|
|
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
|
2017-10-29 16:44:29 +08:00
|
|
|
return -EBUSY;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
|
|
|
|
|
2020-03-24 23:29:39 +08:00
|
|
|
static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
|
2017-10-29 16:44:29 +08:00
|
|
|
{
|
|
|
|
/*
|
2019-03-14 01:54:58 +08:00
|
|
|
* Keep a reference until nvme_do_delete_ctrl() complete,
|
|
|
|
* since ->delete_ctrl can free the controller.
|
2017-10-29 16:44:29 +08:00
|
|
|
*/
|
|
|
|
nvme_get_ctrl(ctrl);
|
2020-03-24 23:29:39 +08:00
|
|
|
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
|
2019-02-15 06:50:57 +08:00
|
|
|
nvme_do_delete_ctrl(ctrl);
|
2017-10-29 16:44:29 +08:00
|
|
|
nvme_put_ctrl(ctrl);
|
|
|
|
}
|
|
|
|
|
2019-08-30 03:53:15 +08:00
|
|
|
/*
 * Translate an NVMe completion status into a block layer status.
 *
 * Only the bits selected by 0x7ff take part in the lookup. Any value
 * without an explicit mapping below is reported as BLK_STS_IOERR.
 */
static blk_status_t nvme_error_status(u16 status)
{
	const u16 code = status & 0x7ff;
	blk_status_t ret;

	switch (code) {
	case NVME_SC_SUCCESS:
		ret = BLK_STS_OK;
		break;
	case NVME_SC_CAP_EXCEEDED:
		ret = BLK_STS_NOSPC;
		break;
	case NVME_SC_LBA_RANGE:
	case NVME_SC_CMD_INTERRUPTED:
	case NVME_SC_NS_NOT_READY:
		ret = BLK_STS_TARGET;
		break;
	case NVME_SC_BAD_ATTRIBUTES:
	case NVME_SC_ONCS_NOT_SUPPORTED:
	case NVME_SC_INVALID_OPCODE:
	case NVME_SC_INVALID_FIELD:
	case NVME_SC_INVALID_NS:
		ret = BLK_STS_NOTSUPP;
		break;
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_UNWRITTEN_BLOCK:
	case NVME_SC_ACCESS_DENIED:
	case NVME_SC_READ_ONLY:
	case NVME_SC_COMPARE_FAILED:
		ret = BLK_STS_MEDIUM;
		break;
	case NVME_SC_GUARD_CHECK:
	case NVME_SC_APPTAG_CHECK:
	case NVME_SC_REFTAG_CHECK:
	case NVME_SC_INVALID_PI:
		ret = BLK_STS_PROTECTION;
		break;
	case NVME_SC_RESERVATION_CONFLICT:
		ret = BLK_STS_NEXUS;
		break;
	case NVME_SC_HOST_PATH_ERROR:
		ret = BLK_STS_TRANSPORT;
		break;
	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
		ret = BLK_STS_ZONE_ACTIVE_RESOURCE;
		break;
	case NVME_SC_ZONE_TOO_MANY_OPEN:
		ret = BLK_STS_ZONE_OPEN_RESOURCE;
		break;
	default:
		ret = BLK_STS_IOERR;
		break;
	}

	return ret;
}
|
|
|
|
|
2018-11-28 00:40:57 +08:00
|
|
|
static void nvme_retry_req(struct request *req)
|
|
|
|
{
|
|
|
|
unsigned long delay = 0;
|
|
|
|
u16 crd;
|
|
|
|
|
|
|
|
/* The mask and shift result must be <= 3 */
|
|
|
|
crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
|
2021-01-08 22:46:57 +08:00
|
|
|
if (crd)
|
|
|
|
delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
|
2018-11-28 00:40:57 +08:00
|
|
|
|
|
|
|
nvme_req(req)->retries++;
|
|
|
|
blk_mq_requeue_request(req, false);
|
|
|
|
blk_mq_delay_kick_requeue_list(req->q, delay);
|
|
|
|
}
|
|
|
|
|
2022-02-03 16:11:53 +08:00
|
|
|
static void nvme_log_error(struct request *req)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns = req->q->queuedata;
|
|
|
|
struct nvme_request *nr = nvme_req(req);
|
|
|
|
|
|
|
|
if (ns) {
|
|
|
|
pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
|
|
|
|
ns->disk ? ns->disk->disk_name : "?",
|
|
|
|
nvme_get_opcode_str(nr->cmd->common.opcode),
|
|
|
|
nr->cmd->common.opcode,
|
|
|
|
(unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
|
|
|
|
(unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
|
|
|
|
nvme_get_error_status_str(nr->status),
|
|
|
|
nr->status >> 8 & 7, /* Status Code Type */
|
|
|
|
nr->status & 0xff, /* Status Code */
|
|
|
|
nr->status & NVME_SC_MORE ? "MORE " : "",
|
|
|
|
nr->status & NVME_SC_DNR ? "DNR " : "");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
|
|
|
|
dev_name(nr->ctrl->device),
|
|
|
|
nvme_get_admin_opcode_str(nr->cmd->common.opcode),
|
|
|
|
nr->cmd->common.opcode,
|
|
|
|
nvme_get_error_status_str(nr->status),
|
|
|
|
nr->status >> 8 & 7, /* Status Code Type */
|
|
|
|
nr->status & 0xff, /* Status Code */
|
|
|
|
nr->status & NVME_SC_MORE ? "MORE " : "",
|
|
|
|
nr->status & NVME_SC_DNR ? "DNR " : "");
|
|
|
|
}
|
|
|
|
|
2020-08-18 15:11:30 +08:00
|
|
|
/* What nvme_complete_rq() should do with a completed request. */
enum nvme_disposition {
	COMPLETE,	/* finish the request via nvme_end_req() */
	RETRY,		/* requeue the request via nvme_retry_req() */
	FAILOVER,	/* hand the request to nvme_failover_req() */
};
|
|
|
|
|
|
|
|
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
|
2017-03-30 19:41:32 +08:00
|
|
|
{
|
2020-08-18 15:11:30 +08:00
|
|
|
if (likely(nvme_req(req)->status == 0))
|
|
|
|
return COMPLETE;
|
2018-01-10 03:04:15 +08:00
|
|
|
|
2020-08-18 15:11:30 +08:00
|
|
|
if (blk_noretry_request(req) ||
|
|
|
|
(nvme_req(req)->status & NVME_SC_DNR) ||
|
|
|
|
nvme_req(req)->retries >= nvme_max_retries)
|
|
|
|
return COMPLETE;
|
2018-01-26 18:21:38 +08:00
|
|
|
|
2020-08-18 15:11:30 +08:00
|
|
|
if (req->cmd_flags & REQ_NVME_MPATH) {
|
2020-08-18 15:11:32 +08:00
|
|
|
if (nvme_is_path_error(nvme_req(req)->status) ||
|
|
|
|
blk_queue_dying(req->q))
|
2020-08-18 15:11:30 +08:00
|
|
|
return FAILOVER;
|
2020-08-18 15:11:32 +08:00
|
|
|
} else {
|
|
|
|
if (blk_queue_dying(req->q))
|
|
|
|
return COMPLETE;
|
2020-08-18 15:11:30 +08:00
|
|
|
}
|
2019-10-14 00:57:36 +08:00
|
|
|
|
2020-08-18 15:11:30 +08:00
|
|
|
return RETRY;
|
|
|
|
}
|
2018-11-03 01:28:15 +08:00
|
|
|
|
2021-10-08 19:59:37 +08:00
|
|
|
static inline void nvme_end_req_zoned(struct request *req)
|
2020-08-18 15:11:30 +08:00
|
|
|
{
|
|
|
|
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
|
|
|
|
req_op(req) == REQ_OP_ZONE_APPEND)
|
2020-06-30 03:06:41 +08:00
|
|
|
req->__sector = nvme_lba_to_sect(req->q->queuedata,
|
|
|
|
le64_to_cpu(nvme_req(req)->result.u64));
|
2021-10-08 19:59:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void nvme_end_req(struct request *req)
|
|
|
|
{
|
|
|
|
blk_status_t status = nvme_error_status(nvme_req(req)->status);
|
2019-07-24 21:47:55 +08:00
|
|
|
|
2022-02-03 16:11:53 +08:00
|
|
|
if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS))
|
|
|
|
nvme_log_error(req);
|
2021-10-08 19:59:37 +08:00
|
|
|
nvme_end_req_zoned(req);
|
2021-01-05 18:34:02 +08:00
|
|
|
nvme_trace_bio_complete(req);
|
2018-01-10 03:04:15 +08:00
|
|
|
blk_mq_end_request(req, status);
|
2017-03-30 19:41:32 +08:00
|
|
|
}
|
2020-08-18 15:11:30 +08:00
|
|
|
|
|
|
|
void nvme_complete_rq(struct request *req)
|
|
|
|
{
|
|
|
|
trace_nvme_complete_rq(req);
|
|
|
|
nvme_cleanup_cmd(req);
|
|
|
|
|
|
|
|
if (nvme_req(req)->ctrl->kas)
|
|
|
|
nvme_req(req)->ctrl->comp_seen = true;
|
|
|
|
|
|
|
|
switch (nvme_decide_disposition(req)) {
|
|
|
|
case COMPLETE:
|
|
|
|
nvme_end_req(req);
|
|
|
|
return;
|
|
|
|
case RETRY:
|
|
|
|
nvme_retry_req(req);
|
|
|
|
return;
|
|
|
|
case FAILOVER:
|
|
|
|
nvme_failover_req(req);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2017-03-30 19:41:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_complete_rq);
|
|
|
|
|
2021-10-08 19:59:37 +08:00
|
|
|
/*
 * Per-request part of a batched completion: clean up the command and apply
 * the zone-append sector fixup. The request itself is not ended here.
 */
void nvme_complete_batch_req(struct request *req)
{
	nvme_cleanup_cmd(req);
	nvme_end_req_zoned(req);
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
|
|
|
|
|
2021-02-04 15:55:11 +08:00
|
|
|
/*
|
|
|
|
* Called to unwind from ->queue_rq on a failed command submission so that the
|
|
|
|
* multipathing code gets called to potentially failover to another path.
|
|
|
|
* The caller needs to unwind all transport specific resource allocations and
|
|
|
|
* must return propagate the return value.
|
|
|
|
*/
|
|
|
|
blk_status_t nvme_host_path_error(struct request *req)
|
|
|
|
{
|
|
|
|
nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
|
|
|
|
blk_mq_set_request_complete(req);
|
|
|
|
nvme_complete_rq(req);
|
|
|
|
return BLK_STS_OK;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_host_path_error);
|
|
|
|
|
2018-11-09 01:24:07 +08:00
|
|
|
/*
 * Cancel one outstanding request.
 *
 * Used as the callback passed to blk_mq_tagset_busy_iter() by
 * nvme_cancel_tagset() and nvme_cancel_admin_tagset().
 *
 * @req:      request to cancel
 * @data:     the owning struct nvme_ctrl, passed as an opaque pointer
 * @reserved: unused
 *
 * A request that has already completed is left alone. Otherwise its status
 * is set to NVME_SC_HOST_ABORTED_CMD, NVME_REQ_CANCELLED is set in its
 * flags and it is completed through blk_mq_complete_request().
 *
 * Always returns true.
 */
bool nvme_cancel_request(struct request *req, void *data, bool reserved)
{
	struct nvme_ctrl *ctrl = data;

	/* Terminate the message with a newline like every other log line. */
	dev_dbg_ratelimited(ctrl->device, "Cancelling I/O %d\n", req->tag);

	/* don't abort one completed request */
	if (blk_mq_request_completed(req))
		return true;

	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
	blk_mq_complete_request(req);

	return true;
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_cancel_request);
|
|
|
|
|
2021-01-21 11:32:36 +08:00
|
|
|
void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (ctrl->tagset) {
|
|
|
|
blk_mq_tagset_busy_iter(ctrl->tagset,
|
|
|
|
nvme_cancel_request, ctrl);
|
|
|
|
blk_mq_tagset_wait_completed_request(ctrl->tagset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
|
|
|
|
|
|
|
|
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (ctrl->admin_tagset) {
|
|
|
|
blk_mq_tagset_busy_iter(ctrl->admin_tagset,
|
|
|
|
nvme_cancel_request, ctrl);
|
|
|
|
blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
|
|
|
|
|
2016-04-26 19:51:57 +08:00
|
|
|
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
|
|
|
enum nvme_ctrl_state new_state)
|
|
|
|
{
|
2016-07-30 03:15:18 +08:00
|
|
|
enum nvme_ctrl_state old_state;
|
2017-08-22 17:42:24 +08:00
|
|
|
unsigned long flags;
|
2016-04-26 19:51:57 +08:00
|
|
|
bool changed = false;
|
|
|
|
|
2017-08-22 17:42:24 +08:00
|
|
|
spin_lock_irqsave(&ctrl->lock, flags);
|
2016-07-30 03:15:18 +08:00
|
|
|
|
|
|
|
old_state = ctrl->state;
|
2016-04-26 19:51:57 +08:00
|
|
|
switch (new_state) {
|
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
switch (old_state) {
|
2016-06-13 22:45:22 +08:00
|
|
|
case NVME_CTRL_NEW:
|
2016-04-26 19:51:57 +08:00
|
|
|
case NVME_CTRL_RESETTING:
|
2018-02-01 00:31:24 +08:00
|
|
|
case NVME_CTRL_CONNECTING:
|
2016-04-26 19:51:57 +08:00
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2016-04-26 19:51:57 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case NVME_CTRL_RESETTING:
|
|
|
|
switch (old_state) {
|
|
|
|
case NVME_CTRL_NEW:
|
2016-07-06 20:55:49 +08:00
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2016-07-06 20:55:49 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2018-02-01 00:31:24 +08:00
|
|
|
case NVME_CTRL_CONNECTING:
|
2016-07-06 20:55:49 +08:00
|
|
|
switch (old_state) {
|
2018-02-01 00:31:25 +08:00
|
|
|
case NVME_CTRL_NEW:
|
2017-10-26 07:43:13 +08:00
|
|
|
case NVME_CTRL_RESETTING:
|
2016-04-26 19:51:57 +08:00
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2016-04-26 19:51:57 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case NVME_CTRL_DELETING:
|
|
|
|
switch (old_state) {
|
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
case NVME_CTRL_RESETTING:
|
2018-02-01 00:31:24 +08:00
|
|
|
case NVME_CTRL_CONNECTING:
|
2016-04-26 19:51:57 +08:00
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2016-04-26 19:51:57 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2020-07-23 07:32:19 +08:00
|
|
|
case NVME_CTRL_DELETING_NOIO:
|
|
|
|
switch (old_state) {
|
|
|
|
case NVME_CTRL_DELETING:
|
|
|
|
case NVME_CTRL_DEAD:
|
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2020-07-23 07:32:19 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2016-05-12 22:37:14 +08:00
|
|
|
case NVME_CTRL_DEAD:
|
|
|
|
switch (old_state) {
|
|
|
|
case NVME_CTRL_DELETING:
|
|
|
|
changed = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2016-05-12 22:37:14 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2016-04-26 19:51:57 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-09-05 00:06:11 +08:00
|
|
|
if (changed) {
|
2016-04-26 19:51:57 +08:00
|
|
|
ctrl->state = new_state;
|
2019-09-05 00:06:11 +08:00
|
|
|
wake_up_all(&ctrl->state_wq);
|
|
|
|
}
|
2016-04-26 19:51:57 +08:00
|
|
|
|
2017-08-22 17:42:24 +08:00
|
|
|
spin_unlock_irqrestore(&ctrl->lock, flags);
|
2020-11-25 02:34:59 +08:00
|
|
|
if (!changed)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (ctrl->state == NVME_CTRL_LIVE) {
|
|
|
|
if (old_state == NVME_CTRL_CONNECTING)
|
|
|
|
nvme_stop_failfast_work(ctrl);
|
2017-11-02 19:59:30 +08:00
|
|
|
nvme_kick_requeue_lists(ctrl);
|
2020-11-25 02:34:59 +08:00
|
|
|
} else if (ctrl->state == NVME_CTRL_CONNECTING &&
|
|
|
|
old_state == NVME_CTRL_RESETTING) {
|
|
|
|
nvme_start_failfast_work(ctrl);
|
|
|
|
}
|
2016-04-26 19:51:57 +08:00
|
|
|
return changed;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
|
|
|
|
|
2019-09-05 00:06:11 +08:00
|
|
|
/*
|
|
|
|
* Returns true for sink states that can't ever transition back to live.
|
|
|
|
*/
|
|
|
|
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
switch (ctrl->state) {
|
|
|
|
case NVME_CTRL_NEW:
|
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
case NVME_CTRL_RESETTING:
|
|
|
|
case NVME_CTRL_CONNECTING:
|
|
|
|
return false;
|
|
|
|
case NVME_CTRL_DELETING:
|
2020-07-23 07:32:19 +08:00
|
|
|
case NVME_CTRL_DELETING_NOIO:
|
2019-09-05 00:06:11 +08:00
|
|
|
case NVME_CTRL_DEAD:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Waits for the controller state to be resetting, or returns false if it is
|
|
|
|
* not possible to ever transition to that state.
|
|
|
|
*/
|
|
|
|
bool nvme_wait_reset(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
wait_event(ctrl->state_wq,
|
|
|
|
nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
|
|
|
|
nvme_state_terminal(ctrl));
|
|
|
|
return ctrl->state == NVME_CTRL_RESETTING;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_wait_reset);
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
static void nvme_free_ns_head(struct kref *ref)
|
|
|
|
{
|
|
|
|
struct nvme_ns_head *head =
|
|
|
|
container_of(ref, struct nvme_ns_head, ref);
|
|
|
|
|
2017-11-02 19:59:30 +08:00
|
|
|
nvme_mpath_remove_disk(head);
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&head->subsys->ns_ida, head->instance);
|
2019-02-14 05:54:37 +08:00
|
|
|
cleanup_srcu_struct(&head->srcu);
|
2018-05-04 16:01:57 +08:00
|
|
|
nvme_put_subsystem(head->subsys);
|
2017-11-09 20:50:43 +08:00
|
|
|
kfree(head);
|
|
|
|
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
bool nvme_tryget_ns_head(struct nvme_ns_head *head)
|
2021-04-07 20:20:40 +08:00
|
|
|
{
|
|
|
|
return kref_get_unless_zero(&head->ref);
|
|
|
|
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
void nvme_put_ns_head(struct nvme_ns_head *head)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
|
|
|
kref_put(&head->ref, nvme_free_ns_head);
|
|
|
|
}
|
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
static void nvme_free_ns(struct kref *kref)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
|
|
|
|
|
|
|
|
put_disk(ns->disk);
|
2017-11-09 20:50:43 +08:00
|
|
|
nvme_put_ns_head(ns->head);
|
2016-02-25 00:15:53 +08:00
|
|
|
nvme_put_ctrl(ns->ctrl);
|
2015-11-26 17:54:19 +08:00
|
|
|
kfree(ns);
|
|
|
|
}
|
|
|
|
|
2021-04-27 14:47:46 +08:00
|
|
|
static inline bool nvme_get_ns(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
return kref_get_unless_zero(&ns->kref);
|
|
|
|
}
|
|
|
|
|
2020-07-25 01:25:16 +08:00
|
|
|
void nvme_put_ns(struct nvme_ns *ns)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
|
|
|
kref_put(&ns->kref, nvme_free_ns);
|
|
|
|
}
|
2020-07-25 01:25:16 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2018-04-12 23:16:15 +08:00
|
|
|
static inline void nvme_clear_nvme_request(struct request *req)
|
|
|
|
{
|
2021-06-11 05:44:37 +08:00
|
|
|
nvme_req(req)->status = 0;
|
2021-03-01 10:06:08 +08:00
|
|
|
nvme_req(req)->retries = 0;
|
|
|
|
nvme_req(req)->flags = 0;
|
|
|
|
req->rq_flags |= RQF_DONTPREP;
|
2018-04-12 23:16:15 +08:00
|
|
|
}
|
|
|
|
|
2022-03-15 22:53:59 +08:00
|
|
|
/* initialize a passthrough request */
|
|
|
|
void nvme_init_request(struct request *req, struct nvme_command *cmd)
|
2020-11-10 10:24:00 +08:00
|
|
|
{
|
2020-11-10 08:33:42 +08:00
|
|
|
if (req->q->queuedata)
|
|
|
|
req->timeout = NVME_IO_TIMEOUT;
|
|
|
|
else /* no queuedata implies admin queue */
|
2020-11-10 08:33:45 +08:00
|
|
|
req->timeout = NVME_ADMIN_TIMEOUT;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2021-03-18 04:37:03 +08:00
|
|
|
/* passthru commands should let the driver set the SGL flags */
|
|
|
|
cmd->common.flags &= ~NVME_CMD_SGL_ALL;
|
|
|
|
|
2015-11-26 16:08:36 +08:00
|
|
|
req->cmd_flags |= REQ_FAILFAST_DRIVER;
|
2021-06-11 05:44:35 +08:00
|
|
|
if (req->mq_hctx->type == HCTX_TYPE_POLL)
|
2021-10-12 19:12:21 +08:00
|
|
|
req->cmd_flags |= REQ_POLLED;
|
2018-04-12 23:16:15 +08:00
|
|
|
nvme_clear_nvme_request(req);
|
2021-03-18 04:37:03 +08:00
|
|
|
memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
|
2020-11-10 10:24:00 +08:00
|
|
|
}
|
2022-03-15 22:53:59 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_init_request);
|
2020-11-10 10:24:00 +08:00
|
|
|
|
2021-04-26 10:53:10 +08:00
|
|
|
/*
|
|
|
|
* For something we're not in a state to send to the device the default action
|
|
|
|
* is to busy it and retry it after the controller state is recovered. However,
|
|
|
|
* if the controller is deleting or if anything is marked for failfast or
|
|
|
|
* nvme multipath it is immediately failed.
|
|
|
|
*
|
|
|
|
* Note: commands used to initialize the controller will be marked for failfast.
|
|
|
|
* Note: nvme cli/ioctl commands are marked for failfast.
|
|
|
|
*/
|
|
|
|
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
|
|
|
|
struct request *rq)
|
|
|
|
{
|
|
|
|
if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
|
2021-11-04 15:13:32 +08:00
|
|
|
ctrl->state != NVME_CTRL_DELETING &&
|
2021-04-26 10:53:10 +08:00
|
|
|
ctrl->state != NVME_CTRL_DEAD &&
|
|
|
|
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
|
|
|
|
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
|
|
|
|
return BLK_STS_RESOURCE;
|
|
|
|
return nvme_host_path_error(rq);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
|
|
|
|
|
|
|
|
/*
 * Check whether @rq may be issued on its queue right now.
 *
 * @queue_live is the caller's view of whether the queue is live; it is the
 * answer unless one of the special cases below overrides it.
 */
bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		bool queue_live)
{
	struct nvme_request *nreq = nvme_req(rq);

	/*
	 * Passthru (user) commands on the admin queue are a problem while the
	 * controller is not LIVE: we cannot guarantee that they go out after
	 * the admin connect, controller enable and/or other commands of the
	 * initialization sequence.  Report them as not ready so that they get
	 * failed with BLK_STS_RESOURCE and rescheduled until the controller
	 * is LIVE.
	 */
	if (rq->q == ctrl->admin_q && (nreq->flags & NVME_REQ_USERCMD))
		return false;

	if (!(ctrl->ops->flags & NVME_F_FABRICS))
		return queue_live;

	/*
	 * Fabrics: only allow commands on a live queue, except for the
	 * connect command, which is required to set the queue live in the
	 * appropriate states.
	 */
	switch (ctrl->state) {
	case NVME_CTRL_DEAD:
		return false;
	case NVME_CTRL_CONNECTING:
		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(nreq->cmd) &&
		    nreq->cmd->fabrics.fctype == nvme_fabrics_type_connect)
			return true;
		break;
	default:
		break;
	}

	return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);
|
|
|
|
|
2017-06-28 02:03:06 +08:00
|
|
|
/*
 * Send a Directive Send (identify / enable) admin command for the streams
 * directive addressed to all namespaces, enabling or disabling it according
 * to @enable.  Returns the result of nvme_submit_sync_cmd().
 */
static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
{
	struct nvme_command cmd = { };

	cmd.directive.opcode = nvme_admin_directive_send;
	cmd.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
	cmd.directive.dtype = NVME_DIR_IDENTIFY;
	cmd.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
	cmd.directive.tdtype = NVME_DIR_STREAMS;
	if (enable)
		cmd.directive.endir = NVME_DIR_ENDIR;
	else
		cmd.directive.endir = 0;

	return nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, 0);
}
|
|
|
|
|
|
|
|
static int nvme_disable_streams(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
return nvme_toggle_streams(ctrl, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_enable_streams(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
return nvme_toggle_streams(ctrl, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fetch the streams directive parameters for @nsid into @s using a
 * Directive Receive admin command.  @s is zeroed before the command is
 * issued.  Returns the result of nvme_submit_sync_cmd().
 */
static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
				  struct streams_directive_params *s, u32 nsid)
{
	struct nvme_command cmd = { };

	memset(s, 0, sizeof(*s));

	cmd.directive.opcode = nvme_admin_directive_recv;
	cmd.directive.nsid = cpu_to_le32(nsid);
	cmd.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
	cmd.directive.dtype = NVME_DIR_STREAMS;
	cmd.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;

	return nvme_submit_sync_cmd(ctrl->admin_q, &cmd, s, sizeof(*s));
}
|
|
|
|
|
|
|
|
static int nvme_configure_directives(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct streams_directive_params s;
|
2022-02-15 23:37:19 +08:00
|
|
|
u16 nssa;
|
2017-06-28 02:03:06 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
|
|
|
|
return 0;
|
|
|
|
if (!streams)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
ret = nvme_enable_streams(ctrl);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2017-07-12 18:41:53 +08:00
|
|
|
ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
|
2017-06-28 02:03:06 +08:00
|
|
|
if (ret)
|
2020-05-13 16:18:13 +08:00
|
|
|
goto out_disable_stream;
|
2017-06-28 02:03:06 +08:00
|
|
|
|
2022-02-15 23:37:19 +08:00
|
|
|
nssa = le16_to_cpu(s.nssa);
|
|
|
|
if (nssa < BLK_MAX_WRITE_HINTS - 1) {
|
2017-06-28 02:03:06 +08:00
|
|
|
dev_info(ctrl->device, "too few streams (%u) available\n",
|
2022-02-15 23:37:19 +08:00
|
|
|
nssa);
|
2022-02-15 23:03:08 +08:00
|
|
|
/* this condition is not an error: streams are optional */
|
|
|
|
ret = 0;
|
2020-05-13 16:18:13 +08:00
|
|
|
goto out_disable_stream;
|
2017-06-28 02:03:06 +08:00
|
|
|
}
|
|
|
|
|
2022-02-15 23:37:19 +08:00
|
|
|
ctrl->nr_streams = min_t(u16, nssa, BLK_MAX_WRITE_HINTS - 1);
|
2017-06-28 02:03:06 +08:00
|
|
|
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
|
|
|
|
return 0;
|
2020-05-13 16:18:13 +08:00
|
|
|
|
|
|
|
out_disable_stream:
|
|
|
|
nvme_disable_streams(ctrl);
|
|
|
|
return ret;
|
2017-06-28 02:03:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if 'req' has a write hint associated with it. If it does, assign
|
|
|
|
* a valid namespace stream to the write.
|
|
|
|
*/
|
|
|
|
static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
|
|
|
|
struct request *req, u16 *control,
|
|
|
|
u32 *dsmgmt)
|
|
|
|
{
|
|
|
|
enum rw_hint streamid = req->write_hint;
|
|
|
|
|
|
|
|
if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
|
|
|
|
streamid = 0;
|
|
|
|
else {
|
|
|
|
streamid--;
|
|
|
|
if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
|
|
|
|
return;
|
|
|
|
|
|
|
|
*control |= NVME_RW_DTYPE_STREAMS;
|
|
|
|
*dsmgmt |= streamid << 16;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (streamid < ARRAY_SIZE(req->q->write_hints))
|
|
|
|
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
|
|
|
|
}
|
|
|
|
|
2016-04-13 03:10:14 +08:00
|
|
|
static inline void nvme_setup_flush(struct nvme_ns *ns,
|
|
|
|
struct nvme_command *cmnd)
|
|
|
|
{
|
2021-10-18 20:45:06 +08:00
|
|
|
memset(cmnd, 0, sizeof(*cmnd));
|
2016-04-13 03:10:14 +08:00
|
|
|
cmnd->common.opcode = nvme_cmd_flush;
|
2017-11-09 20:50:43 +08:00
|
|
|
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
|
2016-04-13 03:10:14 +08:00
|
|
|
}
|
|
|
|
|
2017-06-03 15:38:05 +08:00
|
|
|
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
2016-04-13 03:10:14 +08:00
|
|
|
struct nvme_command *cmnd)
|
|
|
|
{
|
2017-02-08 21:46:50 +08:00
|
|
|
unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
|
2016-04-13 03:10:14 +08:00
|
|
|
struct nvme_dsm_range *range;
|
2017-02-08 21:46:50 +08:00
|
|
|
struct bio *bio;
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2019-11-13 04:55:01 +08:00
|
|
|
/*
|
|
|
|
* Some devices do not consider the DSM 'Number of Ranges' field when
|
|
|
|
* determining how much data to DMA. Always allocate memory for maximum
|
|
|
|
* number of segments to prevent device reading beyond end of buffer.
|
|
|
|
*/
|
|
|
|
static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
|
|
|
|
|
|
|
|
range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
|
2018-12-13 00:18:11 +08:00
|
|
|
if (!range) {
|
|
|
|
/*
|
|
|
|
* If we fail allocation our range, fallback to the controller
|
|
|
|
* discard page. If that's also busy, it's safe to return
|
|
|
|
* busy, as we know we can make progress once that's freed.
|
|
|
|
*/
|
|
|
|
if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
|
|
|
|
return BLK_STS_RESOURCE;
|
|
|
|
|
|
|
|
range = page_address(ns->ctrl->discard_page);
|
|
|
|
}
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2017-02-08 21:46:50 +08:00
|
|
|
__rq_for_each_bio(bio, req) {
|
2019-10-21 11:40:03 +08:00
|
|
|
u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
|
2017-02-08 21:46:50 +08:00
|
|
|
u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
|
|
|
|
|
2018-02-01 08:01:58 +08:00
|
|
|
if (n < segments) {
|
|
|
|
range[n].cattr = cpu_to_le32(0);
|
|
|
|
range[n].nlb = cpu_to_le32(nlb);
|
|
|
|
range[n].slba = cpu_to_le64(slba);
|
|
|
|
}
|
2017-02-08 21:46:50 +08:00
|
|
|
n++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(n != segments)) {
|
2018-12-13 00:18:11 +08:00
|
|
|
if (virt_to_page(range) == ns->ctrl->discard_page)
|
|
|
|
clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
|
|
|
|
else
|
|
|
|
kfree(range);
|
2017-06-03 15:38:05 +08:00
|
|
|
return BLK_STS_IOERR;
|
2017-02-08 21:46:50 +08:00
|
|
|
}
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2021-10-18 20:45:06 +08:00
|
|
|
memset(cmnd, 0, sizeof(*cmnd));
|
2016-04-13 03:10:14 +08:00
|
|
|
cmnd->dsm.opcode = nvme_cmd_dsm;
|
2017-11-09 20:50:43 +08:00
|
|
|
cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
|
2017-03-31 23:00:05 +08:00
|
|
|
cmnd->dsm.nr = cpu_to_le32(segments - 1);
|
2016-04-13 03:10:14 +08:00
|
|
|
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
|
|
|
|
|
2016-12-09 06:20:32 +08:00
|
|
|
req->special_vec.bv_page = virt_to_page(range);
|
|
|
|
req->special_vec.bv_offset = offset_in_page(range);
|
2019-11-13 04:55:01 +08:00
|
|
|
req->special_vec.bv_len = alloc_size;
|
2016-12-09 06:20:32 +08:00
|
|
|
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2017-06-03 15:38:05 +08:00
|
|
|
return BLK_STS_OK;
|
2016-04-13 03:10:14 +08:00
|
|
|
}
|
|
|
|
|
2018-12-18 11:42:03 +08:00
|
|
|
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
|
|
|
|
struct request *req, struct nvme_command *cmnd)
|
|
|
|
{
|
2021-10-18 20:45:06 +08:00
|
|
|
memset(cmnd, 0, sizeof(*cmnd));
|
|
|
|
|
2018-12-18 11:42:03 +08:00
|
|
|
if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
|
|
|
|
return nvme_setup_discard(ns, req, cmnd);
|
|
|
|
|
|
|
|
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
|
|
|
|
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
|
|
|
|
cmnd->write_zeroes.slba =
|
2019-10-21 11:40:03 +08:00
|
|
|
cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
2018-12-18 11:42:03 +08:00
|
|
|
cmnd->write_zeroes.length =
|
|
|
|
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
2021-11-10 17:19:06 +08:00
|
|
|
|
|
|
|
if (nvme_ns_has_pi(ns)) {
|
2021-07-21 16:00:11 +08:00
|
|
|
cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
|
2021-11-10 17:19:06 +08:00
|
|
|
|
|
|
|
switch (ns->pi_type) {
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
|
|
|
cmnd->write_zeroes.reftag =
|
|
|
|
cpu_to_le32(t10_pi_ref_tag(req));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-18 11:42:03 +08:00
|
|
|
return BLK_STS_OK;
|
|
|
|
}
|
|
|
|
|
2017-06-13 00:36:32 +08:00
|
|
|
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
2020-06-30 03:06:41 +08:00
|
|
|
struct request *req, struct nvme_command *cmnd,
|
|
|
|
enum nvme_opcode op)
|
2016-04-13 03:10:14 +08:00
|
|
|
{
|
2017-06-28 02:03:06 +08:00
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
2016-04-13 03:10:14 +08:00
|
|
|
u16 control = 0;
|
|
|
|
u32 dsmgmt = 0;
|
|
|
|
|
|
|
|
if (req->cmd_flags & REQ_FUA)
|
|
|
|
control |= NVME_RW_FUA;
|
|
|
|
if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
|
|
|
|
control |= NVME_RW_LR;
|
|
|
|
|
|
|
|
if (req->cmd_flags & REQ_RAHEAD)
|
|
|
|
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
|
|
|
|
|
2020-06-30 03:06:41 +08:00
|
|
|
cmnd->rw.opcode = op;
|
2021-10-18 20:47:18 +08:00
|
|
|
cmnd->rw.flags = 0;
|
2017-11-09 20:50:43 +08:00
|
|
|
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
|
2021-10-18 20:47:18 +08:00
|
|
|
cmnd->rw.rsvd2 = 0;
|
|
|
|
cmnd->rw.metadata = 0;
|
2019-10-21 11:40:03 +08:00
|
|
|
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
2016-04-13 03:10:14 +08:00
|
|
|
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
2021-10-18 20:47:18 +08:00
|
|
|
cmnd->rw.reftag = 0;
|
|
|
|
cmnd->rw.apptag = 0;
|
|
|
|
cmnd->rw.appmask = 0;
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2017-06-28 02:03:06 +08:00
|
|
|
if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
|
|
|
|
nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
|
|
|
|
|
2016-04-13 03:10:14 +08:00
|
|
|
if (ns->ms) {
|
2017-11-08 00:27:34 +08:00
|
|
|
/*
|
|
|
|
* If formated with metadata, the block layer always provides a
|
|
|
|
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
|
|
|
|
* we enable the PRACT bit for protection information or set the
|
|
|
|
* namespace capacity to zero to prevent any I/O.
|
|
|
|
*/
|
|
|
|
if (!blk_integrity_rq(req)) {
|
|
|
|
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
|
|
|
|
return BLK_STS_NOTSUPP;
|
|
|
|
control |= NVME_RW_PRINFO_PRACT;
|
|
|
|
}
|
|
|
|
|
2016-04-13 03:10:14 +08:00
|
|
|
switch (ns->pi_type) {
|
|
|
|
case NVME_NS_DPS_PI_TYPE3:
|
|
|
|
control |= NVME_RW_PRINFO_PRCHK_GUARD;
|
|
|
|
break;
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
|
|
|
control |= NVME_RW_PRINFO_PRCHK_GUARD |
|
|
|
|
NVME_RW_PRINFO_PRCHK_REF;
|
2020-06-30 03:06:41 +08:00
|
|
|
if (op == nvme_cmd_zone_append)
|
|
|
|
control |= NVME_RW_APPEND_PIREMAP;
|
2018-07-30 05:15:31 +08:00
|
|
|
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
|
2016-04-13 03:10:14 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cmnd->rw.control = cpu_to_le16(control);
|
|
|
|
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
|
2017-06-13 00:36:32 +08:00
|
|
|
return 0;
|
2016-04-13 03:10:14 +08:00
|
|
|
}
|
|
|
|
|
2018-07-30 05:15:33 +08:00
|
|
|
void nvme_cleanup_cmd(struct request *req)
|
|
|
|
{
|
|
|
|
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
|
2021-01-13 22:36:27 +08:00
|
|
|
struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
|
2018-12-13 00:18:11 +08:00
|
|
|
|
2021-08-04 17:56:34 +08:00
|
|
|
if (req->special_vec.bv_page == ctrl->discard_page)
|
2021-01-13 22:36:27 +08:00
|
|
|
clear_bit_unlock(0, &ctrl->discard_page_busy);
|
2018-12-13 00:18:11 +08:00
|
|
|
else
|
2021-08-04 17:56:34 +08:00
|
|
|
kfree(bvec_virt(&req->special_vec));
|
2018-07-30 05:15:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
|
|
|
|
|
2021-03-18 04:37:03 +08:00
|
|
|
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
|
2016-04-13 03:10:14 +08:00
|
|
|
{
|
2021-03-18 04:37:03 +08:00
|
|
|
struct nvme_command *cmd = nvme_req(req)->cmd;
|
2017-06-03 15:38:05 +08:00
|
|
|
blk_status_t ret = BLK_STS_OK;
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2021-10-18 20:45:06 +08:00
|
|
|
if (!(req->rq_flags & RQF_DONTPREP))
|
2021-03-01 10:06:08 +08:00
|
|
|
nvme_clear_nvme_request(req);
|
2017-04-06 01:18:08 +08:00
|
|
|
|
2017-01-31 23:57:31 +08:00
|
|
|
switch (req_op(req)) {
|
|
|
|
case REQ_OP_DRV_IN:
|
|
|
|
case REQ_OP_DRV_OUT:
|
2021-03-18 04:37:03 +08:00
|
|
|
/* these are setup prior to execution in nvme_init_request() */
|
2017-01-31 23:57:31 +08:00
|
|
|
break;
|
|
|
|
case REQ_OP_FLUSH:
|
2016-04-13 03:10:14 +08:00
|
|
|
nvme_setup_flush(ns, cmd);
|
2017-01-31 23:57:31 +08:00
|
|
|
break;
|
2020-06-30 03:06:41 +08:00
|
|
|
case REQ_OP_ZONE_RESET_ALL:
|
|
|
|
case REQ_OP_ZONE_RESET:
|
|
|
|
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_OPEN:
|
|
|
|
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_CLOSE:
|
|
|
|
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_FINISH:
|
|
|
|
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
|
|
|
|
break;
|
2017-04-06 01:21:13 +08:00
|
|
|
case REQ_OP_WRITE_ZEROES:
|
2018-12-18 11:42:03 +08:00
|
|
|
ret = nvme_setup_write_zeroes(ns, req, cmd);
|
|
|
|
break;
|
2017-01-31 23:57:31 +08:00
|
|
|
case REQ_OP_DISCARD:
|
2016-04-13 03:10:14 +08:00
|
|
|
ret = nvme_setup_discard(ns, req, cmd);
|
2017-01-31 23:57:31 +08:00
|
|
|
break;
|
|
|
|
case REQ_OP_READ:
|
2020-06-30 03:06:41 +08:00
|
|
|
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
|
|
|
|
break;
|
2017-01-31 23:57:31 +08:00
|
|
|
case REQ_OP_WRITE:
|
2020-06-30 03:06:41 +08:00
|
|
|
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_APPEND:
|
|
|
|
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
|
2017-01-31 23:57:31 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ON_ONCE(1);
|
2017-06-03 15:38:05 +08:00
|
|
|
return BLK_STS_IOERR;
|
2017-01-31 23:57:31 +08:00
|
|
|
}
|
2016-04-13 03:10:14 +08:00
|
|
|
|
2021-06-17 05:19:36 +08:00
|
|
|
cmd->common.command_id = nvme_cid(req);
|
2018-06-30 06:50:01 +08:00
|
|
|
trace_nvme_setup_cmd(req, cmd);
|
2016-04-13 03:10:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
|
|
|
|
|
2021-06-11 05:44:37 +08:00
|
|
|
/*
|
|
|
|
* Return values:
|
|
|
|
* 0: success
|
|
|
|
* >0: nvme controller's cqe status response
|
|
|
|
* <0: kernel error in lieu of controller response
|
|
|
|
*/
|
2022-01-22 13:05:39 +08:00
|
|
|
static int nvme_execute_rq(struct request *rq, bool at_head)
|
2021-06-11 05:44:37 +08:00
|
|
|
{
|
|
|
|
blk_status_t status;
|
|
|
|
|
2021-11-26 20:18:01 +08:00
|
|
|
status = blk_execute_rq(rq, at_head);
|
2021-06-11 05:44:37 +08:00
|
|
|
if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
|
|
|
|
return -EINTR;
|
|
|
|
if (nvme_req(rq)->status)
|
|
|
|
return nvme_req(rq)->status;
|
|
|
|
return blk_status_to_errno(status);
|
|
|
|
}
|
|
|
|
|
2015-11-20 16:00:02 +08:00
|
|
|
/*
|
|
|
|
* Returns 0 on success. If the result is negative, it's a Linux error code;
|
|
|
|
* if the result is positive, it's an NVM Express status code
|
|
|
|
*/
|
|
|
|
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
|
2016-11-10 23:32:33 +08:00
|
|
|
union nvme_result *result, void *buffer, unsigned bufflen,
|
2017-11-10 02:49:59 +08:00
|
|
|
unsigned timeout, int qid, int at_head,
|
2021-06-11 05:44:35 +08:00
|
|
|
blk_mq_req_flags_t flags)
|
2015-11-20 16:00:02 +08:00
|
|
|
{
|
|
|
|
struct request *req;
|
|
|
|
int ret;
|
|
|
|
|
2020-11-10 10:24:00 +08:00
|
|
|
if (qid == NVME_QID_ANY)
|
2022-03-15 22:53:59 +08:00
|
|
|
req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
|
2020-11-10 10:24:00 +08:00
|
|
|
else
|
2022-03-15 22:53:59 +08:00
|
|
|
req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
|
|
|
|
qid ? qid - 1 : 0);
|
|
|
|
|
2015-11-20 16:00:02 +08:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2022-03-15 22:53:59 +08:00
|
|
|
nvme_init_request(req, cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
2020-11-10 08:33:42 +08:00
|
|
|
if (timeout)
|
|
|
|
req->timeout = timeout;
|
2015-11-20 16:00:02 +08:00
|
|
|
|
2015-11-26 16:08:36 +08:00
|
|
|
if (buffer && bufflen) {
|
|
|
|
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2015-11-20 16:00:02 +08:00
|
|
|
}
|
|
|
|
|
2022-01-22 13:05:39 +08:00
|
|
|
ret = nvme_execute_rq(req, at_head);
|
2021-06-11 05:44:37 +08:00
|
|
|
if (result && ret >= 0)
|
2016-11-10 23:32:33 +08:00
|
|
|
*result = nvme_req(req)->result;
|
2015-11-20 16:00:02 +08:00
|
|
|
out:
|
|
|
|
blk_mq_free_request(req);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-06-13 22:45:23 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
|
|
|
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
|
|
|
|
void *buffer, unsigned bufflen)
|
|
|
|
{
|
2016-06-13 22:45:23 +08:00
|
|
|
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
|
2021-06-11 05:44:35 +08:00
|
|
|
NVME_QID_ANY, 0, 0);
|
2015-11-20 16:00:02 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
|
2015-11-20 16:00:02 +08:00
|
|
|
|
2020-07-25 01:25:13 +08:00
|
|
|
/*
 * Command effects the driver assumes for certain admin opcodes (Format NVM
 * and Sanitize).  Returns 0 for every other opcode.
 */
static u32 nvme_known_admin_effects(u8 opcode)
{
	switch (opcode) {
	case nvme_admin_sanitize_nvm:
		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
	case nvme_admin_format_nvm:
		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
		       NVME_CMD_EFFECTS_CSE_MASK;
	default:
		return 0;
	}
}
|
|
|
|
|
|
|
|
/*
 * Look up the command effects for @opcode.
 *
 * For an I/O command (@ns set) the effects from the namespace head's
 * effects log are only checked: anything beyond CSUPP and LBCC produces a
 * one-time warning, and 0 is returned.  For an admin command (@ns NULL) the
 * controller's effects log entry, if present, is combined with the effects
 * the driver knows about itself.
 */
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
	u32 effects = 0;

	if (!ns) {
		if (ctrl->effects)
			effects = le32_to_cpu(ctrl->effects->acs[opcode]);
		return effects | nvme_known_admin_effects(opcode);
	}

	if (ns->head->effects)
		effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
	if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
		dev_warn_once(ctrl->device,
			"IO command:%02x has unhandled effects:%08x\n",
			opcode, effects);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
|
|
|
|
|
|
|
|
static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
|
|
|
u8 opcode)
|
|
|
|
{
|
|
|
|
u32 effects = nvme_command_effects(ctrl, ns, opcode);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For simplicity, IO to all namespaces is quiesced even if the command
|
|
|
|
* effects say only one namespace is affected.
|
|
|
|
*/
|
2020-09-28 17:10:36 +08:00
|
|
|
if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
|
2020-07-25 01:25:13 +08:00
|
|
|
mutex_lock(&ctrl->scan_lock);
|
|
|
|
mutex_lock(&ctrl->subsys->lock);
|
|
|
|
nvme_mpath_start_freeze(ctrl->subsys);
|
|
|
|
nvme_mpath_wait_freeze(ctrl->subsys);
|
|
|
|
nvme_start_freeze(ctrl);
|
|
|
|
nvme_wait_freeze(ctrl);
|
|
|
|
}
|
|
|
|
return effects;
|
|
|
|
}
|
|
|
|
|
2021-09-01 16:23:42 +08:00
|
|
|
/*
 * Finish a passthrough command according to its @effects.
 *
 * Undoes the freeze and locking done by nvme_passthru_start(), finishes
 * controller initialization again if the controller capabilities changed,
 * and rescans namespaces if the namespace inventory may have changed.
 * Finally a successful Set Features (KATO) command updates the host's
 * keep alive handling.
 */
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
			      struct nvme_command *cmd, int status)
{
	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
		nvme_unfreeze(ctrl);
		nvme_mpath_unfreeze(ctrl->subsys);
		mutex_unlock(&ctrl->subsys->lock);
		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
		mutex_unlock(&ctrl->scan_lock);
	}
	if (effects & NVME_CMD_EFFECTS_CCC)
		nvme_init_ctrl_finish(ctrl);
	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
		nvme_queue_scan(ctrl);
		flush_work(&ctrl->scan_work);
	}

	if (cmd->common.opcode != nvme_admin_set_features)
		return;
	if ((le32_to_cpu(cmd->common.cdw10) & 0xFF) != NVME_FEAT_KATO)
		return;

	/*
	 * Keep alive commands interval on the host should be updated when
	 * KATO is modified by Set Features commands.
	 */
	if (!status)
		nvme_update_keep_alive(ctrl, cmd);
}
|
|
|
|
|
2021-06-11 05:44:37 +08:00
|
|
|
int nvme_execute_passthru_rq(struct request *rq)
|
2020-07-25 01:25:14 +08:00
|
|
|
{
|
|
|
|
struct nvme_command *cmd = nvme_req(rq)->cmd;
|
|
|
|
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
|
|
|
|
struct nvme_ns *ns = rq->q->queuedata;
|
|
|
|
u32 effects;
|
2021-06-11 05:44:37 +08:00
|
|
|
int ret;
|
2020-07-25 01:25:14 +08:00
|
|
|
|
|
|
|
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
|
2022-01-22 13:05:39 +08:00
|
|
|
ret = nvme_execute_rq(rq, false);
|
2021-03-09 03:18:04 +08:00
|
|
|
if (effects) /* nothing to be done for zero cmd effects */
|
2021-09-01 16:23:42 +08:00
|
|
|
nvme_passthru_end(ctrl, effects, cmd, ret);
|
2021-06-11 05:44:37 +08:00
|
|
|
|
|
|
|
return ret;
|
2020-07-25 01:25:14 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
|
|
|
|
|
2021-04-16 19:46:20 +08:00
|
|
|
/*
|
|
|
|
* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
|
|
|
|
*
|
|
|
|
* The host should send Keep Alive commands at half of the Keep Alive Timeout
|
|
|
|
* accounting for transport roundtrip times [..].
|
|
|
|
*/
|
|
|
|
static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
|
2015-11-20 16:00:02 +08:00
|
|
|
{
|
2021-04-16 19:46:20 +08:00
|
|
|
queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
|
2015-11-26 16:08:36 +08:00
|
|
|
}
|
|
|
|
|
2017-06-03 15:38:04 +08:00
|
|
|
/*
 * Completion handler for the keep-alive request issued by
 * nvme_keep_alive_work().  Frees the request, logs a failure, and on success
 * re-arms the keep-alive work if the controller is still usable.
 */
static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;
	unsigned long irq_flags;
	bool rearm;

	blk_mq_free_request(rq);

	if (status) {
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n",
			status);
		return;
	}

	ctrl->comp_seen = false;

	/*
	 * nvme_stop_keep_alive() only cancels a pending delayed work item; it
	 * has no effect on a keep-alive that is already in flight.  Check the
	 * controller state before rescheduling so that a completion racing
	 * with controller deletion does not restart the timer on a controller
	 * that is being torn down.
	 */
	spin_lock_irqsave(&ctrl->lock, irq_flags);
	rearm = ctrl->state == NVME_CTRL_LIVE ||
		ctrl->state == NVME_CTRL_CONNECTING;
	spin_unlock_irqrestore(&ctrl->lock, irq_flags);

	if (rearm)
		nvme_queue_keep_alive_work(ctrl);
}
|
|
|
|
|
|
|
|
static void nvme_keep_alive_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
|
|
|
|
struct nvme_ctrl, ka_work);
|
2018-11-03 01:28:15 +08:00
|
|
|
bool comp_seen = ctrl->comp_seen;
|
2021-03-03 20:46:06 +08:00
|
|
|
struct request *rq;
|
2018-11-03 01:28:15 +08:00
|
|
|
|
|
|
|
if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
|
|
|
|
dev_dbg(ctrl->device,
|
|
|
|
"reschedule traffic based keep-alive timer\n");
|
|
|
|
ctrl->comp_seen = false;
|
2021-04-16 19:46:20 +08:00
|
|
|
nvme_queue_keep_alive_work(ctrl);
|
2018-11-03 01:28:15 +08:00
|
|
|
return;
|
|
|
|
}
|
2016-06-13 22:45:28 +08:00
|
|
|
|
2022-03-15 22:53:59 +08:00
|
|
|
rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
|
|
|
|
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
|
2021-03-03 20:46:06 +08:00
|
|
|
if (IS_ERR(rq)) {
|
2016-06-13 22:45:28 +08:00
|
|
|
/* allocation failure, reset the controller */
|
2021-03-03 20:51:47 +08:00
|
|
|
dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
|
2017-06-13 00:21:19 +08:00
|
|
|
nvme_reset_ctrl(ctrl);
|
2016-06-13 22:45:28 +08:00
|
|
|
return;
|
|
|
|
}
|
2022-03-15 22:53:59 +08:00
|
|
|
nvme_init_request(rq, &ctrl->ka_cmd);
|
2021-03-03 20:46:06 +08:00
|
|
|
|
|
|
|
rq->timeout = ctrl->kato * HZ;
|
|
|
|
rq->end_io_data = ctrl;
|
2021-11-26 20:18:01 +08:00
|
|
|
blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io);
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
2018-04-12 23:16:05 +08:00
|
|
|
static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
|
2016-06-13 22:45:28 +08:00
|
|
|
{
|
|
|
|
if (unlikely(ctrl->kato == 0))
|
|
|
|
return;
|
|
|
|
|
2021-04-16 19:46:20 +08:00
|
|
|
nvme_queue_keep_alive_work(ctrl);
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (unlikely(ctrl->kato == 0))
|
|
|
|
return;
|
|
|
|
|
|
|
|
cancel_delayed_work_sync(&ctrl->ka_work);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
|
|
|
|
|
2021-09-01 16:23:42 +08:00
|
|
|
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_command *cmd)
|
|
|
|
{
|
|
|
|
unsigned int new_kato =
|
|
|
|
DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
|
|
|
|
|
|
|
|
dev_info(ctrl->device,
|
|
|
|
"keep alive interval updated from %u ms to %u ms\n",
|
|
|
|
ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
|
|
|
|
|
|
|
|
nvme_stop_keep_alive(ctrl);
|
|
|
|
ctrl->kato = new_kato;
|
|
|
|
nvme_start_keep_alive(ctrl);
|
|
|
|
}
|
|
|
|
|
2020-04-04 16:11:28 +08:00
|
|
|
/*
|
|
|
|
* In NVMe 1.0 the CNS field was just a binary controller or namespace
|
|
|
|
* flag, thus sending any new CNS opcodes has a big chance of not working.
|
|
|
|
* Qemu unfortunately had that bug after reporting a 1.1 version compliance
|
|
|
|
* (but not for any later version).
|
|
|
|
*/
|
|
|
|
static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
|
|
|
|
return ctrl->vs < NVME_VS(1, 2, 0);
|
|
|
|
return ctrl->vs < NVME_VS(1, 1, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-21 03:09:56 +08:00
|
|
|
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
|
2015-11-26 16:08:36 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
|
|
|
|
c.identify.opcode = nvme_admin_identify;
|
2017-01-26 23:17:28 +08:00
|
|
|
c.identify.cns = NVME_ID_CNS_CTRL;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
|
|
|
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
|
|
|
|
if (!*id)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
|
|
|
|
sizeof(struct nvme_id_ctrl));
|
|
|
|
if (error)
|
|
|
|
kfree(*id);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2020-02-20 00:14:31 +08:00
|
|
|
/*
 * Decode one namespace identification descriptor into @ids.
 *
 * @ctrl:     controller, used only for warning messages
 * @ids:      destination for the EUI64 / NGUID / UUID / CSI value
 * @cur:      descriptor header; the payload follows it directly in memory
 * @csi_seen: set to true when a CSI descriptor is decoded
 *
 * Returns the payload length of the descriptor (excluding the header), or -1
 * when a known descriptor type carries an unexpected length.  Descriptors of
 * unknown type are skipped and their reported length is returned.
 */
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
		struct nvme_ns_id_desc *cur, bool *csi_seen)
{
	const char *warn_str = "ctrl returned bogus length:";
	void *payload = cur + 1;

	switch (cur->nidt) {
	case NVME_NIDT_EUI64:
		if (cur->nidl == NVME_NIDT_EUI64_LEN) {
			memcpy(ids->eui64, payload, NVME_NIDT_EUI64_LEN);
			return NVME_NIDT_EUI64_LEN;
		}
		dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
			 warn_str, cur->nidl);
		return -1;
	case NVME_NIDT_NGUID:
		if (cur->nidl == NVME_NIDT_NGUID_LEN) {
			memcpy(ids->nguid, payload, NVME_NIDT_NGUID_LEN);
			return NVME_NIDT_NGUID_LEN;
		}
		dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
			 warn_str, cur->nidl);
		return -1;
	case NVME_NIDT_UUID:
		if (cur->nidl == NVME_NIDT_UUID_LEN) {
			uuid_copy(&ids->uuid, payload);
			return NVME_NIDT_UUID_LEN;
		}
		dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
			 warn_str, cur->nidl);
		return -1;
	case NVME_NIDT_CSI:
		if (cur->nidl == NVME_NIDT_CSI_LEN) {
			memcpy(&ids->csi, payload, NVME_NIDT_CSI_LEN);
			*csi_seen = true;
			return NVME_NIDT_CSI_LEN;
		}
		dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
			 warn_str, cur->nidl);
		return -1;
	default:
		/* Skip unknown types */
		return cur->nidl;
	}
}
|
|
|
|
|
2017-08-16 22:14:47 +08:00
|
|
|
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
|
2017-11-09 20:50:16 +08:00
|
|
|
struct nvme_ns_ids *ids)
|
2017-06-07 17:45:34 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
2020-06-30 03:06:39 +08:00
|
|
|
bool csi_seen = false;
|
|
|
|
int status, pos, len;
|
2017-06-07 17:45:34 +08:00
|
|
|
void *data;
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
|
|
|
|
return 0;
|
2020-07-28 19:09:03 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
|
|
|
|
return 0;
|
|
|
|
|
2017-06-07 17:45:34 +08:00
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.nsid = cpu_to_le32(nsid);
|
|
|
|
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
|
|
|
|
|
|
|
|
data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
|
|
|
|
if (!data)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-08-16 22:14:47 +08:00
|
|
|
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
|
2017-06-07 17:45:34 +08:00
|
|
|
NVME_IDENTIFY_DATA_SIZE);
|
2020-03-25 21:19:35 +08:00
|
|
|
if (status) {
|
|
|
|
dev_warn(ctrl->device,
|
2020-11-30 20:47:46 +08:00
|
|
|
"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
|
|
|
|
nsid, status);
|
2017-06-07 17:45:34 +08:00
|
|
|
goto free_data;
|
2020-03-25 21:19:35 +08:00
|
|
|
}
|
2017-06-07 17:45:34 +08:00
|
|
|
|
|
|
|
for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
|
|
|
|
struct nvme_ns_id_desc *cur = data + pos;
|
|
|
|
|
|
|
|
if (cur->nidl == 0)
|
|
|
|
break;
|
|
|
|
|
2020-06-30 03:06:39 +08:00
|
|
|
len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
|
2020-02-20 00:14:31 +08:00
|
|
|
if (len < 0)
|
2020-06-30 03:06:39 +08:00
|
|
|
break;
|
2017-06-07 17:45:34 +08:00
|
|
|
|
|
|
|
len += sizeof(*cur);
|
|
|
|
}
|
2020-06-30 03:06:39 +08:00
|
|
|
|
|
|
|
if (nvme_multi_css(ctrl) && !csi_seen) {
|
|
|
|
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
|
|
|
|
nsid);
|
|
|
|
status = -EINVAL;
|
|
|
|
}
|
|
|
|
|
2017-06-07 17:45:34 +08:00
|
|
|
free_data:
|
|
|
|
kfree(data);
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
|
|
|
struct nvme_ns_ids *ids, struct nvme_id_ns **id)
|
2015-11-26 16:08:36 +08:00
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
|
2017-01-26 23:17:27 +08:00
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.nsid = cpu_to_le32(nsid);
|
2017-01-26 23:17:28 +08:00
|
|
|
c.identify.cns = NVME_ID_CNS_NS;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-08-03 09:11:42 +08:00
|
|
|
*id = kmalloc(sizeof(**id), GFP_KERNEL);
|
|
|
|
if (!*id)
|
|
|
|
return -ENOMEM;
|
2015-11-26 16:08:36 +08:00
|
|
|
|
2019-08-03 09:11:42 +08:00
|
|
|
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
|
2017-08-16 22:14:47 +08:00
|
|
|
if (error) {
|
2019-04-05 02:57:45 +08:00
|
|
|
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
|
2020-09-28 18:33:19 +08:00
|
|
|
goto out_free_id;
|
2017-08-16 22:14:47 +08:00
|
|
|
}
|
|
|
|
|
2021-02-26 15:17:25 +08:00
|
|
|
error = NVME_SC_INVALID_NS | NVME_SC_DNR;
|
2020-09-28 18:33:19 +08:00
|
|
|
if ((*id)->ncap == 0) /* namespace not allocated or attached */
|
|
|
|
goto out_free_id;
|
2020-09-28 20:07:56 +08:00
|
|
|
|
|
|
|
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
|
|
|
|
!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
|
|
|
memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
|
|
|
|
if (ctrl->vs >= NVME_VS(1, 2, 0) &&
|
|
|
|
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
|
|
|
memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
|
|
|
|
|
2020-09-28 18:33:19 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_free_id:
|
|
|
|
kfree(*id);
|
2019-08-03 09:11:42 +08:00
|
|
|
return error;
|
2015-11-26 16:08:36 +08:00
|
|
|
}
|
|
|
|
|
2019-05-27 00:29:01 +08:00
|
|
|
/*
 * Common helper behind nvme_set_features() and nvme_get_features().
 *
 * Builds a features command with opcode @op, feature id @fid and @dword11,
 * submits it synchronously on the admin queue with @buffer/@buflen as the
 * data buffer, and, when the submission did not return a negative value and
 * @result is non-NULL, stores the 32-bit completion result in *@result.
 *
 * Returns the value of __nvme_submit_sync_cmd().
 */
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
	union nvme_result res = { 0 };
	struct nvme_command c = { };
	int ret;

	c.features.opcode = op;
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
	if (ret < 0 || !result)
		return ret;

	*result = le32_to_cpu(res.u32);
	return ret;
}
|
|
|
|
|
2019-05-27 00:29:01 +08:00
|
|
|
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
|
|
|
|
unsigned int dword11, void *buffer, size_t buflen,
|
|
|
|
u32 *result)
|
|
|
|
{
|
|
|
|
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
|
|
|
|
buflen, result);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_set_features);
|
|
|
|
|
|
|
|
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
|
|
|
|
unsigned int dword11, void *buffer, size_t buflen,
|
|
|
|
u32 *result)
|
|
|
|
{
|
|
|
|
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
|
|
|
|
buflen, result);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_get_features);
|
|
|
|
|
2015-11-26 18:09:06 +08:00
|
|
|
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
|
|
|
|
{
|
|
|
|
u32 q_count = (*count - 1) | ((*count - 1) << 16);
|
|
|
|
u32 result;
|
|
|
|
int status, nr_io_queues;
|
|
|
|
|
2016-09-17 02:16:10 +08:00
|
|
|
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
|
2015-11-26 18:09:06 +08:00
|
|
|
&result);
|
2016-06-07 05:20:50 +08:00
|
|
|
if (status < 0)
|
2015-11-26 18:09:06 +08:00
|
|
|
return status;
|
|
|
|
|
2016-06-07 05:20:50 +08:00
|
|
|
/*
|
|
|
|
* Degraded controllers might return an error when setting the queue
|
|
|
|
* count. We still want to be able to bring them online and offer
|
|
|
|
* access to the admin queue, as that might be only way to fix them up.
|
|
|
|
*/
|
|
|
|
if (status > 0) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
|
2016-06-07 05:20:50 +08:00
|
|
|
*count = 0;
|
|
|
|
} else {
|
|
|
|
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
|
|
|
|
*count = min(*count, nr_io_queues);
|
|
|
|
}
|
|
|
|
|
2015-11-26 18:09:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
|
2015-11-26 18:09:06 +08:00
|
|
|
|
2018-05-22 17:09:55 +08:00
|
|
|
#define NVME_AEN_SUPPORTED \
|
2019-07-13 02:02:10 +08:00
|
|
|
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
|
|
|
|
NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
|
2018-05-22 17:09:55 +08:00
|
|
|
|
|
|
|
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2018-07-03 00:34:38 +08:00
|
|
|
u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
|
2018-05-22 17:09:55 +08:00
|
|
|
int status;
|
|
|
|
|
2018-07-03 00:34:38 +08:00
|
|
|
if (!supported_aens)
|
|
|
|
return;
|
|
|
|
|
|
|
|
status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
|
|
|
|
NULL, 0, &result);
|
2018-05-22 17:09:55 +08:00
|
|
|
if (status)
|
|
|
|
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
|
2018-07-03 00:34:38 +08:00
|
|
|
supported_aens);
|
2019-08-23 02:25:46 +08:00
|
|
|
|
|
|
|
queue_work(nvme_wq, &ctrl->async_event_work);
|
2018-05-22 17:09:55 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 20:36:47 +08:00
|
|
|
static int nvme_ns_open(struct nvme_ns *ns)
|
2020-03-05 19:13:29 +08:00
|
|
|
{
|
|
|
|
|
2017-11-02 19:59:30 +08:00
|
|
|
/* should never be called due to GENHD_FL_HIDDEN */
|
2021-04-07 23:49:29 +08:00
|
|
|
if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
|
2018-01-04 23:56:13 +08:00
|
|
|
goto fail;
|
2021-04-27 14:47:46 +08:00
|
|
|
if (!nvme_get_ns(ns))
|
2018-01-04 23:56:13 +08:00
|
|
|
goto fail;
|
|
|
|
if (!try_module_get(ns->ctrl->ops->module))
|
|
|
|
goto fail_put_ns;
|
|
|
|
|
2017-10-18 19:22:00 +08:00
|
|
|
return 0;
|
2018-01-04 23:56:13 +08:00
|
|
|
|
|
|
|
fail_put_ns:
|
|
|
|
nvme_put_ns(ns);
|
|
|
|
fail:
|
|
|
|
return -ENXIO;
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 20:36:47 +08:00
|
|
|
static void nvme_ns_release(struct nvme_ns *ns)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2018-01-04 23:56:13 +08:00
|
|
|
|
|
|
|
module_put(ns->ctrl->ops->module);
|
|
|
|
nvme_put_ns(ns);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 20:36:47 +08:00
|
|
|
/*
 * Block device open handler: forwards to nvme_ns_open() with the namespace
 * stored in the disk's private_data.  @mode is not used.
 */
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	return nvme_ns_open(ns);
}
|
|
|
|
|
|
|
|
/*
 * Block device release handler: forwards to nvme_ns_release() with the
 * namespace stored in the disk's private_data.  @mode is not used.
 */
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;

	nvme_ns_release(ns);
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
|
|
|
/* some standard values */
|
|
|
|
geo->heads = 1 << 6;
|
|
|
|
geo->sectors = 1 << 5;
|
|
|
|
geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
2020-05-19 22:05:52 +08:00
|
|
|
static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
|
|
|
|
u32 max_integrity_segments)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2021-06-17 06:15:52 +08:00
|
|
|
struct blk_integrity integrity = { };
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2017-11-03 02:28:53 +08:00
|
|
|
switch (pi_type) {
|
2015-11-26 17:54:19 +08:00
|
|
|
case NVME_NS_DPS_PI_TYPE3:
|
|
|
|
integrity.profile = &t10_pi_type3_crc;
|
2016-04-09 11:04:42 +08:00
|
|
|
integrity.tag_size = sizeof(u16) + sizeof(u32);
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
2015-11-26 17:54:19 +08:00
|
|
|
break;
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
|
|
|
integrity.profile = &t10_pi_type1_crc;
|
2016-04-09 11:04:42 +08:00
|
|
|
integrity.tag_size = sizeof(u16);
|
|
|
|
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
|
2015-11-26 17:54:19 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
integrity.profile = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2017-11-03 02:28:53 +08:00
|
|
|
integrity.tuple_size = ms;
|
|
|
|
blk_integrity_register(disk, &integrity);
|
2020-05-19 22:05:52 +08:00
|
|
|
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
#else
|
2020-05-19 22:05:52 +08:00
|
|
|
static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
|
|
|
|
u32 max_integrity_segments)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
|
|
|
|
2019-03-14 01:55:07 +08:00
|
|
|
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2018-05-03 01:06:54 +08:00
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
2019-03-14 01:55:07 +08:00
|
|
|
struct request_queue *queue = disk->queue;
|
2017-11-03 02:28:54 +08:00
|
|
|
u32 size = queue_logical_block_size(queue);
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
if (ctrl->max_discard_sectors == 0) {
|
2018-05-03 01:06:54 +08:00
|
|
|
blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctrl->nr_streams && ns->sws && ns->sgs)
|
|
|
|
size *= ns->sws * ns->sgs;
|
2016-03-05 04:15:17 +08:00
|
|
|
|
2017-02-08 21:46:50 +08:00
|
|
|
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
|
|
|
|
NVME_DSM_MAX_RANGES);
|
|
|
|
|
2017-11-24 23:30:53 +08:00
|
|
|
queue->limits.discard_alignment = 0;
|
2017-11-03 02:28:54 +08:00
|
|
|
queue->limits.discard_granularity = size;
|
2017-06-28 02:03:06 +08:00
|
|
|
|
2018-05-03 01:06:54 +08:00
|
|
|
/* If discard is already enabled, don't reset queue limits */
|
|
|
|
if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
|
|
|
|
return;
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
|
|
|
|
blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
|
2017-04-06 01:21:13 +08:00
|
|
|
|
|
|
|
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
|
2017-11-03 02:28:54 +08:00
|
|
|
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
|
|
|
|
{
|
|
|
|
return uuid_equal(&a->uuid, &b->uuid) &&
|
|
|
|
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
|
2020-06-30 03:06:39 +08:00
|
|
|
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
|
|
|
|
a->csi == b->csi;
|
2017-11-09 20:50:16 +08:00
|
|
|
}
|
|
|
|
|
2020-04-10 00:09:08 +08:00
|
|
|
/*
 * Read the streams directive parameters for @ns and derive block size hints
 * from them.
 *
 * ns->sws and ns->sgs are updated from the returned parameters.  When sws is
 * non-zero, *@phys_bs becomes sws logical blocks, and when sgs is non-zero as
 * well, *@io_opt becomes that value times sgs.  Both outputs are left alone
 * otherwise.
 *
 * Returns 0 when streams are not in use or on success, else the value
 * returned by nvme_get_stream_params().
 */
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
				 u32 *phys_bs, u32 *io_opt)
{
	struct streams_directive_params s;
	int ret;

	if (!ctrl->nr_streams)
		return 0;

	ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
	if (ret)
		return ret;

	ns->sws = le32_to_cpu(s.sws);
	ns->sgs = le16_to_cpu(s.sgs);

	if (!ns->sws)
		return 0;

	*phys_bs = ns->sws * (1 << ns->lba_shift);
	if (ns->sgs)
		*io_opt = *phys_bs * ns->sgs;

	return 0;
}
|
|
|
|
|
2020-09-25 13:19:13 +08:00
|
|
|
static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The PI implementation requires the metadata size to be equal to the
|
|
|
|
* t10 pi tuple size.
|
|
|
|
*/
|
|
|
|
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
|
|
|
|
if (ns->ms == sizeof(struct t10_pi_tuple))
|
|
|
|
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
|
|
|
|
else
|
|
|
|
ns->pi_type = 0;
|
|
|
|
|
|
|
|
ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
|
|
|
|
if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
|
|
|
|
return 0;
|
|
|
|
if (ctrl->ops->flags & NVME_F_FABRICS) {
|
|
|
|
/*
|
|
|
|
* The NVMe over Fabrics specification only supports metadata as
|
|
|
|
* part of the extended data LBA. We rely on HCA/HBA support to
|
|
|
|
* remap the separate metadata buffer from the block layer.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
|
|
|
|
return -EINVAL;
|
2021-12-01 00:14:54 +08:00
|
|
|
|
|
|
|
ns->features |= NVME_NS_EXT_LBAS;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The current fabrics transport drivers support namespace
|
|
|
|
* metadata formats only if nvme_ns_has_pi() returns true.
|
|
|
|
* Suppress support for all other formats so the namespace will
|
|
|
|
* have a 0 capacity and not be usable through the block stack.
|
|
|
|
*
|
|
|
|
* Note, this check will need to be modified if any drivers
|
|
|
|
* gain the ability to use other metadata formats.
|
|
|
|
*/
|
|
|
|
if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
|
|
|
|
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
2020-09-25 13:19:13 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For PCIe controllers, we can't easily remap the separate
|
|
|
|
* metadata buffer from the block layer and thus require a
|
|
|
|
* separate metadata buffer for block layer metadata/PI support.
|
|
|
|
* We allow extended LBAs for the passthrough interface, though.
|
|
|
|
*/
|
|
|
|
if (id->flbas & NVME_NS_FLBAS_META_EXT)
|
|
|
|
ns->features |= NVME_NS_EXT_LBAS;
|
|
|
|
else
|
|
|
|
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-28 18:05:28 +08:00
|
|
|
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
|
|
|
|
struct request_queue *q)
|
|
|
|
{
|
2020-10-02 02:54:32 +08:00
|
|
|
bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
|
2020-09-28 18:05:28 +08:00
|
|
|
|
|
|
|
if (ctrl->max_hw_sectors) {
|
|
|
|
u32 max_segments =
|
|
|
|
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
|
|
|
|
|
|
|
|
max_segments = min_not_zero(max_segments, ctrl->max_segments);
|
|
|
|
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
|
|
|
|
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
|
|
|
|
}
|
|
|
|
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
|
|
|
|
blk_queue_dma_alignment(q, 7);
|
|
|
|
blk_queue_write_cache(q, vwc, vwc);
|
|
|
|
}
|
|
|
|
|
2017-11-03 02:28:56 +08:00
|
|
|
static void nvme_update_disk_info(struct gendisk *disk,
|
|
|
|
struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
2019-10-21 11:40:04 +08:00
|
|
|
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
|
2017-12-20 03:24:15 +08:00
|
|
|
unsigned short bs = 1 << ns->lba_shift;
|
2020-05-14 13:56:26 +08:00
|
|
|
u32 atomic_bs, phys_bs, io_opt = 0;
|
2017-11-03 02:28:56 +08:00
|
|
|
|
2020-09-28 18:03:13 +08:00
|
|
|
/*
|
|
|
|
* The block layer can't support LBA sizes larger than the page size
|
|
|
|
* yet, so catch this early and don't allow block I/O.
|
|
|
|
*/
|
2019-03-12 06:02:25 +08:00
|
|
|
if (ns->lba_shift > PAGE_SHIFT) {
|
2020-09-28 18:03:13 +08:00
|
|
|
capacity = 0;
|
2019-03-12 06:02:25 +08:00
|
|
|
bs = (1 << 9);
|
|
|
|
}
|
2020-09-28 18:11:42 +08:00
|
|
|
|
2017-11-03 02:28:56 +08:00
|
|
|
blk_integrity_unregister(disk);
|
|
|
|
|
2020-05-14 13:56:26 +08:00
|
|
|
atomic_bs = phys_bs = bs;
|
2020-04-10 00:09:08 +08:00
|
|
|
nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
|
2019-06-29 00:53:31 +08:00
|
|
|
if (id->nabo == 0) {
|
|
|
|
/*
|
|
|
|
* Bit 1 indicates whether NAWUPF is defined for this namespace
|
|
|
|
* and whether it should be used instead of AWUPF. If NAWUPF ==
|
|
|
|
* 0 then AWUPF must be used instead.
|
|
|
|
*/
|
2020-04-04 01:53:46 +08:00
|
|
|
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
|
2019-06-29 00:53:31 +08:00
|
|
|
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
|
|
|
|
else
|
|
|
|
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
|
|
|
|
}
|
2020-04-10 00:09:08 +08:00
|
|
|
|
2020-04-04 01:53:46 +08:00
|
|
|
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
|
2019-06-29 00:53:31 +08:00
|
|
|
/* NPWG = Namespace Preferred Write Granularity */
|
2020-04-10 00:09:08 +08:00
|
|
|
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
|
2019-06-29 00:53:31 +08:00
|
|
|
/* NOWS = Namespace Optimal Write Size */
|
2020-04-10 00:09:08 +08:00
|
|
|
io_opt = bs * (1 + le16_to_cpu(id->nows));
|
2019-06-29 00:53:31 +08:00
|
|
|
}
|
|
|
|
|
2017-12-20 03:24:15 +08:00
|
|
|
blk_queue_logical_block_size(disk->queue, bs);
|
2019-06-29 00:53:31 +08:00
|
|
|
/*
|
|
|
|
* Linux filesystems assume writing a single physical block is
|
|
|
|
* an atomic operation. Hence limit the physical block size to the
|
|
|
|
* value of the Atomic Write Unit Power Fail parameter.
|
|
|
|
*/
|
|
|
|
blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
|
|
|
|
blk_queue_io_min(disk->queue, phys_bs);
|
|
|
|
blk_queue_io_opt(disk->queue, io_opt);
|
2017-12-20 03:24:15 +08:00
|
|
|
|
2020-05-19 22:05:50 +08:00
|
|
|
/*
|
|
|
|
* Register a metadata profile for PI, or the plain non-integrity NVMe
|
|
|
|
* metadata masquerading as Type 0 if supported, otherwise reject block
|
|
|
|
* I/O to namespaces with metadata except when the namespace supports
|
|
|
|
* PI, as it can strip/insert in that case.
|
|
|
|
*/
|
|
|
|
if (ns->ms) {
|
|
|
|
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
|
|
|
|
(ns->features & NVME_NS_METADATA_SUPPORTED))
|
2020-05-19 22:05:52 +08:00
|
|
|
nvme_init_integrity(disk, ns->ms, ns->pi_type,
|
|
|
|
ns->ctrl->max_integrity_segments);
|
2020-05-19 22:05:50 +08:00
|
|
|
else if (!nvme_ns_has_pi(ns))
|
|
|
|
capacity = 0;
|
|
|
|
}
|
|
|
|
|
2020-11-16 22:56:56 +08:00
|
|
|
set_capacity_and_notify(disk, capacity);
|
2019-03-14 01:55:06 +08:00
|
|
|
|
2019-03-14 01:55:07 +08:00
|
|
|
nvme_config_discard(disk, ns);
|
2021-03-25 07:18:05 +08:00
|
|
|
blk_queue_max_write_zeroes_sectors(disk->queue,
|
|
|
|
ns->ctrl->max_zeroes_sectors);
|
2018-08-08 14:01:06 +08:00
|
|
|
|
2021-01-09 18:42:54 +08:00
|
|
|
set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
|
|
|
|
test_bit(NVME_NS_FORCE_RO, &ns->flags));
|
2017-11-03 02:28:56 +08:00
|
|
|
}
|
|
|
|
|
2020-08-28 01:38:57 +08:00
|
|
|
static inline bool nvme_first_scan(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
/* nvme_alloc_ns() scans the disk prior to adding it */
|
2021-08-09 14:40:28 +08:00
|
|
|
return !disk_live(disk);
|
2020-08-28 01:38:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = ns->ctrl;
|
|
|
|
u32 iob;
|
|
|
|
|
|
|
|
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
|
|
|
|
is_power_of_2(ctrl->max_hw_sectors))
|
|
|
|
iob = ctrl->max_hw_sectors;
|
|
|
|
else
|
|
|
|
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
|
|
|
|
|
|
|
|
if (!iob)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!is_power_of_2(iob)) {
|
|
|
|
if (nvme_first_scan(ns->disk))
|
|
|
|
pr_warn("%s: ignoring unaligned IO boundary:%u\n",
|
|
|
|
ns->disk->disk_name, iob);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (blk_queue_is_zoned(ns->disk->queue)) {
|
|
|
|
if (nvme_first_scan(ns->disk))
|
|
|
|
pr_warn("%s: ignoring zoned namespace IO boundary\n",
|
|
|
|
ns->disk->disk_name);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
blk_queue_chunk_sectors(ns->queue, iob);
|
|
|
|
}
|
|
|
|
|
2020-09-28 18:14:20 +08:00
|
|
|
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
|
2016-09-16 20:25:04 +08:00
|
|
|
{
|
2020-06-30 03:06:41 +08:00
|
|
|
unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
|
|
|
|
int ret;
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_freeze_queue(ns->disk->queue);
|
2020-06-30 03:06:41 +08:00
|
|
|
ns->lba_shift = id->lbaf[lbaf].ds;
|
2020-09-28 20:07:56 +08:00
|
|
|
nvme_set_queue_limits(ns->ctrl, ns->queue);
|
2020-04-10 00:09:06 +08:00
|
|
|
|
2021-01-28 12:47:27 +08:00
|
|
|
ret = nvme_configure_metadata(ns, id);
|
|
|
|
if (ret)
|
|
|
|
goto out_unfreeze;
|
|
|
|
nvme_set_chunk_sectors(ns, id);
|
|
|
|
nvme_update_disk_info(ns->disk, ns, id);
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ns->head->ids.csi == NVME_CSI_ZNS) {
|
2020-08-20 20:02:18 +08:00
|
|
|
ret = nvme_update_zone_info(ns, lbaf);
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ret)
|
2020-09-28 18:11:42 +08:00
|
|
|
goto out_unfreeze;
|
2020-06-30 03:06:39 +08:00
|
|
|
}
|
|
|
|
|
2021-08-24 22:57:42 +08:00
|
|
|
set_bit(NVME_NS_READY, &ns->flags);
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_unfreeze_queue(ns->disk->queue);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2020-09-28 18:30:16 +08:00
|
|
|
if (blk_queue_is_zoned(ns->queue)) {
|
|
|
|
ret = nvme_revalidate_zones(ns);
|
2020-10-24 03:16:28 +08:00
|
|
|
if (ret && !nvme_first_scan(ns->disk))
|
2021-04-07 21:03:16 +08:00
|
|
|
goto out;
|
2020-05-19 22:05:50 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 23:49:29 +08:00
|
|
|
if (nvme_ns_head_multipath(ns->head)) {
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_freeze_queue(ns->head->disk->queue);
|
2017-11-02 19:59:30 +08:00
|
|
|
nvme_update_disk_info(ns->head->disk, ns, id);
|
2021-08-24 22:57:42 +08:00
|
|
|
nvme_mpath_revalidate_paths(ns);
|
2020-07-20 14:12:51 +08:00
|
|
|
blk_stack_limits(&ns->head->disk->queue->limits,
|
|
|
|
&ns->queue->limits, 0);
|
2021-08-09 22:17:41 +08:00
|
|
|
disk_update_readahead(ns->head->disk);
|
2020-09-28 18:11:42 +08:00
|
|
|
blk_mq_unfreeze_queue(ns->head->disk->queue);
|
2018-11-03 02:22:13 +08:00
|
|
|
}
|
2020-05-19 22:05:53 +08:00
|
|
|
return 0;
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2020-09-28 18:11:42 +08:00
|
|
|
out_unfreeze:
|
|
|
|
blk_mq_unfreeze_queue(ns->disk->queue);
|
2021-04-07 21:03:16 +08:00
|
|
|
out:
|
|
|
|
/*
|
|
|
|
* If probing fails due an unsupported feature, hide the block device,
|
|
|
|
* but still allow other access.
|
|
|
|
*/
|
|
|
|
if (ret == -ENODEV) {
|
|
|
|
ns->disk->flags |= GENHD_FL_HIDDEN;
|
|
|
|
ret = 0;
|
|
|
|
}
|
2020-06-30 03:06:41 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
static char nvme_pr_type(enum pr_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case PR_WRITE_EXCLUSIVE:
|
|
|
|
return 1;
|
|
|
|
case PR_EXCLUSIVE_ACCESS:
|
|
|
|
return 2;
|
|
|
|
case PR_WRITE_EXCLUSIVE_REG_ONLY:
|
|
|
|
return 3;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_REG_ONLY:
|
|
|
|
return 4;
|
|
|
|
case PR_WRITE_EXCLUSIVE_ALL_REGS:
|
|
|
|
return 5;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_ALL_REGS:
|
|
|
|
return 6;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
2022-01-19 15:49:54 +08:00
|
|
|
}
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2021-05-19 15:22:35 +08:00
|
|
|
/*
 * Send reservation command @c through a multipath head device.
 *
 * A path is looked up under the head's SRCU read lock; the command's
 * namespace ID is filled in from that path and the command is submitted
 * synchronously with the 16-byte @data payload.  Returns -EWOULDBLOCK when
 * no path is found, otherwise the submission result.
 */
static int nvme_send_ns_head_pr_command(struct block_device *bdev,
		struct nvme_command *c, u8 data[16])
{
	struct nvme_ns_head *head = bdev->bd_disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx;
	int ret;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (!ns) {
		ret = -EWOULDBLOCK;
		goto out_unlock;
	}

	c->common.nsid = cpu_to_le32(ns->head->ns_id);
	ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);

out_unlock:
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
|
|
|
|
|
|
|
|
static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
|
|
|
|
u8 data[16])
|
|
|
|
{
|
|
|
|
c->common.nsid = cpu_to_le32(ns->head->ns_id);
|
|
|
|
return nvme_submit_sync_cmd(ns->queue, c, data, 16);
|
|
|
|
}
|
|
|
|
|
2015-11-26 17:54:19 +08:00
|
|
|
/*
 * Build and send a reservation command.
 *
 * @cdw10 and the opcode @op go into the command; @key and @sa_key are
 * stored little-endian in bytes 0-7 and 8-15 of the 16-byte data payload.
 * When multipath support is built in and @bdev is a namespace head device,
 * the command goes through the head; otherwise it is sent to the namespace
 * stored in the disk's private data.
 */
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
		u64 key, u64 sa_key, u8 op)
{
	struct gendisk *disk = bdev->bd_disk;
	struct nvme_command c = { };
	u8 data[16] = { 0, };

	c.common.opcode = op;
	c.common.cdw10 = cpu_to_le32(cdw10);

	put_unaligned_le64(key, &data[0]);
	put_unaligned_le64(sa_key, &data[8]);

	if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
	    disk->fops == &nvme_ns_head_ops)
		return nvme_send_ns_head_pr_command(bdev, &c, data);

	return nvme_send_ns_pr_command(disk->private_data, &c, data);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops ->pr_register: register @new as reservation key, replacing @old
 * when @old is non-zero.
 *
 * Only PR_FL_IGNORE_KEY is accepted in @flags; anything else yields
 * -EOPNOTSUPP.  Otherwise returns the result of the Reservation Register
 * command.
 */
static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	/* Action 2 when an existing key is supplied, 0 otherwise. */
	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	/*
	 * Use unsigned constants: shifting a signed 1 into bit 31 is
	 * undefined behaviour in ISO C.
	 */
	cdw10 |= (1U << 30) | (1U << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops ->pr_reserve: acquire a reservation of @type using @key.
 *
 * Only PR_FL_IGNORE_KEY is accepted in @flags; anything else yields
 * -EOPNOTSUPP.  Otherwise returns the result of the Reservation Acquire
 * command.
 */
static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10 = 0;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	if (flags & PR_FL_IGNORE_KEY)
		cdw10 |= 1 << 3;
	cdw10 |= nvme_pr_type(type) << 8;

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops ->pr_preempt: preempt the reservation held under key @new using
 * our key @old, taking a reservation of @type.  The acquire action is 2
 * when @abort is set and 1 otherwise.  Returns the result of the
 * Reservation Acquire command.
 */
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	u32 action = abort ? 2 : 1;
	u32 cdw10 = nvme_pr_type(type) << 8 | action;

	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops ->pr_clear: clear the reservation and all registrations.
 *
 * Clear is action 1 of the Reservation Release command; action 1 of
 * Reservation Register would merely unregister the caller's key.  The
 * ignore-existing-key bit (bit 3) is set only when no @key is supplied, so
 * a key passed by the caller is actually checked by the controller.
 *
 * Returns the result of the Reservation Release command.
 */
static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1 | (key ? 0 : 1 << 3);

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops ->pr_release: release the reservation of @type held under @key.
 *
 * The ignore-existing-key bit (bit 3) is set only when no @key is supplied,
 * so a key passed by the caller is actually checked by the controller
 * instead of being ignored.
 *
 * Returns the result of the Reservation Release command.
 */
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}
|
|
|
|
|
2021-04-07 20:22:12 +08:00
|
|
|
const struct pr_ops nvme_pr_ops = {
|
2015-11-26 17:54:19 +08:00
|
|
|
.pr_register = nvme_pr_register,
|
|
|
|
.pr_reserve = nvme_pr_reserve,
|
|
|
|
.pr_release = nvme_pr_release,
|
|
|
|
.pr_preempt = nvme_pr_preempt,
|
|
|
|
.pr_clear = nvme_pr_clear,
|
|
|
|
};
|
|
|
|
|
2017-02-04 03:50:32 +08:00
|
|
|
#ifdef CONFIG_BLK_SED_OPAL
|
2017-02-17 20:59:39 +08:00
|
|
|
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
|
|
|
|
bool send)
|
2017-02-04 03:50:32 +08:00
|
|
|
{
|
2017-02-17 20:59:39 +08:00
|
|
|
struct nvme_ctrl *ctrl = data;
|
2021-06-17 06:15:52 +08:00
|
|
|
struct nvme_command cmd = { };
|
2017-02-04 03:50:32 +08:00
|
|
|
|
|
|
|
if (send)
|
|
|
|
cmd.common.opcode = nvme_admin_security_send;
|
|
|
|
else
|
|
|
|
cmd.common.opcode = nvme_admin_security_recv;
|
|
|
|
cmd.common.nsid = 0;
|
2018-12-13 07:11:37 +08:00
|
|
|
cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
|
|
|
|
cmd.common.cdw11 = cpu_to_le32(len);
|
2017-02-04 03:50:32 +08:00
|
|
|
|
2020-11-10 08:33:45 +08:00
|
|
|
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
|
2021-06-11 05:44:35 +08:00
|
|
|
NVME_QID_ANY, 1, 0);
|
2017-02-04 03:50:32 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_sec_submit);
|
|
|
|
#endif /* CONFIG_BLK_SED_OPAL */
|
|
|
|
|
2021-05-19 15:17:06 +08:00
|
|
|
#ifdef CONFIG_BLK_DEV_ZONED
|
|
|
|
/*
 * block_device_operations ->report_zones: forward the request to the
 * namespace stored in the disk's private data.
 */
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns *ns = disk->private_data;

	return nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
}
|
|
|
|
#else
|
|
|
|
#define nvme_report_zones NULL
|
|
|
|
#endif /* CONFIG_BLK_DEV_ZONED */
|
|
|
|
|
2020-12-01 20:56:09 +08:00
|
|
|
static const struct block_device_operations nvme_bdev_ops = {
|
2015-11-26 17:54:19 +08:00
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.ioctl = nvme_ioctl,
|
|
|
|
.open = nvme_open,
|
|
|
|
.release = nvme_release,
|
|
|
|
.getgeo = nvme_getgeo,
|
2020-06-30 03:06:41 +08:00
|
|
|
.report_zones = nvme_report_zones,
|
2015-11-26 17:54:19 +08:00
|
|
|
.pr_ops = &nvme_pr_ops,
|
|
|
|
};
|
|
|
|
|
2015-11-28 22:03:49 +08:00
|
|
|
/*
 * Poll CSTS until the RDY bit matches @enabled.
 *
 * The deadline is (NVME_CAP_TIMEOUT(@cap) + 1) * HZ / 2 jiffies from now,
 * and CSTS is re-read after sleeping 1-2 ms between polls.
 *
 * Returns 0 once RDY has the wanted value, the register read error if one
 * occurs, -ENODEV when CSTS reads as all ones or the deadline passes, and
 * -EINTR when a fatal signal is pending.
 */
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 wanted = enabled ? NVME_CSTS_RDY : 0;
	u32 csts;
	int ret;

	for (;;) {
		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts);
		if (ret)
			return ret;

		if (csts == ~0)
			return -ENODEV;
		if ((csts & NVME_CSTS_RDY) == wanted)
			return 0;

		usleep_range(1000, 2000);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s, CSTS=0x%x\n",
				enabled ? "initialisation" : "reset", csts);
			return -ENODEV;
		}
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the device has been passed off to us in an enabled state, just clear
|
|
|
|
* the enabled bit. The spec says we should set the 'shutdown notification
|
|
|
|
* bits', but doing so may cause the device to complete commands to the
|
|
|
|
* admin queue ... and we don't know what memory that might be pointing at!
|
|
|
|
*/
|
2019-07-23 08:06:54 +08:00
|
|
|
int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
|
2015-11-28 22:03:49 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
|
|
|
|
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
|
|
|
|
|
|
|
|
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2016-06-15 05:22:41 +08:00
|
|
|
|
nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time too
Commit 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter
readiness") introduced a quirk to adapters that cannot read the bit
NVME_CSTS_RDY right after register NVME_REG_CC is set; these adapters
need a delay or else the action of reading the bit NVME_CSTS_RDY could
somehow corrupt adapter's registers state and it never recovers.
When this quirk was added, we checked ctrl->tagset in order to avoid
quirking in probe time, supposing we would never require such delay
during probe. Well, it was too optimistic; we in fact need this quirk
at probe time in some cases, like after a kexec.
In some experiments, after abnormal shutdown of machine (aka power cord
unplug), we booted into our bootloader in Power, which is a Linux kernel,
and kexec'ed into another distro. If this kexec is too quick, we end up
reaching the probe of NVMe adapter in that distro when adapter is in
bad state (not fully initialized on our bootloader). What happens next
is that nvme_wait_ready() is unable to complete, except if the quirk is
enabled.
So, this patch removes the original ctrl->tagset verification in order
to enable the quirk even on probe time.
Fixes: 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness")
Reported-by: Andrew Byrne <byrneadw@ie.ibm.com>
Reported-by: Jaime A. H. Gomez <jahgomez@mx1.ibm.com>
Reported-by: Zachary D. Myers <zdmyers@us.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Acked-by: Jeffrey Lien <Jeff.Lien@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2016-12-29 08:13:15 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
|
2016-06-15 05:22:41 +08:00
|
|
|
msleep(NVME_QUIRK_DELAY_AMOUNT);
|
|
|
|
|
2019-07-23 08:06:54 +08:00
|
|
|
return nvme_wait_ready(ctrl, ctrl->cap, false);
|
2015-11-28 22:03:49 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
|
2015-11-28 22:03:49 +08:00
|
|
|
|
2019-07-23 08:06:53 +08:00
|
|
|
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
|
2015-11-28 22:03:49 +08:00
|
|
|
{
|
2020-07-17 08:51:37 +08:00
|
|
|
unsigned dev_page_min;
|
2015-11-28 22:03:49 +08:00
|
|
|
int ret;
|
|
|
|
|
2019-07-23 08:06:53 +08:00
|
|
|
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
|
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
|
|
|
|
2020-07-17 08:51:37 +08:00
|
|
|
if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
|
2016-02-10 23:51:15 +08:00
|
|
|
dev_err(ctrl->device,
|
2015-11-28 22:03:49 +08:00
|
|
|
"Minimum device page size %u too large for host (%u)\n",
|
2020-07-17 08:51:37 +08:00
|
|
|
1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
|
2015-11-28 22:03:49 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
2020-06-30 03:06:39 +08:00
|
|
|
if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
|
|
|
|
ctrl->ctrl_config = NVME_CC_CSS_CSI;
|
|
|
|
else
|
|
|
|
ctrl->ctrl_config = NVME_CC_CSS_NVM;
|
2020-07-17 08:51:37 +08:00
|
|
|
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
|
2017-08-14 00:21:07 +08:00
|
|
|
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
|
2015-11-28 22:03:49 +08:00
|
|
|
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
|
|
|
|
ctrl->ctrl_config |= NVME_CC_ENABLE;
|
|
|
|
|
|
|
|
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2019-07-23 08:06:53 +08:00
|
|
|
return nvme_wait_ready(ctrl, ctrl->cap, true);
|
2015-11-28 22:03:49 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
|
2015-11-28 22:03:49 +08:00
|
|
|
|
|
|
|
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2017-08-26 07:14:50 +08:00
|
|
|
unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
|
2015-11-28 22:03:49 +08:00
|
|
|
u32 csts;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
|
|
|
|
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
|
|
|
|
|
|
|
|
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
|
|
|
|
if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
|
|
|
|
break;
|
|
|
|
|
|
|
|
msleep(100);
|
|
|
|
if (fatal_signal_pending(current))
|
|
|
|
return -EINTR;
|
|
|
|
if (time_after(jiffies, timeout)) {
|
2016-02-10 23:51:15 +08:00
|
|
|
dev_err(ctrl->device,
|
2015-11-28 22:03:49 +08:00
|
|
|
"Device shutdown incomplete; abort shutdown\n");
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
|
2015-11-28 22:03:49 +08:00
|
|
|
|
2017-08-16 15:51:29 +08:00
|
|
|
static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
__le64 ts;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
|
|
|
|
ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
|
|
|
|
NULL);
|
|
|
|
if (ret)
|
|
|
|
dev_warn_once(ctrl->device,
|
|
|
|
"could not set timestamp (%d)\n", ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-11-28 00:40:57 +08:00
|
|
|
static int nvme_configure_acre(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_feat_host_behavior *host;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Don't bother enabling the feature if retry delay is not reported */
|
|
|
|
if (!ctrl->crdt[0])
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
host = kzalloc(sizeof(*host), GFP_KERNEL);
|
|
|
|
if (!host)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
host->acre = NVME_ENABLE_ACRE;
|
|
|
|
ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
|
|
|
|
host, sizeof(*host), NULL);
|
|
|
|
kfree(host);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows to revert to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
/*
|
|
|
|
* The function checks whether the given total (exlat + enlat) latency of
|
|
|
|
* a power state allows the latter to be used as an APST transition target.
|
|
|
|
* It does so by comparing the latency to the primary and secondary latency
|
|
|
|
* tolerances defined by module params. If there's a match, the corresponding
|
|
|
|
* timeout value is returned and the matching tolerance index (1 or 2) is
|
|
|
|
* reported.
|
|
|
|
*/
|
|
|
|
static bool nvme_apst_get_transition_time(u64 total_latency,
|
|
|
|
u64 *transition_time, unsigned *last_index)
|
|
|
|
{
|
|
|
|
if (total_latency <= apst_primary_latency_tol_us) {
|
|
|
|
if (*last_index == 1)
|
|
|
|
return false;
|
|
|
|
*last_index = 1;
|
|
|
|
*transition_time = apst_primary_timeout_ms;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (apst_secondary_timeout_ms &&
|
|
|
|
total_latency <= apst_secondary_latency_tol_us) {
|
|
|
|
if (*last_index <= 2)
|
|
|
|
return false;
|
|
|
|
*last_index = 2;
|
|
|
|
*transition_time = apst_secondary_timeout_ms;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
/*
|
|
|
|
* APST (Autonomous Power State Transition) lets us program a table of power
|
|
|
|
* state transitions that the controller will perform automatically.
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows to revert to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
*
|
|
|
|
* Depending on module params, one of the two supported techniques will be used:
|
|
|
|
*
|
|
|
|
* - If the parameters provide explicit timeouts and tolerances, they will be
|
|
|
|
* used to build a table with up to 2 non-operational states to transition to.
|
|
|
|
* The default parameter values were selected based on the values used by
|
|
|
|
* Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
|
|
|
|
* regeneration of the APST table in the event of switching between external
|
|
|
|
* and battery power, the timeouts and tolerances reflect a compromise
|
|
|
|
* between values used by Microsoft for AC and battery scenarios.
|
|
|
|
* - If not, we'll configure the table with a simple heuristic: we are willing
|
|
|
|
* to spend at most 2% of the time transitioning between power states.
|
|
|
|
* Therefore, when running in any given state, we will enter the next
|
|
|
|
* lower-power non-operational state after waiting 50 * (enlat + exlat)
|
|
|
|
* microseconds, as long as that state's exit latency is under the requested
|
|
|
|
* maximum latency.
|
2021-04-09 14:47:44 +08:00
|
|
|
*
|
|
|
|
* We will not autonomously enter any non-operational state for which the total
|
|
|
|
* latency exceeds ps_max_latency_us.
|
|
|
|
*
|
|
|
|
* Users can set ps_max_latency_us to zero to turn off APST.
|
|
|
|
*/
|
2017-08-10 17:23:31 +08:00
|
|
|
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
{
|
|
|
|
struct nvme_feat_auto_pst *table;
|
2021-04-09 14:47:44 +08:00
|
|
|
unsigned apste = 0;
|
2017-04-22 07:19:23 +08:00
|
|
|
u64 max_lat_us = 0;
|
2021-04-09 14:47:44 +08:00
|
|
|
__le64 target = 0;
|
2017-04-22 07:19:23 +08:00
|
|
|
int max_ps = -1;
|
2021-04-09 14:47:44 +08:00
|
|
|
int state;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
int ret;
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows reverting to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4, 5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to a longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
unsigned last_lt_index = UINT_MAX;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If APST isn't supported or if we haven't been initialized yet,
|
|
|
|
* then don't do anything.
|
|
|
|
*/
|
|
|
|
if (!ctrl->apsta)
|
2017-08-10 17:23:31 +08:00
|
|
|
return 0;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
|
|
|
if (ctrl->npss > 31) {
|
|
|
|
dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
|
2017-08-10 17:23:31 +08:00
|
|
|
return 0;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
table = kzalloc(sizeof(*table), GFP_KERNEL);
|
|
|
|
if (!table)
|
2017-08-10 17:23:31 +08:00
|
|
|
return 0;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
2017-06-27 04:39:54 +08:00
|
|
|
if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
/* Turn off APST. */
|
2017-04-22 07:19:23 +08:00
|
|
|
dev_dbg(ctrl->device, "APST disabled\n");
|
2021-04-09 14:47:44 +08:00
|
|
|
goto done;
|
|
|
|
}
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
/*
|
|
|
|
* Walk through all states from lowest- to highest-power.
|
|
|
|
* According to the spec, lower-numbered states use more power. NPSS,
|
|
|
|
* despite the name, is the index of the lowest-power state, not the
|
|
|
|
* number of states.
|
|
|
|
*/
|
|
|
|
for (state = (int)ctrl->npss; state >= 0; state--) {
|
|
|
|
u64 total_latency_us, exit_latency_us, transition_ms;
|
2017-06-07 15:25:42 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
if (target)
|
|
|
|
table->entries[state] = target;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
|
|
|
/*
|
2021-04-09 14:47:44 +08:00
|
|
|
* Don't allow transitions to the deepest state if it's quirked
|
|
|
|
* off.
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
*/
|
2021-04-09 14:47:44 +08:00
|
|
|
if (state == ctrl->npss &&
|
|
|
|
(ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
|
|
|
|
continue;
|
2017-04-22 07:19:23 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
/*
|
|
|
|
* Is this state a useful non-operational state for higher-power
|
|
|
|
* states to autonomously transition to?
|
|
|
|
*/
|
|
|
|
if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
|
|
|
|
continue;
|
2017-04-22 07:19:23 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
|
|
|
|
if (exit_latency_us > ctrl->ps_max_latency_us)
|
|
|
|
continue;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
total_latency_us = exit_latency_us +
|
|
|
|
le32_to_cpu(ctrl->psd[state].entry_lat);
|
2017-04-22 07:19:23 +08:00
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
/*
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows reverting to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
* This state is good. It can be used as the APST idle target
|
|
|
|
* for higher power states.
|
2021-04-09 14:47:44 +08:00
|
|
|
*/
|
nvme: extend and modify the APST configuration algorithm
The algorithm that was used until now for building the APST configuration
table has been found to produce entries with excessively long ITPT
(idle time prior to transition) for devices declaring relatively long
entry and exit latencies for non-operational power states. This leads
to unnecessary waste of power and, as a result, failure to pass
mandatory power consumption tests on Chromebook platforms.
The new algorithm is based on two predefined ITPT values and two
predefined latency tolerances. Based on these values, as well as on
exit and entry latencies reported by the device, the algorithm looks
for up to 2 suitable non-operational power states to use as primary
and secondary APST transition targets. The predefined values are
supplied to the nvme driver as module parameters:
- apst_primary_timeout_ms (default: 100)
- apst_secondary_timeout_ms (default: 2000)
- apst_primary_latency_tol_us (default: 15000)
- apst_secondary_latency_tol_us (default: 100000)
The algorithm echoes the approach used by Intel's and Microsoft's drivers
on Windows. The specific default parameter values are also based on those
drivers. Yet, this patch doesn't introduce the ability to dynamically
regenerate the APST table in the event of switching the power source from
AC to battery and back. Adding this functionality may be considered in the
future. In the meantime, the timeouts and tolerances reflect a compromise
between values used by Microsoft for AC and battery scenarios.
In most NVMe devices the new algorithm causes them to implement a more
aggressive power saving policy. While beneficial in most cases, this
sometimes comes at the price of a higher IO processing latency in certain
scenarios as well as at the price of a potential impact on the drive's
endurance (due to more frequent context saving when entering deep non-
operational states). So in order to provide a fallback for systems where
these regressions cannot be tolerated, the patch allows reverting to
the legacy behavior by setting either apst_primary_timeout_ms or
apst_primary_latency_tol_us parameter to 0. Eventually (and possibly after
fine tuning the default values of the module parameters) the legacy behavior
can be removed.
TESTING.
The new algorithm has been extensively tested. Initially, simulations were
used to compare APST tables generated by old and new algorithms for a wide
range of devices. After that, power consumption, performance and latencies
were measured under different workloads on devices from multiple vendors
(WD, Intel, Samsung, Hynix, Kioxia). Below is the description of the tests
and the findings.
General observations.
The effect the patch has on the APST table varies depending on the entry and
exit latencies advertised by the devices. For some devices, the effect is
negligible (e.g. Kioxia KBG40ZNS), for some significant, making the
transitions to PS3 and PS4 much quicker (e.g. WD SN530, Intel 760P), or making
the sleep deeper, PS4 rather than PS3 after a similar amount of time (e.g.
SK Hynix BC511). For some devices (e.g. Samsung PM991) the effect is mixed:
the initial transition happens after a longer idle time, but takes the device
to a lower power state.
Workflows.
In order to evaluate the patch's effect on the power consumption and latency,
7 workflows were used for each device. The workflows were designed to test
the scenarios where significant differences between the old and new behaviors
are most likely. Each workflow was tested twice: with the new and with the
old APST table generation implementation. Power consumption, performance and
latency were measured in the process. The following workflows were used:
1) Consecutive write at the maximum rate with IO depth of 2, with no pauses
2) Repeated pattern of 1000 consecutive writes of 4K packets followed by 50ms
idle time
3) Repeated pattern of 1000 consecutive writes of 4K packets followed by 150ms
idle time
4) Repeated pattern of 1000 consecutive writes of 4K packets followed by 500ms
idle time
5) Repeated pattern of 1000 consecutive writes of 4K packets followed by 1.5s
idle time
6) Repeated pattern of 1000 consecutive writes of 4K packets followed by 5s
idle time
7) Repeated pattern of a single random read of a 4K packet followed by 150ms
idle time
Power consumption
Actual power consumption measurements produced predictable results in
accordance with the APST mechanism's theory of operation.
Devices with long entry and exit latencies such as WD SN530 showed huge
improvement on scenarios 4,5 and 6 of up to 62%. Devices such as Kioxia
KBG40ZNS where the resulting APST table looks virtually identical with
both legacy and new algorithms, showed little or no change in the average power
consumption on all workflows. Devices with extra short latencies such as
Samsung PM991 showed moderate increase in power consumption of up to 18% in
worst case scenarios.
In addition, on Intel and Samsung devices a more complex impact was observed
on scenarios 3, 4 and 7. Our understanding is that due to longer stay in deep
non-operational states between the writes the devices start performing background
operations leading to an increase of power consumption. With the old APST tables
part of these operations are delayed until the scenario is over and a longer idle
period begins, but eventually this extra power is consumed anyway.
Performance.
In terms of performance measured on sustained write or read scenarios, the
effect of the patch is minimal as in this case the device doesn't enter low power
states.
Latency
As expected, in devices where the patch causes a more aggressive power saving
policy (e.g. WD SN530, Intel 760P), an increase in latency was observed in
certain scenarios. Workflow number 7, specifically designed to simulate the
worst case scenario as far as latency is concerned, indeed shows a sharp
increase in average latency (~2ms -> ~53ms on Intel 760P and 0.6 -> 10ms on
WD SN530). The latency increase on other workloads and other devices is much
milder or non-existent.
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-04-28 17:27:36 +08:00
|
|
|
if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
|
|
|
|
if (!nvme_apst_get_transition_time(total_latency_us,
|
|
|
|
&transition_ms, &last_lt_index))
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
transition_ms = total_latency_us + 19;
|
|
|
|
do_div(transition_ms, 20);
|
|
|
|
if (transition_ms > (1 << 24) - 1)
|
|
|
|
transition_ms = (1 << 24) - 1;
|
|
|
|
}
|
2021-04-09 14:47:44 +08:00
|
|
|
|
|
|
|
target = cpu_to_le64((state << 3) | (transition_ms << 8));
|
|
|
|
if (max_ps == -1)
|
|
|
|
max_ps = state;
|
|
|
|
if (total_latency_us > max_lat_us)
|
|
|
|
max_lat_us = total_latency_us;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
|
2021-04-09 14:47:44 +08:00
|
|
|
if (max_ps == -1)
|
|
|
|
dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
|
|
|
|
else
|
|
|
|
dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
|
|
|
|
max_ps, max_lat_us, (int)sizeof(*table), table);
|
|
|
|
apste = 1;
|
|
|
|
|
|
|
|
done:
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
|
|
|
|
table, sizeof(*table), NULL);
|
|
|
|
if (ret)
|
|
|
|
dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
|
|
|
|
kfree(table);
|
2017-08-10 17:23:31 +08:00
|
|
|
return ret;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose a "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Record a new power-state latency tolerance for the controller stored as
 * @dev's driver data.
 *
 * @dev: device whose drvdata is the struct nvme_ctrl to update
 * @val: requested tolerance; it is stored in ctrl->ps_max_latency_us.
 *       PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT and PM_QOS_LATENCY_ANY
 *       both map to "no limit" (U64_MAX), any other value is used as-is.
 *
 * Nothing happens when the resulting limit equals the one already stored.
 * Otherwise the new limit is saved and, if the controller is currently in
 * the NVME_CTRL_LIVE state, nvme_configure_apst() is called so the change
 * takes effect.
 */
static void nvme_set_latency_tolerance(struct device *dev, s32 val)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	u64 latency = val;

	if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT ||
	    val == PM_QOS_LATENCY_ANY)
		latency = U64_MAX;

	/* Unchanged limit: leave the controller alone. */
	if (ctrl->ps_max_latency_us == latency)
		return;

	ctrl->ps_max_latency_us = latency;

	/* Only a live controller is reconfigured here. */
	if (ctrl->state == NVME_CTRL_LIVE)
		nvme_configure_apst(ctrl);
}
|
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
struct nvme_core_quirk_entry {
|
|
|
|
/*
|
|
|
|
* NVMe model and firmware strings are padded with spaces. For
|
|
|
|
* simplicity, strings in the quirk table are padded with NULLs
|
|
|
|
* instead.
|
|
|
|
*/
|
|
|
|
u16 vid;
|
|
|
|
const char *mn;
|
|
|
|
const char *fr;
|
|
|
|
unsigned long quirks;
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct nvme_core_quirk_entry core_quirks[] = {
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
{
|
2017-04-21 04:37:56 +08:00
|
|
|
/*
|
|
|
|
* This Toshiba device seems to die using any APST states. See:
|
|
|
|
* https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
|
|
|
|
*/
|
|
|
|
.vid = 0x1179,
|
|
|
|
.mn = "THNSF5256GPUK TOSHIBA",
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
.quirks = NVME_QUIRK_NO_APST,
|
2019-08-17 04:16:19 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This LiteON CL1-3D*-Q11 firmware version has a race
|
|
|
|
* condition associated with actions related to suspend to idle
|
|
|
|
* LiteON has resolved the problem in future firmware
|
|
|
|
*/
|
|
|
|
.vid = 0x14a4,
|
|
|
|
.fr = "22301111",
|
|
|
|
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
|
2021-11-06 10:08:57 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This Kioxia CD6-V Series / HPE PE8030 device times out and
|
|
|
|
* aborts I/O during any load, but more easily reproducible
|
|
|
|
* with discards (fstrim).
|
|
|
|
*
|
|
|
|
* The device is left in a state where it is also not possible
|
|
|
|
* to use "nvme set-feature" to disable APST, but booting with
|
|
|
|
* nvme_core.default_ps_max_latency=0 works.
|
|
|
|
*/
|
|
|
|
.vid = 0x1e0f,
|
|
|
|
.mn = "KCD6XVUL6T40",
|
|
|
|
.quirks = NVME_QUIRK_NO_APST,
|
2017-04-21 04:37:56 +08:00
|
|
|
}
|
2017-02-23 04:32:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Compare a fixed-width identify string against a quirk-table pattern.
 *
 * @idstr: field of @len bytes; space-padded, not NUL-terminated
 * @match: NUL-terminated pattern, or NULL to accept any value
 * @len:   size of the @idstr field in bytes
 *
 * Returns true when @match is NULL, or when @idstr begins with @match and
 * every remaining byte up to @len is a space.  Returns false otherwise.
 */
static bool string_matches(const char *idstr, const char *match, size_t len)
{
	size_t matchlen;

	if (!match)
		return true;

	matchlen = strlen(match);

	/*
	 * A pattern longer than the field can never match, and comparing it
	 * would read past the end of @idstr.  Warn once about the bad table
	 * entry and report a mismatch instead of over-reading.
	 */
	if (WARN_ON_ONCE(matchlen > len))
		return false;

	if (memcmp(idstr, match, matchlen))
		return false;

	/* Whatever follows the pattern must be padding only. */
	for (; matchlen < len; matchlen++)
		if (idstr[matchlen] != ' ')
			return false;

	return true;
}
|
|
|
|
|
|
|
|
static bool quirk_matches(const struct nvme_id_ctrl *id,
|
|
|
|
const struct nvme_core_quirk_entry *q)
|
|
|
|
{
|
|
|
|
return q->vid == le16_to_cpu(id->vid) &&
|
|
|
|
string_matches(id->mn, q->mn, sizeof(id->mn)) &&
|
|
|
|
string_matches(id->fr, q->fr, sizeof(id->fr));
|
|
|
|
}
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
|
|
|
|
struct nvme_id_ctrl *id)
|
2017-06-26 18:39:02 +08:00
|
|
|
{
|
|
|
|
size_t nqnlen;
|
|
|
|
int off;
|
|
|
|
|
2019-01-09 01:20:51 +08:00
|
|
|
if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
|
|
|
|
nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
|
|
|
|
if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
|
|
|
|
strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
|
|
|
|
return;
|
|
|
|
}
|
2017-06-26 18:39:02 +08:00
|
|
|
|
2019-01-09 01:20:51 +08:00
|
|
|
if (ctrl->vs >= NVME_VS(1, 2, 1))
|
|
|
|
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
|
|
|
|
}
|
2017-06-26 18:39:02 +08:00
|
|
|
|
|
|
|
/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
|
2017-11-09 20:48:55 +08:00
|
|
|
off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
|
2019-01-09 00:37:43 +08:00
|
|
|
"nqn.2014.08.org.nvmexpress:%04x%04x",
|
2017-06-26 18:39:02 +08:00
|
|
|
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
|
2017-11-09 20:48:55 +08:00
|
|
|
memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
|
2017-06-26 18:39:02 +08:00
|
|
|
off += sizeof(id->sn);
|
2017-11-09 20:48:55 +08:00
|
|
|
memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
|
2017-06-26 18:39:02 +08:00
|
|
|
off += sizeof(id->mn);
|
2017-11-09 20:48:55 +08:00
|
|
|
memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
|
|
|
|
}
|
|
|
|
|
2019-07-19 07:53:50 +08:00
|
|
|
static void nvme_release_subsystem(struct device *dev)
|
2017-11-09 20:48:55 +08:00
|
|
|
{
|
2019-07-19 07:53:50 +08:00
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
2019-09-06 00:33:54 +08:00
|
|
|
if (subsys->instance >= 0)
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_instance_ida, subsys->instance);
|
2017-11-09 20:48:55 +08:00
|
|
|
kfree(subsys);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_destroy_subsystem(struct kref *ref)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(ref, struct nvme_subsystem, ref);
|
|
|
|
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
list_del(&subsys->entry);
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
ida_destroy(&subsys->ns_ida);
|
2017-11-09 20:48:55 +08:00
|
|
|
device_del(&subsys->dev);
|
|
|
|
put_device(&subsys->dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_put_subsystem(struct nvme_subsystem *subsys)
|
|
|
|
{
|
|
|
|
kref_put(&subsys->ref, nvme_destroy_subsystem);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys;
|
|
|
|
|
|
|
|
lockdep_assert_held(&nvme_subsystems_lock);
|
|
|
|
|
2019-09-04 05:20:37 +08:00
|
|
|
/*
|
|
|
|
* Fail matches for discovery subsystems. This results
|
|
|
|
* in each discovery controller bound to a unique subsystem.
|
|
|
|
* This avoids issues with validating controller values
|
|
|
|
* that can only be true when there is a single unique subsystem.
|
|
|
|
* There may be multiple and completely independent entities
|
|
|
|
* that provide discovery controllers.
|
|
|
|
*/
|
|
|
|
if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
|
|
|
|
return NULL;
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
list_for_each_entry(subsys, &nvme_subsystems, entry) {
|
|
|
|
if (strcmp(subsys->subnqn, subsysnqn))
|
|
|
|
continue;
|
|
|
|
if (!kref_get_unless_zero(&subsys->ref))
|
|
|
|
continue;
|
|
|
|
return subsys;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2017-11-10 17:58:23 +08:00
|
|
|
/*
 * Declare a device attribute named subsys_attr_<_name> with the given mode
 * and show handler and no store handler.
 */
#define SUBSYS_ATTR_RO(_name, _mode, _show)			\
	struct device_attribute subsys_attr_##_name =		\
		__ATTR(_name, _mode, _show, NULL)
|
|
|
|
|
|
|
|
static ssize_t nvme_subsys_show_nqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", subsys->subnqn);
|
2017-11-10 17:58:23 +08:00
|
|
|
}
|
|
|
|
static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
|
|
|
|
|
2021-09-22 14:35:23 +08:00
|
|
|
static ssize_t nvme_subsys_show_type(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys =
|
|
|
|
container_of(dev, struct nvme_subsystem, dev);
|
|
|
|
|
|
|
|
switch (subsys->subtype) {
|
|
|
|
case NVME_NQN_DISC:
|
|
|
|
return sysfs_emit(buf, "discovery\n");
|
|
|
|
case NVME_NQN_NVME:
|
|
|
|
return sysfs_emit(buf, "nvm\n");
|
|
|
|
default:
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
|
|
|
|
|
2017-11-10 17:58:23 +08:00
|
|
|
#define nvme_subsys_show_str_function(field) \
|
|
|
|
static ssize_t subsys_##field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_subsystem *subsys = \
|
|
|
|
container_of(dev, struct nvme_subsystem, dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%.*s\n", \
|
|
|
|
(int)sizeof(subsys->field), subsys->field); \
|
2017-11-10 17:58:23 +08:00
|
|
|
} \
|
|
|
|
static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
|
|
|
|
|
|
|
|
nvme_subsys_show_str_function(model);
|
|
|
|
nvme_subsys_show_str_function(serial);
|
|
|
|
nvme_subsys_show_str_function(firmware_rev);
|
|
|
|
|
|
|
|
static struct attribute *nvme_subsys_attrs[] = {
|
|
|
|
&subsys_attr_model.attr,
|
|
|
|
&subsys_attr_serial.attr,
|
|
|
|
&subsys_attr_firmware_rev.attr,
|
|
|
|
&subsys_attr_subsysnqn.attr,
|
2021-09-22 14:35:23 +08:00
|
|
|
&subsys_attr_subsystype.attr,
|
2019-02-18 18:43:26 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
&subsys_attr_iopolicy.attr,
|
|
|
|
#endif
|
2017-11-10 17:58:23 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2021-01-09 07:41:47 +08:00
|
|
|
static const struct attribute_group nvme_subsys_attrs_group = {
|
2017-11-10 17:58:23 +08:00
|
|
|
.attrs = nvme_subsys_attrs,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct attribute_group *nvme_subsys_attrs_groups[] = {
|
|
|
|
&nvme_subsys_attrs_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
return ctrl->opts && ctrl->opts->discovery_nqn;
|
|
|
|
}
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
|
|
|
|
struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
2018-01-04 23:56:14 +08:00
|
|
|
{
|
2019-05-09 15:01:26 +08:00
|
|
|
struct nvme_ctrl *tmp;
|
2018-01-04 23:56:14 +08:00
|
|
|
|
2019-05-08 15:48:27 +08:00
|
|
|
lockdep_assert_held(&nvme_subsystems_lock);
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
|
2020-03-10 22:39:10 +08:00
|
|
|
if (nvme_state_terminal(tmp))
|
2019-05-09 15:01:26 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
if (tmp->cntlid == ctrl->cntlid) {
|
|
|
|
dev_err(ctrl->device,
|
2021-11-30 00:24:34 +08:00
|
|
|
"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
|
|
|
|
ctrl->cntlid, dev_name(tmp->device),
|
|
|
|
subsys->subnqn);
|
2019-05-09 15:01:26 +08:00
|
|
|
return false;
|
|
|
|
}
|
2018-01-04 23:56:14 +08:00
|
|
|
|
2020-04-04 01:53:46 +08:00
|
|
|
if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
|
2021-01-14 08:00:22 +08:00
|
|
|
nvme_discovery_ctrl(ctrl))
|
2019-05-09 15:01:26 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Subsystem does not support multiple controllers\n");
|
|
|
|
return false;
|
2018-01-04 23:56:14 +08:00
|
|
|
}
|
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
return true;
|
2018-01-04 23:56:14 +08:00
|
|
|
}
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *subsys, *found;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
|
|
|
|
if (!subsys)
|
|
|
|
return -ENOMEM;
|
2019-09-06 00:33:54 +08:00
|
|
|
|
|
|
|
subsys->instance = -1;
|
2017-11-09 20:48:55 +08:00
|
|
|
mutex_init(&subsys->lock);
|
|
|
|
kref_init(&subsys->ref);
|
|
|
|
INIT_LIST_HEAD(&subsys->ctrls);
|
2017-11-09 20:50:43 +08:00
|
|
|
INIT_LIST_HEAD(&subsys->nsheads);
|
2017-11-09 20:48:55 +08:00
|
|
|
nvme_init_subnqn(subsys, ctrl, id);
|
|
|
|
memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
|
|
|
|
memcpy(subsys->model, id->mn, sizeof(subsys->model));
|
|
|
|
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
|
|
|
|
subsys->vendor_id = le16_to_cpu(id->vid);
|
|
|
|
subsys->cmic = id->cmic;
|
2021-09-22 14:35:23 +08:00
|
|
|
|
|
|
|
/* Versions prior to 1.4 don't necessarily report a valid type */
|
|
|
|
if (id->cntrltype == NVME_CTRL_DISC ||
|
|
|
|
!strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
|
|
|
|
subsys->subtype = NVME_NQN_DISC;
|
|
|
|
else
|
|
|
|
subsys->subtype = NVME_NQN_NVME;
|
|
|
|
|
2021-09-22 14:35:24 +08:00
|
|
|
if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Subsystem %s is not a discovery controller",
|
|
|
|
subsys->subnqn);
|
|
|
|
kfree(subsys);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2019-06-29 00:53:31 +08:00
|
|
|
subsys->awupf = le16_to_cpu(id->awupf);
|
2021-12-20 20:51:45 +08:00
|
|
|
nvme_mpath_default_iopolicy(subsys);
|
2017-11-09 20:48:55 +08:00
|
|
|
|
|
|
|
subsys->dev.class = nvme_subsys_class;
|
|
|
|
subsys->dev.release = nvme_release_subsystem;
|
2017-11-10 17:58:23 +08:00
|
|
|
subsys->dev.groups = nvme_subsys_attrs_groups;
|
2019-09-06 00:33:54 +08:00
|
|
|
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
|
2017-11-09 20:48:55 +08:00
|
|
|
device_initialize(&subsys->dev);
|
|
|
|
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
found = __nvme_find_get_subsystem(subsys->subnqn);
|
|
|
|
if (found) {
|
2019-07-19 07:53:50 +08:00
|
|
|
put_device(&subsys->dev);
|
2017-11-09 20:48:55 +08:00
|
|
|
subsys = found;
|
2019-05-08 15:48:27 +08:00
|
|
|
|
2019-05-09 15:01:26 +08:00
|
|
|
if (!nvme_validate_cntlid(subsys, ctrl, id)) {
|
2017-11-09 20:48:55 +08:00
|
|
|
ret = -EINVAL;
|
2019-05-08 15:48:27 +08:00
|
|
|
goto out_put_subsystem;
|
2017-11-09 20:48:55 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = device_add(&subsys->dev);
|
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device,
|
|
|
|
"failed to register subsystem device.\n");
|
2019-08-01 07:35:34 +08:00
|
|
|
put_device(&subsys->dev);
|
2017-11-09 20:48:55 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
ida_init(&subsys->ns_ida);
|
2017-11-09 20:48:55 +08:00
|
|
|
list_add_tail(&subsys->entry, &nvme_subsystems);
|
|
|
|
}
|
|
|
|
|
2019-09-23 22:18:36 +08:00
|
|
|
ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
|
|
|
|
dev_name(ctrl->device));
|
|
|
|
if (ret) {
|
2017-11-09 20:48:55 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"failed to create sysfs link from subsystem.\n");
|
2019-05-08 15:48:27 +08:00
|
|
|
goto out_put_subsystem;
|
2017-11-09 20:48:55 +08:00
|
|
|
}
|
|
|
|
|
2019-09-06 00:33:54 +08:00
|
|
|
if (!found)
|
|
|
|
subsys->instance = ctrl->instance;
|
2019-05-08 15:48:27 +08:00
|
|
|
ctrl->subsys = subsys;
|
2017-11-09 20:48:55 +08:00
|
|
|
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
|
2019-05-08 15:48:27 +08:00
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
2017-11-09 20:48:55 +08:00
|
|
|
return 0;
|
|
|
|
|
2019-05-08 15:48:27 +08:00
|
|
|
out_put_subsystem:
|
|
|
|
nvme_put_subsystem(subsys);
|
2017-11-09 20:48:55 +08:00
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
return ret;
|
2017-06-26 18:39:02 +08:00
|
|
|
}
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
/*
 * Build a Get Log Page admin command and submit it synchronously on the
 * admin queue.
 *
 * @nsid, @log_page, @lsp, @csi and @offset are placed into the corresponding
 * command fields; the transfer length is derived from @size via
 * nvme_bytes_to_numd() and split into its lower and upper 16 bits.  The data
 * is transferred to/from @log.
 *
 * Returns whatever nvme_submit_sync_cmd() returns.
 */
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
		void *log, size_t size, u64 offset)
{
	const u32 numd = nvme_bytes_to_numd(size);
	struct nvme_command cmd = { };

	cmd.get_log_page.opcode = nvme_admin_get_log_page;
	cmd.get_log_page.nsid = cpu_to_le32(nsid);
	cmd.get_log_page.lid = log_page;
	cmd.get_log_page.lsp = lsp;
	cmd.get_log_page.csi = csi;

	/* dword count: low half in NUMDL, high half in NUMDU */
	cmd.get_log_page.numdl = cpu_to_le16(numd & 0xffff);
	cmd.get_log_page.numdu = cpu_to_le16(numd >> 16);

	/* 64-bit log page offset, likewise split in two */
	cmd.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
	cmd.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));

	return nvme_submit_sync_cmd(ctrl->admin_q, &cmd, log, size);
}
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
/*
 * Get the command effects log for command set @csi, using the per-controller
 * cache in ctrl->cels.
 *
 * On a cache miss the log is read from the controller with nvme_get_log()
 * and stored in the cache.  On success *@log points at the cached log, which
 * remains owned by ctrl->cels.
 *
 * Returns 0 on success, -ENOMEM or another negative errno on allocation or
 * xarray failure, or the non-zero result of nvme_get_log() if reading the
 * log failed.  *@log is left untouched on failure.
 */
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
				struct nvme_effects_log **log)
{
	struct nvme_effects_log *old, *cel = xa_load(&ctrl->cels, csi);
	int ret;

	if (cel)
		goto out;

	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
	if (!cel)
		return -ENOMEM;

	ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
			cel, sizeof(*cel), 0);
	if (ret) {
		kfree(cel);
		return ret;
	}

	/*
	 * xa_store() can fail (e.g. on allocation failure).  If it does, the
	 * log is not tracked by ctrl->cels, so free it here rather than hand
	 * out a pointer to an untracked buffer.
	 */
	old = xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
	if (xa_is_err(old)) {
		kfree(cel);
		return xa_err(old);
	}
out:
	*log = cel;
	return 0;
}
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
/*
 * Convert @units, a power-of-two exponent expressed in multiples of the
 * minimum memory page size from CAP.MPSMIN, into a count of 512-byte
 * sectors.  Returns UINT_MAX if the shift would overflow a u32.
 */
static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
{
	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
	u32 shift = units + page_shift - 9;	/* 2^9 bytes per sector */
	u32 sectors;

	return check_shl_overflow(1U, shift, &sectors) ? UINT_MAX : sectors;
}
|
|
|
|
|
|
|
|
static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_command c = { };
|
|
|
|
struct nvme_id_ctrl_nvm *id;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
|
|
|
|
ctrl->max_discard_sectors = UINT_MAX;
|
|
|
|
ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
|
|
|
|
} else {
|
|
|
|
ctrl->max_discard_sectors = 0;
|
|
|
|
ctrl->max_discard_segments = 0;
|
2015-11-28 22:40:19 +08:00
|
|
|
}
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
/*
|
|
|
|
* Even though NVMe spec explicitly states that MDTS is not applicable
|
|
|
|
* to the write-zeroes, we are cautious and limit the size to the
|
|
|
|
* controllers max_hw_sectors value, which is based on the MDTS field
|
|
|
|
* and possibly other limiting factors.
|
|
|
|
*/
|
|
|
|
if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
|
|
|
|
!(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
|
|
|
|
ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
|
|
|
|
else
|
|
|
|
ctrl->max_zeroes_sectors = 0;
|
|
|
|
|
|
|
|
if (nvme_ctrl_limited_cns(ctrl))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
|
|
|
if (!id)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
c.identify.opcode = nvme_admin_identify;
|
|
|
|
c.identify.cns = NVME_ID_CNS_CS_CTRL;
|
|
|
|
c.identify.csi = NVME_CSI_NVM;
|
|
|
|
|
|
|
|
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
|
|
|
|
if (ret)
|
|
|
|
goto free_data;
|
|
|
|
|
|
|
|
if (id->dmrl)
|
|
|
|
ctrl->max_discard_segments = id->dmrl;
|
|
|
|
if (id->dmrsl)
|
|
|
|
ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl);
|
|
|
|
if (id->wzsl)
|
|
|
|
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
|
|
|
|
|
|
|
|
free_data:
|
|
|
|
kfree(id);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-03-01 10:06:05 +08:00
|
|
|
static int nvme_init_identify(struct nvme_ctrl *ctrl)
|
2015-11-28 22:37:52 +08:00
|
|
|
{
|
|
|
|
struct nvme_id_ctrl *id;
|
2016-06-07 05:20:48 +08:00
|
|
|
u32 max_hw_sectors;
|
2017-06-27 04:39:54 +08:00
|
|
|
bool prev_apst_enabled;
|
2021-03-25 07:18:05 +08:00
|
|
|
int ret;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2015-11-28 22:37:52 +08:00
|
|
|
ret = nvme_identify_ctrl(ctrl, &id);
|
|
|
|
if (ret) {
|
2016-02-10 23:51:15 +08:00
|
|
|
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
|
2015-11-28 22:37:52 +08:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2017-11-08 01:28:32 +08:00
|
|
|
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
|
2020-06-30 03:06:40 +08:00
|
|
|
ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
|
2017-11-08 01:28:32 +08:00
|
|
|
if (ret < 0)
|
2018-05-25 17:06:27 +08:00
|
|
|
goto out_free;
|
2017-11-08 01:28:32 +08:00
|
|
|
}
|
2017-06-26 18:39:02 +08:00
|
|
|
|
2019-08-14 22:26:10 +08:00
|
|
|
if (!(ctrl->ops->flags & NVME_F_FABRICS))
|
|
|
|
ctrl->cntlid = le16_to_cpu(id->cntlid);
|
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
if (!ctrl->identified) {
|
2021-03-01 10:06:05 +08:00
|
|
|
unsigned int i;
|
2017-11-09 20:48:55 +08:00
|
|
|
|
|
|
|
ret = nvme_init_subsystem(ctrl, id);
|
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
/*
|
|
|
|
* Check for quirks. Quirk can depend on firmware version,
|
|
|
|
* so, in principle, the set of quirks present can change
|
|
|
|
* across a reset. As a possible future enhancement, we
|
|
|
|
* could re-scan for quirks every time we reinitialize
|
|
|
|
* the device, but we'd have to make sure that the driver
|
|
|
|
* behaves intelligently if the quirks change.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
|
|
|
|
if (quirk_matches(id, &core_quirks[i]))
|
|
|
|
ctrl->quirks |= core_quirks[i].quirks;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-22 07:19:24 +08:00
|
|
|
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
|
2017-04-22 07:19:24 +08:00
|
|
|
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
|
|
|
|
}
|
|
|
|
|
2018-11-28 00:40:57 +08:00
|
|
|
ctrl->crdt[0] = le16_to_cpu(id->crdt1);
|
|
|
|
ctrl->crdt[1] = le16_to_cpu(id->crdt2);
|
|
|
|
ctrl->crdt[2] = le16_to_cpu(id->crdt3);
|
|
|
|
|
2017-02-17 20:59:40 +08:00
|
|
|
ctrl->oacs = le16_to_cpu(id->oacs);
|
2019-02-25 19:00:04 +08:00
|
|
|
ctrl->oncs = le16_to_cpu(id->oncs);
|
2019-05-21 01:13:04 +08:00
|
|
|
ctrl->mtfa = le16_to_cpu(id->mtfa);
|
2018-05-22 17:09:55 +08:00
|
|
|
ctrl->oaes = le32_to_cpu(id->oaes);
|
2019-11-06 22:35:18 +08:00
|
|
|
ctrl->wctemp = le16_to_cpu(id->wctemp);
|
|
|
|
ctrl->cctemp = le16_to_cpu(id->cctemp);
|
|
|
|
|
2015-11-20 16:36:44 +08:00
|
|
|
atomic_set(&ctrl->abort_limit, id->acl + 1);
|
2015-11-28 22:37:52 +08:00
|
|
|
ctrl->vwc = id->vwc;
|
|
|
|
if (id->mdts)
|
2021-03-25 07:18:05 +08:00
|
|
|
max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
|
2015-11-28 22:37:52 +08:00
|
|
|
else
|
2016-06-07 05:20:48 +08:00
|
|
|
max_hw_sectors = UINT_MAX;
|
|
|
|
ctrl->max_hw_sectors =
|
|
|
|
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2016-03-03 01:07:11 +08:00
|
|
|
nvme_set_queue_limits(ctrl, ctrl->admin_q);
|
2016-06-13 22:45:26 +08:00
|
|
|
ctrl->sgls = le32_to_cpu(id->sgls);
|
2016-06-13 22:45:28 +08:00
|
|
|
ctrl->kas = le16_to_cpu(id->kas);
|
2018-05-14 14:48:54 +08:00
|
|
|
ctrl->max_namespaces = le32_to_cpu(id->mnan);
|
2018-11-03 01:28:14 +08:00
|
|
|
ctrl->ctratt = le32_to_cpu(id->ctratt);
|
2016-06-13 22:45:26 +08:00
|
|
|
|
2022-02-09 03:33:46 +08:00
|
|
|
ctrl->cntrltype = id->cntrltype;
|
|
|
|
ctrl->dctype = id->dctype;
|
|
|
|
|
2017-08-26 07:14:50 +08:00
|
|
|
if (id->rtd3e) {
|
|
|
|
/* us -> s */
|
2020-06-24 14:49:58 +08:00
|
|
|
u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
|
2017-08-26 07:14:50 +08:00
|
|
|
|
|
|
|
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
|
|
|
|
shutdown_timeout, 60);
|
|
|
|
|
|
|
|
if (ctrl->shutdown_timeout != shutdown_timeout)
|
2017-12-31 21:33:27 +08:00
|
|
|
dev_info(ctrl->device,
|
2017-08-26 07:14:50 +08:00
|
|
|
"Shutdown timeout set to %u seconds\n",
|
|
|
|
ctrl->shutdown_timeout);
|
|
|
|
} else
|
|
|
|
ctrl->shutdown_timeout = shutdown_timeout;
|
|
|
|
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
ctrl->npss = id->npss;
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apsta = id->apsta;
|
|
|
|
prev_apst_enabled = ctrl->apst_enabled;
|
2017-04-22 07:19:24 +08:00
|
|
|
if (ctrl->quirks & NVME_QUIRK_NO_APST) {
|
|
|
|
if (force_apst && id->apsta) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = true;
|
2017-04-22 07:19:24 +08:00
|
|
|
} else {
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = false;
|
2017-04-22 07:19:24 +08:00
|
|
|
}
|
|
|
|
} else {
|
2017-06-27 04:39:54 +08:00
|
|
|
ctrl->apst_enabled = id->apsta;
|
2017-04-22 07:19:24 +08:00
|
|
|
}
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
|
|
|
|
|
2017-05-20 21:14:44 +08:00
|
|
|
if (ctrl->ops->flags & NVME_F_FABRICS) {
|
2016-06-13 22:45:26 +08:00
|
|
|
ctrl->icdoff = le16_to_cpu(id->icdoff);
|
|
|
|
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
|
|
|
|
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
|
|
|
|
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In fabrics we need to verify the cntlid matches the
|
|
|
|
* admin connect
|
|
|
|
*/
|
2017-08-10 17:23:31 +08:00
|
|
|
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
|
2019-11-22 01:58:10 +08:00
|
|
|
dev_err(ctrl->device,
|
|
|
|
"Mismatching cntlid: Connect %u vs Identify "
|
|
|
|
"%u, rejecting\n",
|
|
|
|
ctrl->cntlid, le16_to_cpu(id->cntlid));
|
2016-06-13 22:45:26 +08:00
|
|
|
ret = -EINVAL;
|
2017-08-10 17:23:31 +08:00
|
|
|
goto out_free;
|
|
|
|
}
|
2016-06-13 22:45:28 +08:00
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
|
2017-06-09 22:17:21 +08:00
|
|
|
dev_err(ctrl->device,
|
2016-06-13 22:45:28 +08:00
|
|
|
"keep-alive support is mandatory for fabrics\n");
|
|
|
|
ret = -EINVAL;
|
2017-08-10 17:23:31 +08:00
|
|
|
goto out_free;
|
2016-06-13 22:45:28 +08:00
|
|
|
}
|
2016-06-13 22:45:26 +08:00
|
|
|
} else {
|
2017-05-12 23:16:10 +08:00
|
|
|
ctrl->hmpre = le32_to_cpu(id->hmpre);
|
|
|
|
ctrl->hmmin = le32_to_cpu(id->hmmin);
|
2017-09-12 00:09:28 +08:00
|
|
|
ctrl->hmminds = le32_to_cpu(id->hmminds);
|
|
|
|
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
|
2016-06-13 22:45:26 +08:00
|
|
|
}
|
2016-03-03 01:07:11 +08:00
|
|
|
|
2021-04-29 20:18:53 +08:00
|
|
|
ret = nvme_mpath_init_identify(ctrl, id);
|
2018-05-14 14:48:54 +08:00
|
|
|
if (ret < 0)
|
2021-03-01 10:06:05 +08:00
|
|
|
goto out_free;
|
2018-05-14 14:48:54 +08:00
|
|
|
|
2017-06-27 04:39:54 +08:00
|
|
|
if (ctrl->apst_enabled && !prev_apst_enabled)
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
dev_pm_qos_expose_latency_tolerance(ctrl->device);
|
2017-06-27 04:39:54 +08:00
|
|
|
else if (!ctrl->apst_enabled && prev_apst_enabled)
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
dev_pm_qos_hide_latency_tolerance(ctrl->device);
|
|
|
|
|
2021-03-01 10:06:05 +08:00
|
|
|
out_free:
|
|
|
|
kfree(id);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Initialize the cached copies of the Identify data and various controller
 * registers in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
|
|
|
|
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
|
|
|
|
if (ret) {
|
|
|
|
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
|
|
|
|
|
|
|
if (ctrl->vs >= NVME_VS(1, 1, 0))
|
|
|
|
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
|
|
|
|
|
|
|
|
ret = nvme_init_identify(ctrl);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2021-03-25 07:18:05 +08:00
|
|
|
ret = nvme_init_non_mdts_limits(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2017-08-10 17:23:31 +08:00
|
|
|
ret = nvme_configure_apst(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2021-04-11 04:16:21 +08:00
|
|
|
|
2017-08-16 15:51:29 +08:00
|
|
|
ret = nvme_configure_timestamp(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2017-08-10 17:23:31 +08:00
|
|
|
|
|
|
|
ret = nvme_configure_directives(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
2018-11-28 00:40:57 +08:00
|
|
|
ret = nvme_configure_acre(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2021-01-14 08:00:22 +08:00
|
|
|
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
|
2020-09-17 23:50:25 +08:00
|
|
|
ret = nvme_hwmon_init(ctrl);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
}
|
2019-11-06 22:35:18 +08:00
|
|
|
|
2017-02-23 04:32:36 +08:00
|
|
|
ctrl->identified = true;
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
|
2017-08-10 17:23:31 +08:00
|
|
|
return 0;
|
2015-11-28 22:37:52 +08:00
|
|
|
}
|
2021-03-01 10:06:04 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
|
2015-11-28 22:37:52 +08:00
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
static int nvme_dev_open(struct inode *inode, struct file *file)
|
2015-11-26 17:54:19 +08:00
|
|
|
{
|
2017-10-18 22:59:25 +08:00
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2018-01-06 08:01:58 +08:00
|
|
|
switch (ctrl->state) {
|
|
|
|
case NVME_CTRL_LIVE:
|
|
|
|
break;
|
|
|
|
default:
|
2017-10-18 22:59:25 +08:00
|
|
|
return -EWOULDBLOCK;
|
2018-01-06 08:01:58 +08:00
|
|
|
}
|
|
|
|
|
2020-09-16 11:53:25 +08:00
|
|
|
nvme_get_ctrl(ctrl);
|
2020-10-07 07:36:47 +08:00
|
|
|
if (!try_module_get(ctrl->ops->module)) {
|
|
|
|
nvme_put_ctrl(ctrl);
|
2020-09-16 11:53:25 +08:00
|
|
|
return -EINVAL;
|
2020-10-07 07:36:47 +08:00
|
|
|
}
|
2020-09-16 11:53:25 +08:00
|
|
|
|
2017-10-18 22:59:25 +08:00
|
|
|
file->private_data = ctrl;
|
2015-11-28 22:40:19 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-16 11:53:25 +08:00
|
|
|
static int nvme_dev_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
|
|
|
|
|
|
|
|
module_put(ctrl->ops->module);
|
|
|
|
nvme_put_ctrl(ctrl);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
static const struct file_operations nvme_dev_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = nvme_dev_open,
|
2020-09-16 11:53:25 +08:00
|
|
|
.release = nvme_dev_release,
|
2015-11-28 22:40:19 +08:00
|
|
|
.unlocked_ioctl = nvme_dev_ioctl,
|
2018-09-12 03:59:08 +08:00
|
|
|
.compat_ioctl = compat_ptr_ioctl,
|
2015-11-28 22:40:19 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static ssize_t nvme_sysfs_reset(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
int ret;
|
|
|
|
|
2017-06-15 21:41:08 +08:00
|
|
|
ret = nvme_reset_ctrl_sync(ctrl);
|
2015-11-28 22:40:19 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return count;
|
2015-11-26 17:54:19 +08:00
|
|
|
}
|
2015-11-28 22:40:19 +08:00
|
|
|
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
|
2015-11-26 17:54:19 +08:00
|
|
|
|
2016-04-30 05:45:18 +08:00
|
|
|
static ssize_t nvme_sysfs_rescan(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
nvme_queue_scan(ctrl);
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
|
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
|
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
|
2020-12-01 20:56:09 +08:00
|
|
|
if (disk->fops == &nvme_bdev_ops)
|
2017-11-09 20:51:03 +08:00
|
|
|
return nvme_get_ns_from_dev(dev)->head;
|
|
|
|
else
|
|
|
|
return disk->private_data;
|
|
|
|
}
|
|
|
|
|
2016-02-19 00:57:48 +08:00
|
|
|
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2016-02-19 00:57:48 +08:00
|
|
|
{
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_head *head = dev_to_ns_head(dev);
|
|
|
|
struct nvme_ns_ids *ids = &head->ids;
|
|
|
|
struct nvme_subsystem *subsys = head->subsys;
|
2017-11-09 20:48:55 +08:00
|
|
|
int serial_len = sizeof(subsys->serial);
|
|
|
|
int model_len = sizeof(subsys->model);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!uuid_is_null(&ids->uuid))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
|
2017-07-12 21:38:56 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:50:16 +08:00
|
|
|
if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
|
|
|
|
subsys->serial[serial_len - 1] == '\0'))
|
2016-02-19 00:57:48 +08:00
|
|
|
serial_len--;
|
2017-11-09 20:48:55 +08:00
|
|
|
while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
|
|
|
|
subsys->model[model_len - 1] == '\0'))
|
2016-02-19 00:57:48 +08:00
|
|
|
model_len--;
|
|
|
|
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
|
2017-11-09 20:48:55 +08:00
|
|
|
serial_len, subsys->serial, model_len, subsys->model,
|
2017-11-09 20:51:03 +08:00
|
|
|
head->ns_id);
|
2016-02-19 00:57:48 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(wwid);
|
2016-02-19 00:57:48 +08:00
|
|
|
|
2017-06-07 17:45:35 +08:00
|
|
|
static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2017-06-07 17:45:35 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
|
2015-12-23 01:10:45 +08:00
|
|
|
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
|
2017-06-07 17:45:35 +08:00
|
|
|
|
|
|
|
/* For backward compatibility expose the NGUID to userspace if
|
|
|
|
* we have no UUID set
|
|
|
|
*/
|
2017-11-09 20:50:16 +08:00
|
|
|
if (uuid_is_null(&ids->uuid)) {
|
2017-06-07 17:45:35 +08:00
|
|
|
printk_ratelimited(KERN_WARNING
|
|
|
|
"No UUID available providing old NGUID\n");
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", ids->nguid);
|
2017-06-07 17:45:35 +08:00
|
|
|
}
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", &ids->uuid);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(uuid);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(eui);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
|
2017-11-09 20:51:03 +08:00
|
|
|
char *buf)
|
2015-12-23 01:10:45 +08:00
|
|
|
{
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
|
2015-12-23 01:10:45 +08:00
|
|
|
}
|
2017-12-20 02:15:08 +08:00
|
|
|
static DEVICE_ATTR_RO(nsid);
|
2015-12-23 01:10:45 +08:00
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static struct attribute *nvme_ns_id_attrs[] = {
|
2016-02-19 00:57:48 +08:00
|
|
|
&dev_attr_wwid.attr,
|
2015-12-23 01:10:45 +08:00
|
|
|
&dev_attr_uuid.attr,
|
2017-06-07 17:45:35 +08:00
|
|
|
&dev_attr_nguid.attr,
|
2015-12-23 01:10:45 +08:00
|
|
|
&dev_attr_eui.attr,
|
|
|
|
&dev_attr_nsid.attr,
|
2018-05-14 14:48:54 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
&dev_attr_ana_grpid.attr,
|
|
|
|
&dev_attr_ana_state.attr,
|
|
|
|
#endif
|
2015-12-23 01:10:45 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-11-09 20:51:03 +08:00
|
|
|
static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
|
2015-12-23 01:10:45 +08:00
|
|
|
struct attribute *a, int n)
|
|
|
|
{
|
|
|
|
struct device *dev = container_of(kobj, struct device, kobj);
|
2017-11-09 20:51:03 +08:00
|
|
|
struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
|
2015-12-23 01:10:45 +08:00
|
|
|
|
|
|
|
if (a == &dev_attr_uuid.attr) {
|
2017-09-29 03:33:23 +08:00
|
|
|
if (uuid_is_null(&ids->uuid) &&
|
2017-11-09 20:50:16 +08:00
|
|
|
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2017-06-07 17:45:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (a == &dev_attr_nguid.attr) {
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
|
2015-12-23 01:10:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (a == &dev_attr_eui.attr) {
|
2017-11-09 20:50:16 +08:00
|
|
|
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
|
2015-12-23 01:10:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2018-05-14 14:48:54 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
|
2020-12-01 20:56:09 +08:00
|
|
|
if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
|
2018-05-14 14:48:54 +08:00
|
|
|
return 0;
|
|
|
|
if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2015-12-23 01:10:45 +08:00
|
|
|
return a->mode;
|
|
|
|
}
|
|
|
|
|
2018-10-09 05:28:39 +08:00
|
|
|
static const struct attribute_group nvme_ns_id_attr_group = {
|
2017-11-09 20:51:03 +08:00
|
|
|
.attrs = nvme_ns_id_attrs,
|
|
|
|
.is_visible = nvme_ns_id_attrs_are_visible,
|
2015-12-23 01:10:45 +08:00
|
|
|
};
|
|
|
|
|
2018-09-28 14:17:20 +08:00
|
|
|
const struct attribute_group *nvme_ns_id_attr_groups[] = {
|
|
|
|
&nvme_ns_id_attr_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2016-02-27 05:24:19 +08:00
|
|
|
#define nvme_show_str_function(field) \
|
2016-01-13 06:09:31 +08:00
|
|
|
static ssize_t field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%.*s\n", \
|
2017-11-09 20:48:55 +08:00
|
|
|
(int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
|
2016-01-13 06:09:31 +08:00
|
|
|
} \
|
|
|
|
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
nvme_show_str_function(model);
|
|
|
|
nvme_show_str_function(serial);
|
|
|
|
nvme_show_str_function(firmware_rev);
|
|
|
|
|
2016-02-27 05:24:19 +08:00
|
|
|
#define nvme_show_int_function(field) \
|
|
|
|
static ssize_t field##_show(struct device *dev, \
|
|
|
|
struct device_attribute *attr, char *buf) \
|
|
|
|
{ \
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->field); \
|
2016-02-27 05:24:19 +08:00
|
|
|
} \
|
|
|
|
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
|
|
|
|
|
|
|
|
nvme_show_int_function(cntlid);
|
2018-11-16 16:22:29 +08:00
|
|
|
nvme_show_int_function(numa_node);
|
2019-09-25 05:22:08 +08:00
|
|
|
nvme_show_int_function(queue_count);
|
|
|
|
nvme_show_int_function(sqsize);
|
2021-04-16 19:46:21 +08:00
|
|
|
nvme_show_int_function(kato);
|
2016-01-13 06:09:31 +08:00
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_delete(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (device_remove_file_self(dev, attr))
|
2017-10-29 16:44:29 +08:00
|
|
|
nvme_delete_ctrl_sync(ctrl);
|
2016-06-13 22:45:24 +08:00
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
|
|
|
|
|
|
|
|
static ssize_t nvme_sysfs_show_transport(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->ops->name);
|
2016-06-13 22:45:24 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
|
|
|
|
|
2016-11-28 07:47:40 +08:00
|
|
|
static ssize_t nvme_sysfs_show_state(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
static const char *const state_name[] = {
|
|
|
|
[NVME_CTRL_NEW] = "new",
|
|
|
|
[NVME_CTRL_LIVE] = "live",
|
|
|
|
[NVME_CTRL_RESETTING] = "resetting",
|
2018-02-01 00:31:24 +08:00
|
|
|
[NVME_CTRL_CONNECTING] = "connecting",
|
2016-11-28 07:47:40 +08:00
|
|
|
[NVME_CTRL_DELETING] = "deleting",
|
2020-07-23 07:32:19 +08:00
|
|
|
[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
|
2016-11-28 07:47:40 +08:00
|
|
|
[NVME_CTRL_DEAD] = "dead",
|
|
|
|
};
|
|
|
|
|
|
|
|
if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
|
|
|
|
state_name[ctrl->state])
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
|
2016-11-28 07:47:40 +08:00
|
|
|
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "unknown state\n");
|
2016-11-28 07:47:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
|
2016-06-13 22:45:24 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
|
|
|
|
|
2020-02-08 09:13:53 +08:00
|
|
|
static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
|
2020-02-08 09:13:53 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
|
|
|
|
|
2020-02-08 09:13:54 +08:00
|
|
|
static ssize_t nvme_sysfs_show_hostid(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2021-02-02 15:06:17 +08:00
|
|
|
return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
|
2020-02-08 09:13:54 +08:00
|
|
|
}
|
|
|
|
static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static ssize_t nvme_sysfs_show_address(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
|
|
|
|
|
2020-07-05 15:57:55 +08:00
|
|
|
static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
|
|
|
|
if (ctrl->opts->max_reconnects == -1)
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n",
|
|
|
|
opts->max_reconnects * opts->reconnect_delay);
|
2020-07-05 15:57:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
int ctrl_loss_tmo, err;
|
|
|
|
|
|
|
|
err = kstrtoint(buf, 10, &ctrl_loss_tmo);
|
|
|
|
if (err)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2021-04-01 17:54:11 +08:00
|
|
|
if (ctrl_loss_tmo < 0)
|
2020-07-05 15:57:55 +08:00
|
|
|
opts->max_reconnects = -1;
|
|
|
|
else
|
|
|
|
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
|
|
|
|
opts->reconnect_delay);
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->opts->reconnect_delay == -1)
|
2021-04-01 17:54:10 +08:00
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
|
2020-07-05 15:57:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
unsigned int v;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = kstrtou32(buf, 10, &v);
|
2020-07-14 18:57:32 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2020-07-05 15:57:55 +08:00
|
|
|
|
|
|
|
ctrl->opts->reconnect_delay = v;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
|
|
|
|
|
2021-04-01 17:54:12 +08:00
|
|
|
static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->opts->fast_io_fail_tmo == -1)
|
|
|
|
return sysfs_emit(buf, "off\n");
|
|
|
|
return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
int fast_io_fail_tmo, err;
|
|
|
|
|
|
|
|
err = kstrtoint(buf, 10, &fast_io_fail_tmo);
|
|
|
|
if (err)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (fast_io_fail_tmo < 0)
|
|
|
|
opts->fast_io_fail_tmo = -1;
|
|
|
|
else
|
|
|
|
opts->fast_io_fail_tmo = fast_io_fail_tmo;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
|
|
|
|
nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
|
|
|
|
|
2022-02-09 03:33:46 +08:00
|
|
|
static ssize_t cntrltype_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
static const char * const type[] = {
|
|
|
|
[NVME_CTRL_IO] = "io\n",
|
|
|
|
[NVME_CTRL_DISC] = "discovery\n",
|
|
|
|
[NVME_CTRL_ADMIN] = "admin\n",
|
|
|
|
};
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
|
|
|
|
return sysfs_emit(buf, type[ctrl->cntrltype]);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR_RO(cntrltype);
|
|
|
|
|
|
|
|
static ssize_t dctype_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
static const char * const type[] = {
|
|
|
|
[NVME_DCTYPE_NOT_REPORTED] = "none\n",
|
|
|
|
[NVME_DCTYPE_DDC] = "ddc\n",
|
|
|
|
[NVME_DCTYPE_CDC] = "cdc\n",
|
|
|
|
};
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
|
|
|
if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
|
|
|
|
return sysfs_emit(buf, "reserved\n");
|
|
|
|
|
|
|
|
return sysfs_emit(buf, type[ctrl->dctype]);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR_RO(dctype);
|
|
|
|
|
2016-01-13 06:09:31 +08:00
|
|
|
static struct attribute *nvme_dev_attrs[] = {
|
|
|
|
&dev_attr_reset_controller.attr,
|
2016-04-30 05:45:18 +08:00
|
|
|
&dev_attr_rescan_controller.attr,
|
2016-01-13 06:09:31 +08:00
|
|
|
&dev_attr_model.attr,
|
|
|
|
&dev_attr_serial.attr,
|
|
|
|
&dev_attr_firmware_rev.attr,
|
2016-02-27 05:24:19 +08:00
|
|
|
&dev_attr_cntlid.attr,
|
2016-06-13 22:45:24 +08:00
|
|
|
&dev_attr_delete_controller.attr,
|
|
|
|
&dev_attr_transport.attr,
|
|
|
|
&dev_attr_subsysnqn.attr,
|
|
|
|
&dev_attr_address.attr,
|
2016-11-28 07:47:40 +08:00
|
|
|
&dev_attr_state.attr,
|
2018-11-16 16:22:29 +08:00
|
|
|
&dev_attr_numa_node.attr,
|
2019-09-25 05:22:08 +08:00
|
|
|
&dev_attr_queue_count.attr,
|
|
|
|
&dev_attr_sqsize.attr,
|
2020-02-08 09:13:53 +08:00
|
|
|
&dev_attr_hostnqn.attr,
|
2020-02-08 09:13:54 +08:00
|
|
|
&dev_attr_hostid.attr,
|
2020-07-05 15:57:55 +08:00
|
|
|
&dev_attr_ctrl_loss_tmo.attr,
|
|
|
|
&dev_attr_reconnect_delay.attr,
|
2021-04-01 17:54:12 +08:00
|
|
|
&dev_attr_fast_io_fail_tmo.attr,
|
2021-04-16 19:46:21 +08:00
|
|
|
&dev_attr_kato.attr,
|
2022-02-09 03:33:46 +08:00
|
|
|
&dev_attr_cntrltype.attr,
|
|
|
|
&dev_attr_dctype.attr,
|
2016-01-13 06:09:31 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-06-13 22:45:24 +08:00
|
|
|
static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
|
|
|
|
struct attribute *a, int n)
|
|
|
|
{
|
|
|
|
struct device *dev = container_of(kobj, struct device, kobj);
|
|
|
|
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
|
|
|
|
2017-06-26 18:39:03 +08:00
|
|
|
if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
|
|
|
|
return 0;
|
|
|
|
if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
|
|
|
|
return 0;
|
2020-02-08 09:13:53 +08:00
|
|
|
if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2020-02-08 09:13:54 +08:00
|
|
|
if (a == &dev_attr_hostid.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2020-08-25 06:47:25 +08:00
|
|
|
if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
|
|
|
|
return 0;
|
|
|
|
if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2021-04-14 16:46:45 +08:00
|
|
|
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
|
|
|
|
return 0;
|
2016-06-13 22:45:24 +08:00
|
|
|
|
|
|
|
return a->mode;
|
|
|
|
}
|
|
|
|
|
2021-01-09 07:41:47 +08:00
|
|
|
static const struct attribute_group nvme_dev_attrs_group = {
|
2016-06-13 22:45:24 +08:00
|
|
|
.attrs = nvme_dev_attrs,
|
|
|
|
.is_visible = nvme_dev_attrs_are_visible,
|
2016-01-13 06:09:31 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static const struct attribute_group *nvme_dev_attr_groups[] = {
|
|
|
|
&nvme_dev_attrs_group,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2020-03-25 21:19:36 +08:00
|
|
|
static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
|
2017-11-09 20:50:43 +08:00
|
|
|
unsigned nsid)
|
|
|
|
{
|
|
|
|
struct nvme_ns_head *h;
|
|
|
|
|
|
|
|
lockdep_assert_held(&subsys->lock);
|
|
|
|
|
|
|
|
list_for_each_entry(h, &subsys->nsheads, entry) {
|
2021-09-02 17:20:02 +08:00
|
|
|
if (h->ns_id != nsid)
|
|
|
|
continue;
|
|
|
|
if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
|
2017-11-09 20:50:43 +08:00
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-02-24 17:57:15 +08:00
|
|
|
static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
|
|
|
|
struct nvme_ns_ids *ids)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
2022-02-24 18:32:58 +08:00
|
|
|
bool has_uuid = !uuid_is_null(&ids->uuid);
|
|
|
|
bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
|
|
|
|
bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
|
2017-11-09 20:50:43 +08:00
|
|
|
struct nvme_ns_head *h;
|
|
|
|
|
|
|
|
lockdep_assert_held(&subsys->lock);
|
|
|
|
|
|
|
|
list_for_each_entry(h, &subsys->nsheads, entry) {
|
2022-02-24 18:32:58 +08:00
|
|
|
if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
|
|
|
|
return -EINVAL;
|
|
|
|
if (has_nguid &&
|
|
|
|
memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
if (has_eui64 &&
|
|
|
|
memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
|
2017-11-09 20:50:43 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-10-13 23:04:19 +08:00
|
|
|
static void nvme_cdev_rel(struct device *dev)
|
|
|
|
{
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
|
2021-10-13 23:04:19 +08:00
|
|
|
}
|
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
/*
 * Tear down a character device set up with nvme_cdev_add(): unregister the
 * cdev/device pair, then drop a reference on the device.
 */
void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
{
	cdev_device_del(cdev, cdev_device);
	put_device(cdev_device);
}
|
|
|
|
|
|
|
|
int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
|
|
|
|
const struct file_operations *fops, struct module *owner)
|
|
|
|
{
|
|
|
|
int minor, ret;
|
|
|
|
|
2022-02-14 17:07:27 +08:00
|
|
|
minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
|
2021-04-21 15:45:04 +08:00
|
|
|
if (minor < 0)
|
|
|
|
return minor;
|
|
|
|
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
|
|
|
|
cdev_device->class = nvme_ns_chr_class;
|
2021-10-13 23:04:19 +08:00
|
|
|
cdev_device->release = nvme_cdev_rel;
|
2021-04-21 15:45:04 +08:00
|
|
|
device_initialize(cdev_device);
|
|
|
|
cdev_init(cdev, fops);
|
|
|
|
cdev->owner = owner;
|
|
|
|
ret = cdev_device_add(cdev, cdev_device);
|
2021-10-13 23:04:19 +08:00
|
|
|
if (ret)
|
2021-05-21 15:32:39 +08:00
|
|
|
put_device(cdev_device);
|
2021-10-13 23:04:19 +08:00
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_ns_chr_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_ns_chr_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations nvme_ns_chr_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = nvme_ns_chr_open,
|
|
|
|
.release = nvme_ns_chr_release,
|
|
|
|
.unlocked_ioctl = nvme_ns_chr_ioctl,
|
|
|
|
.compat_ioctl = compat_ptr_ioctl,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int nvme_add_ns_cdev(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ns->cdev_device.parent = ns->ctrl->device;
|
|
|
|
ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
|
|
|
|
ns->ctrl->instance, ns->head->instance);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2021-10-13 23:04:19 +08:00
|
|
|
|
|
|
|
return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
|
|
|
|
ns->ctrl->ops->module);
|
2021-04-21 15:45:04 +08:00
|
|
|
}
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
2020-04-04 00:24:09 +08:00
|
|
|
unsigned nsid, struct nvme_ns_ids *ids)
|
2017-11-09 20:50:43 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns_head *head;
|
2018-09-11 15:51:29 +08:00
|
|
|
size_t size = sizeof(*head);
|
2017-11-09 20:50:43 +08:00
|
|
|
int ret = -ENOMEM;
|
|
|
|
|
2018-09-11 15:51:29 +08:00
|
|
|
#ifdef CONFIG_NVME_MULTIPATH
|
|
|
|
size += num_possible_nodes() * sizeof(struct nvme_ns *);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
head = kzalloc(size, GFP_KERNEL);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (!head)
|
|
|
|
goto out;
|
2022-02-14 17:07:27 +08:00
|
|
|
ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_head;
|
|
|
|
head->instance = ret;
|
|
|
|
INIT_LIST_HEAD(&head->list);
|
2018-04-12 23:16:12 +08:00
|
|
|
ret = init_srcu_struct(&head->srcu);
|
|
|
|
if (ret)
|
|
|
|
goto out_ida_remove;
|
2017-11-09 20:50:43 +08:00
|
|
|
head->subsys = ctrl->subsys;
|
|
|
|
head->ns_id = nsid;
|
2020-03-25 21:19:37 +08:00
|
|
|
head->ids = *ids;
|
2017-11-09 20:50:43 +08:00
|
|
|
kref_init(&head->ref);
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
if (head->ids.csi) {
|
|
|
|
ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
|
|
|
|
if (ret)
|
|
|
|
goto out_cleanup_srcu;
|
|
|
|
} else
|
|
|
|
head->effects = ctrl->effects;
|
|
|
|
|
2017-11-02 19:59:30 +08:00
|
|
|
ret = nvme_mpath_alloc_disk(ctrl, head);
|
|
|
|
if (ret)
|
|
|
|
goto out_cleanup_srcu;
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
list_add_tail(&head->entry, &ctrl->subsys->nsheads);
|
2018-05-04 16:01:57 +08:00
|
|
|
|
|
|
|
kref_get(&ctrl->subsys->ref);
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
return head;
|
|
|
|
out_cleanup_srcu:
|
|
|
|
cleanup_srcu_struct(&head->srcu);
|
2018-04-12 23:16:12 +08:00
|
|
|
out_ida_remove:
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&ctrl->subsys->ns_ida, head->instance);
|
2017-11-09 20:50:43 +08:00
|
|
|
out_free_head:
|
|
|
|
kfree(head);
|
|
|
|
out:
|
2019-08-03 09:16:12 +08:00
|
|
|
if (ret > 0)
|
|
|
|
ret = blk_status_to_errno(nvme_error_status(ret));
|
2017-11-09 20:50:43 +08:00
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2022-02-25 00:48:32 +08:00
|
|
|
static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
|
|
|
|
struct nvme_ns_ids *ids)
|
|
|
|
{
|
|
|
|
struct nvme_subsystem *s;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that this check is racy as we try to avoid holding the global
|
|
|
|
* lock over the whole ns_head creation. But it is only intended as
|
|
|
|
* a sanity check anyway.
|
|
|
|
*/
|
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
|
|
|
list_for_each_entry(s, &nvme_subsystems, entry) {
|
|
|
|
if (s == this)
|
|
|
|
continue;
|
|
|
|
mutex_lock(&s->lock);
|
|
|
|
ret = nvme_subsys_check_duplicate_ids(s, ids);
|
|
|
|
mutex_unlock(&s->lock);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-11-09 20:50:43 +08:00
|
|
|
/*
 * Attach @ns to the namespace head for @nsid, creating the head if the
 * subsystem does not have one yet.
 *
 * @ids are first checked for duplicates against all other subsystems and,
 * when a new head is needed, against this subsystem as well.  When a head
 * already exists, both the existing head and the new namespace must be
 * shared and their identifiers must match, otherwise -EINVAL is returned.
 *
 * Returns 0 on success with ns->head set and @ns linked on the head's
 * sibling list, or a non-zero error.
 */
static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
		struct nvme_ns_ids *ids, bool is_shared)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_ns_head *head = NULL;
	int ret;

	ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
	if (ret) {
		dev_err(ctrl->device,
			"globally duplicate IDs for nsid %d\n", nsid);
		return ret;
	}

	mutex_lock(&ctrl->subsys->lock);
	head = nvme_find_ns_head(ctrl->subsys, nsid);
	if (!head) {
		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
		if (ret) {
			dev_err(ctrl->device,
				"duplicate IDs in subsystem for nsid %d\n",
				nsid);
			goto out_unlock;
		}
		head = nvme_alloc_ns_head(ctrl, nsid, ids);
		if (IS_ERR(head)) {
			ret = PTR_ERR(head);
			goto out_unlock;
		}
		head->shared = is_shared;
	} else {
		/*
		 * The failure paths below drop the head via
		 * nvme_put_ns_head(); presumably nvme_find_ns_head() returned
		 * it with a reference held -- confirm against that helper.
		 */
		ret = -EINVAL;
		if (!is_shared || !head->shared) {
			dev_err(ctrl->device,
				"Duplicate unshared namespace %d\n", nsid);
			goto out_put_ns_head;
		}
		if (!nvme_ns_ids_equal(&head->ids, ids)) {
			dev_err(ctrl->device,
				"IDs don't match for shared namespace %d\n",
				nsid);
			goto out_put_ns_head;
		}

		if (!multipath && !list_empty(&head->list)) {
			dev_warn(ctrl->device,
				"Found shared namespace %d, but multipathing not supported.\n",
				nsid);
			/* the sentence-ending period belongs before the newline */
			dev_warn_once(ctrl->device,
				"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
		}
	}

	list_add_tail_rcu(&ns->siblings, &head->list);
	ns->head = head;
	mutex_unlock(&ctrl->subsys->lock);
	return 0;

out_put_ns_head:
	nvme_put_ns_head(head);
out_unlock:
	mutex_unlock(&ctrl->subsys->lock);
	return ret;
}
|
|
|
|
|
2020-07-25 01:25:16 +08:00
|
|
|
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
2016-07-14 01:45:02 +08:00
|
|
|
struct nvme_ns *ns, *ret = NULL;
|
2015-12-24 22:27:00 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2015-11-28 22:39:07 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list) {
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ns->head->ns_id == nsid) {
|
2021-04-27 14:47:46 +08:00
|
|
|
if (!nvme_get_ns(ns))
|
2017-10-18 19:20:01 +08:00
|
|
|
continue;
|
2016-07-14 01:45:02 +08:00
|
|
|
ret = ns;
|
|
|
|
break;
|
|
|
|
}
|
2017-11-09 20:50:43 +08:00
|
|
|
if (ns->head->ns_id > nsid)
|
2015-11-28 22:39:07 +08:00
|
|
|
break;
|
|
|
|
}
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2016-07-14 01:45:02 +08:00
|
|
|
return ret;
|
2015-11-28 22:39:07 +08:00
|
|
|
}
|
2020-07-25 01:25:16 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2021-09-14 14:38:20 +08:00
|
|
|
/*
|
|
|
|
* Add the namespace to the controller list while keeping the list ordered.
|
|
|
|
*/
|
|
|
|
static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
struct nvme_ns *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
|
|
|
|
if (tmp->head->ns_id < ns->head->ns_id) {
|
|
|
|
list_add(&ns->list, &tmp->list);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
list_add(&ns->list, &ns->ctrl->namespaces);
|
|
|
|
}
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
|
|
|
struct nvme_ns_ids *ids)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
struct gendisk *disk;
|
2016-09-16 20:25:04 +08:00
|
|
|
struct nvme_id_ns *id;
|
2021-04-07 18:46:46 +08:00
|
|
|
int node = ctrl->numa_node;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (nvme_identify_ns(ctrl, nsid, ids, &id))
|
2020-09-28 18:34:04 +08:00
|
|
|
return;
|
|
|
|
|
2015-11-28 22:39:07 +08:00
|
|
|
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
|
|
|
|
if (!ns)
|
2020-09-28 18:34:04 +08:00
|
|
|
goto out_free_id;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2021-08-16 21:19:02 +08:00
|
|
|
disk = blk_mq_alloc_disk(ctrl->tagset, ns);
|
|
|
|
if (IS_ERR(disk))
|
2017-11-09 20:50:43 +08:00
|
|
|
goto out_free_ns;
|
2021-08-16 21:19:02 +08:00
|
|
|
disk->fops = &nvme_bdev_ops;
|
|
|
|
disk->private_data = ns;
|
|
|
|
|
|
|
|
ns->disk = disk;
|
|
|
|
ns->queue = disk->queue;
|
2018-10-05 05:27:44 +08:00
|
|
|
|
2019-07-12 01:04:47 +08:00
|
|
|
if (ctrl->opts && ctrl->opts->data_digest)
|
2020-09-24 14:51:38 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
|
2019-07-04 15:59:18 +08:00
|
|
|
|
2018-03-08 09:10:10 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
|
2018-10-05 05:27:44 +08:00
|
|
|
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
|
|
|
|
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
|
|
|
|
|
2015-11-28 22:39:07 +08:00
|
|
|
ns->ctrl = ctrl;
|
|
|
|
kref_init(&ns->kref);
|
|
|
|
|
2020-12-01 20:56:07 +08:00
|
|
|
if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
|
2021-08-16 21:19:02 +08:00
|
|
|
goto out_cleanup_disk;
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2021-04-07 18:46:46 +08:00
|
|
|
/*
|
2022-03-15 19:58:06 +08:00
|
|
|
* If multipathing is enabled, the device name for all disks and not
|
|
|
|
* just those that represent shared namespaces needs to be based on the
|
|
|
|
* subsystem instance. Using the controller instance for private
|
|
|
|
* namespaces could lead to naming collisions between shared and private
|
|
|
|
* namespaces if they don't use a common numbering scheme.
|
|
|
|
*
|
|
|
|
* If multipathing is not enabled, disk names must use the controller
|
|
|
|
* instance as shared namespaces will show up as multiple block
|
|
|
|
* devices.
|
2021-04-07 18:46:46 +08:00
|
|
|
*/
|
2022-03-15 19:58:06 +08:00
|
|
|
if (ns->head->disk) {
|
|
|
|
sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
|
|
|
|
ctrl->instance, ns->head->instance);
|
|
|
|
disk->flags |= GENHD_FL_HIDDEN;
|
|
|
|
} else if (multipath) {
|
|
|
|
sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
|
|
|
|
ns->head->instance);
|
|
|
|
} else {
|
2021-04-07 18:46:46 +08:00
|
|
|
sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
|
|
|
|
ns->head->instance);
|
2022-03-15 19:58:06 +08:00
|
|
|
}
|
2016-11-29 05:38:53 +08:00
|
|
|
|
2020-09-28 18:14:20 +08:00
|
|
|
if (nvme_update_ns_info(ns, id))
|
2021-08-16 21:19:02 +08:00
|
|
|
goto out_unlink_ns;
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
2021-09-14 14:38:20 +08:00
|
|
|
nvme_ns_add_to_ctrl_list(ns);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2017-10-18 19:25:42 +08:00
|
|
|
nvme_get_ctrl(ctrl);
|
2016-09-16 20:25:04 +08:00
|
|
|
|
2021-08-31 05:25:33 +08:00
|
|
|
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
|
|
|
|
goto out_cleanup_ns_from_list;
|
|
|
|
|
2021-04-21 15:45:04 +08:00
|
|
|
if (!nvme_ns_head_multipath(ns->head))
|
|
|
|
nvme_add_ns_cdev(ns);
|
2017-11-02 19:59:30 +08:00
|
|
|
|
2018-05-14 14:48:54 +08:00
|
|
|
nvme_mpath_add_disk(ns, id);
|
2019-06-20 14:49:02 +08:00
|
|
|
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
|
2018-05-14 14:48:54 +08:00
|
|
|
kfree(id);
|
|
|
|
|
2019-11-28 01:17:43 +08:00
|
|
|
return;
|
2021-08-16 21:19:02 +08:00
|
|
|
|
2021-08-31 05:25:33 +08:00
|
|
|
out_cleanup_ns_from_list:
|
|
|
|
nvme_put_ctrl(ctrl);
|
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
|
|
|
list_del_init(&ns->list);
|
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2017-11-09 20:50:43 +08:00
|
|
|
out_unlink_ns:
|
|
|
|
mutex_lock(&ctrl->subsys->lock);
|
|
|
|
list_del_rcu(&ns->siblings);
|
2020-04-10 00:08:59 +08:00
|
|
|
if (list_empty(&ns->head->list))
|
|
|
|
list_del_init(&ns->head->entry);
|
2017-11-09 20:50:43 +08:00
|
|
|
mutex_unlock(&ctrl->subsys->lock);
|
2019-03-14 01:54:57 +08:00
|
|
|
nvme_put_ns_head(ns->head);
|
2021-08-16 21:19:02 +08:00
|
|
|
out_cleanup_disk:
|
|
|
|
blk_cleanup_disk(disk);
|
2015-11-28 22:39:07 +08:00
|
|
|
out_free_ns:
|
|
|
|
kfree(ns);
|
2020-09-28 18:34:04 +08:00
|
|
|
out_free_id:
|
|
|
|
kfree(id);
|
2015-11-28 22:39:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_ns_remove(struct nvme_ns *ns)
|
|
|
|
{
|
2021-07-16 19:30:35 +08:00
|
|
|
bool last_path = false;
|
|
|
|
|
2016-02-25 00:15:54 +08:00
|
|
|
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
|
|
|
|
return;
|
2015-12-24 22:27:00 +08:00
|
|
|
|
2021-08-24 22:57:42 +08:00
|
|
|
clear_bit(NVME_NS_READY, &ns->flags);
|
2020-09-28 19:59:06 +08:00
|
|
|
set_capacity(ns->disk, 0);
|
2019-06-20 14:49:02 +08:00
|
|
|
nvme_fault_inject_fini(&ns->fault_inject);
|
2019-06-20 14:48:10 +08:00
|
|
|
|
|
|
|
mutex_lock(&ns->ctrl->subsys->lock);
|
|
|
|
list_del_rcu(&ns->siblings);
|
2021-09-02 17:20:02 +08:00
|
|
|
if (list_empty(&ns->head->list)) {
|
|
|
|
list_del_init(&ns->head->entry);
|
|
|
|
last_path = true;
|
|
|
|
}
|
2019-06-20 14:48:10 +08:00
|
|
|
mutex_unlock(&ns->ctrl->subsys->lock);
|
2020-04-10 00:08:59 +08:00
|
|
|
|
2021-09-01 17:25:24 +08:00
|
|
|
/* guarantee not available in head->list */
|
|
|
|
synchronize_rcu();
|
|
|
|
|
|
|
|
/* wait for concurrent submissions */
|
|
|
|
if (nvme_mpath_clear_current_path(ns))
|
|
|
|
synchronize_srcu(&ns->head->srcu);
|
2019-06-20 14:48:10 +08:00
|
|
|
|
2021-08-09 14:40:23 +08:00
|
|
|
if (!nvme_ns_head_multipath(ns->head))
|
|
|
|
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
|
|
|
|
del_gendisk(ns->disk);
|
|
|
|
blk_cleanup_queue(ns->queue);
|
2016-07-14 01:45:02 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ns->ctrl->namespaces_rwsem);
|
2015-11-28 22:39:07 +08:00
|
|
|
list_del_init(&ns->list);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ns->ctrl->namespaces_rwsem);
|
2016-07-14 01:45:02 +08:00
|
|
|
|
2021-07-16 19:30:35 +08:00
|
|
|
if (last_path)
|
|
|
|
nvme_mpath_shutdown_disk(ns->head);
|
2015-11-28 22:39:07 +08:00
|
|
|
nvme_put_ns(ns);
|
|
|
|
}
|
|
|
|
|
2020-04-04 16:30:32 +08:00
|
|
|
/*
 * Remove the namespace with id @nsid from @ctrl, if one exists.  The
 * reference obtained by the lookup is dropped again after the removal.
 */
static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
{
	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);

	if (!ns)
		return;

	nvme_ns_remove(ns);
	nvme_put_ns(ns);
}
|
|
|
|
|
2020-09-28 19:59:06 +08:00
|
|
|
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
|
2020-09-28 19:55:22 +08:00
|
|
|
{
|
|
|
|
struct nvme_id_ns *id;
|
2021-02-26 15:17:25 +08:00
|
|
|
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
|
2020-09-28 19:55:22 +08:00
|
|
|
|
2020-09-28 19:59:06 +08:00
|
|
|
if (test_bit(NVME_NS_DEAD, &ns->flags))
|
|
|
|
goto out;
|
2020-09-28 19:55:22 +08:00
|
|
|
|
2020-10-02 02:54:31 +08:00
|
|
|
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
|
2020-09-28 19:55:22 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2021-02-26 15:17:25 +08:00
|
|
|
ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
|
2020-09-28 19:55:22 +08:00
|
|
|
if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
|
2020-10-02 02:54:31 +08:00
|
|
|
dev_err(ns->ctrl->device,
|
2020-09-28 19:55:22 +08:00
|
|
|
"identifiers changed for nsid %d\n", ns->head->ns_id);
|
2020-09-28 19:59:06 +08:00
|
|
|
goto out_free_id;
|
2020-09-28 19:55:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = nvme_update_ns_info(ns, id);
|
2020-09-28 19:59:06 +08:00
|
|
|
|
|
|
|
out_free_id:
|
2020-09-28 19:55:22 +08:00
|
|
|
kfree(id);
|
|
|
|
out:
|
|
|
|
/*
|
2020-09-28 19:59:06 +08:00
|
|
|
* Only remove the namespace if we got a fatal error back from the
|
2020-09-28 19:55:22 +08:00
|
|
|
* device, otherwise ignore the error and just move on.
|
2020-09-28 19:59:06 +08:00
|
|
|
*
|
|
|
|
* TODO: we should probably schedule a delayed retry here.
|
2020-09-28 19:55:22 +08:00
|
|
|
*/
|
2021-02-26 15:17:25 +08:00
|
|
|
if (ret > 0 && (ret & NVME_SC_DNR))
|
2020-09-28 19:59:06 +08:00
|
|
|
nvme_ns_remove(ns);
|
2020-09-28 19:55:22 +08:00
|
|
|
}
|
|
|
|
|
2020-09-28 15:42:17 +08:00
|
|
|
static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
2015-10-23 05:45:06 +08:00
|
|
|
{
|
2020-09-28 20:07:56 +08:00
|
|
|
struct nvme_ns_ids ids = { };
|
2015-10-23 05:45:06 +08:00
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
if (nvme_identify_ns_descs(ctrl, nsid, &ids))
|
|
|
|
return;
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2016-07-14 01:45:02 +08:00
|
|
|
ns = nvme_find_get_ns(ctrl, nsid);
|
2020-09-28 20:07:56 +08:00
|
|
|
if (ns) {
|
2020-09-28 19:59:06 +08:00
|
|
|
nvme_validate_ns(ns, &ids);
|
2020-09-28 20:07:56 +08:00
|
|
|
nvme_put_ns(ns);
|
2020-09-01 23:57:45 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-09-28 20:07:56 +08:00
|
|
|
switch (ids.csi) {
|
|
|
|
case NVME_CSI_NVM:
|
|
|
|
nvme_alloc_ns(ctrl, nsid, &ids);
|
|
|
|
break;
|
|
|
|
case NVME_CSI_ZNS:
|
|
|
|
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
|
|
|
|
nsid);
|
|
|
|
break;
|
|
|
|
}
|
2021-03-09 12:58:21 +08:00
|
|
|
if (!nvme_multi_css(ctrl)) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"command set not reported for nsid: %d\n",
|
2021-03-13 03:55:36 +08:00
|
|
|
nsid);
|
2021-03-09 12:58:21 +08:00
|
|
|
break;
|
|
|
|
}
|
2020-09-28 20:07:56 +08:00
|
|
|
nvme_alloc_ns(ctrl, nsid, &ids);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
|
|
|
|
ids.csi, nsid);
|
|
|
|
break;
|
|
|
|
}
|
2015-10-23 05:45:06 +08:00
|
|
|
}
|
|
|
|
|
2016-05-27 18:29:43 +08:00
|
|
|
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
|
|
|
|
unsigned nsid)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns, *next;
|
2018-02-12 20:54:44 +08:00
|
|
|
LIST_HEAD(rm_list);
|
2016-05-27 18:29:43 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
2016-05-27 18:29:43 +08:00
|
|
|
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
|
2018-06-30 03:03:28 +08:00
|
|
|
if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
|
2018-02-12 20:54:44 +08:00
|
|
|
list_move_tail(&ns->list, &rm_list);
|
2016-05-27 18:29:43 +08:00
|
|
|
}
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2018-02-12 20:54:44 +08:00
|
|
|
|
|
|
|
list_for_each_entry_safe(ns, next, &rm_list, list)
|
|
|
|
nvme_ns_remove(ns);
|
|
|
|
|
2016-05-27 18:29:43 +08:00
|
|
|
}
|
|
|
|
|
2020-04-04 16:31:35 +08:00
|
|
|
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
|
2015-10-23 05:45:06 +08:00
|
|
|
{
|
2020-04-04 16:34:21 +08:00
|
|
|
const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
|
2015-10-23 05:45:06 +08:00
|
|
|
__le32 *ns_list;
|
2020-04-04 16:31:35 +08:00
|
|
|
u32 prev = 0;
|
|
|
|
int ret = 0, i;
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2020-04-04 16:16:03 +08:00
|
|
|
if (nvme_ctrl_limited_cns(ctrl))
|
|
|
|
return -EOPNOTSUPP;
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2018-02-08 21:56:31 +08:00
|
|
|
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
|
2015-10-23 05:45:06 +08:00
|
|
|
if (!ns_list)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2020-04-04 16:31:35 +08:00
|
|
|
for (;;) {
|
2020-09-28 20:08:28 +08:00
|
|
|
struct nvme_command cmd = {
|
|
|
|
.identify.opcode = nvme_admin_identify,
|
|
|
|
.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
|
|
|
|
.identify.nsid = cpu_to_le32(prev),
|
|
|
|
};
|
|
|
|
|
|
|
|
ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
|
|
|
|
NVME_IDENTIFY_DATA_SIZE);
|
2020-11-30 20:47:47 +08:00
|
|
|
if (ret) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"Identify NS List failed (status=0x%x)\n", ret);
|
2016-05-27 18:29:43 +08:00
|
|
|
goto free;
|
2020-11-30 20:47:47 +08:00
|
|
|
}
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2020-04-04 16:34:21 +08:00
|
|
|
for (i = 0; i < nr_entries; i++) {
|
2020-04-04 16:31:35 +08:00
|
|
|
u32 nsid = le32_to_cpu(ns_list[i]);
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2020-04-04 16:31:35 +08:00
|
|
|
if (!nsid) /* end of the list? */
|
|
|
|
goto out;
|
2020-09-28 15:42:17 +08:00
|
|
|
nvme_validate_or_alloc_ns(ctrl, nsid);
|
2020-04-04 16:30:32 +08:00
|
|
|
while (++prev < nsid)
|
|
|
|
nvme_ns_remove_by_nsid(ctrl, prev);
|
2015-10-23 05:45:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
2016-05-27 18:29:43 +08:00
|
|
|
nvme_remove_invalid_namespaces(ctrl, prev);
|
|
|
|
free:
|
2015-10-23 05:45:06 +08:00
|
|
|
kfree(ns_list);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-04-04 16:31:35 +08:00
|
|
|
static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
2020-04-04 16:31:35 +08:00
|
|
|
struct nvme_id_ctrl *id;
|
|
|
|
u32 nn, i;
|
|
|
|
|
|
|
|
if (nvme_identify_ctrl(ctrl, &id))
|
|
|
|
return;
|
|
|
|
nn = le32_to_cpu(id->nn);
|
|
|
|
kfree(id);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2015-10-23 05:45:06 +08:00
|
|
|
for (i = 1; i <= nn; i++)
|
2020-09-28 15:42:17 +08:00
|
|
|
nvme_validate_or_alloc_ns(ctrl, i);
|
2015-10-23 05:45:06 +08:00
|
|
|
|
2016-05-27 18:29:43 +08:00
|
|
|
nvme_remove_invalid_namespaces(ctrl, nn);
|
2015-11-28 22:39:07 +08:00
|
|
|
}
|
|
|
|
|
2018-06-07 19:47:33 +08:00
|
|
|
static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
|
2018-05-26 00:17:41 +08:00
|
|
|
{
|
|
|
|
size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
|
|
|
|
__le32 *log;
|
2018-06-07 19:47:33 +08:00
|
|
|
int error;
|
2018-05-26 00:17:41 +08:00
|
|
|
|
|
|
|
log = kzalloc(log_size, GFP_KERNEL);
|
|
|
|
if (!log)
|
2018-06-07 19:47:33 +08:00
|
|
|
return;
|
2018-05-26 00:17:41 +08:00
|
|
|
|
2018-06-07 19:47:33 +08:00
|
|
|
/*
|
|
|
|
* We need to read the log to clear the AEN, but we don't want to rely
|
|
|
|
* on it for the changed namespace information as userspace could have
|
|
|
|
* raced with us in reading the log page, which could cause us to miss
|
|
|
|
* updates.
|
|
|
|
*/
|
2020-06-30 03:06:40 +08:00
|
|
|
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
|
|
|
|
NVME_CSI_NVM, log, log_size, 0);
|
2018-06-07 19:47:33 +08:00
|
|
|
if (error)
|
2018-05-26 00:17:41 +08:00
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"reading changed ns log failed: %d\n", error);
|
|
|
|
|
|
|
|
kfree(log);
|
|
|
|
}
|
|
|
|
|
2016-04-26 19:51:59 +08:00
|
|
|
static void nvme_scan_work(struct work_struct *work)
|
2015-11-28 22:39:07 +08:00
|
|
|
{
|
2016-04-26 19:51:59 +08:00
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(work, struct nvme_ctrl, scan_work);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2019-09-03 23:22:24 +08:00
|
|
|
/* No tagset on a live ctrl means IO queues could not created */
|
|
|
|
if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
|
2016-04-26 19:51:59 +08:00
|
|
|
return;
|
|
|
|
|
2018-06-07 16:27:41 +08:00
|
|
|
if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
|
2018-05-26 00:17:41 +08:00
|
|
|
dev_info(ctrl->device, "rescanning namespaces.\n");
|
2018-06-07 19:47:33 +08:00
|
|
|
nvme_clear_changed_ns_log(ctrl);
|
2018-05-26 00:17:41 +08:00
|
|
|
}
|
|
|
|
|
2019-01-29 00:46:07 +08:00
|
|
|
mutex_lock(&ctrl->scan_lock);
|
2020-04-04 16:31:35 +08:00
|
|
|
if (nvme_scan_ns_list(ctrl) != 0)
|
|
|
|
nvme_scan_ns_sequential(ctrl);
|
2019-01-29 00:46:07 +08:00
|
|
|
mutex_unlock(&ctrl->scan_lock);
|
2016-04-26 19:51:59 +08:00
|
|
|
}
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2016-07-14 01:45:02 +08:00
|
|
|
/*
|
|
|
|
* This function iterates the namespace list unlocked to allow recovery from
|
|
|
|
* controller failure. It is up to the caller to ensure the namespace list is
|
|
|
|
* not modified by scan work while this function is executing.
|
|
|
|
*/
|
2015-11-28 22:39:07 +08:00
|
|
|
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns, *next;
|
2018-02-12 20:54:44 +08:00
|
|
|
LIST_HEAD(ns_list);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2019-07-26 02:56:57 +08:00
|
|
|
/*
|
|
|
|
* make sure to requeue I/O to all namespaces as these
|
|
|
|
* might result from the scan itself and must complete
|
|
|
|
* for the scan_work to make progress
|
|
|
|
*/
|
|
|
|
nvme_mpath_clear_ctrl_paths(ctrl);
|
|
|
|
|
2018-11-22 07:17:37 +08:00
|
|
|
/* prevent racing with ns scanning */
|
|
|
|
flush_work(&ctrl->scan_work);
|
|
|
|
|
2016-05-12 22:37:14 +08:00
|
|
|
/*
|
|
|
|
* The dead states indicates the controller was not gracefully
|
|
|
|
* disconnected. In that case, we won't be able to flush any data while
|
|
|
|
* removing the namespaces' disks; fail all the queues now to avoid
|
|
|
|
* potentially having to clean up the failed sync later.
|
|
|
|
*/
|
|
|
|
if (ctrl->state == NVME_CTRL_DEAD)
|
|
|
|
nvme_kill_queues(ctrl);
|
|
|
|
|
2020-07-23 07:32:19 +08:00
|
|
|
/* this is a no-op when called from the controller reset handler */
|
|
|
|
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_write(&ctrl->namespaces_rwsem);
|
2018-02-12 20:54:44 +08:00
|
|
|
list_splice_init(&ctrl->namespaces, &ns_list);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_write(&ctrl->namespaces_rwsem);
|
2018-02-12 20:54:44 +08:00
|
|
|
|
|
|
|
list_for_each_entry_safe(ns, next, &ns_list, list)
|
2015-11-28 22:39:07 +08:00
|
|
|
nvme_ns_remove(ns);
|
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
|
2015-11-28 22:39:07 +08:00
|
|
|
|
2019-09-05 05:29:48 +08:00
|
|
|
static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(dev, struct nvme_ctrl, ctrl_device);
|
|
|
|
struct nvmf_ctrl_options *opts = ctrl->opts;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (opts) {
|
|
|
|
ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = add_uevent_var(env, "NVME_TRSVCID=%s",
|
|
|
|
opts->trsvcid ?: "none");
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
|
|
|
|
opts->host_traddr ?: "none");
|
nvme-tcp: allow selecting the network interface for connections
In our application, we need a way to force TCP connections to go out a
specific IP interface instead of letting Linux select the interface
based on the routing tables.
Add the 'host-iface' option to allow specifying the interface to use.
When the option host-iface is specified, the driver uses the specified
interface to set the option SO_BINDTODEVICE on the TCP socket before
connecting.
This new option is needed in addtion to the existing host-traddr for
the following reasons:
Specifying an IP interface by its associated IP address is less
intuitive than specifying the actual interface name and, in some cases,
simply doesn't work. That's because the association between interfaces
and IP addresses is not predictable. IP addresses can be changed or can
change by themselves over time (e.g. DHCP). Interface names are
predictable [1] and will persist over time. Consider the following
configuration.
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state ...
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 100.0.0.100/24 scope global lo
valid_lft forever preferred_lft forever
2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff
inet 100.0.0.100/24 scope global enp0s3
valid_lft forever preferred_lft forever
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff
inet 100.0.0.100/24 scope global enp0s8
valid_lft forever preferred_lft forever
The above is a VM that I configured with the same IP address
(100.0.0.100) on all interfaces. Doing a reverse lookup to identify the
unique interface associated with 100.0.0.100 does not work here. And
this is why the option host_iface is required. I understand that the
above config does not represent a standard host system, but I'm using
this to prove a point: "We can never know how users will configure
their systems". By te way, The above configuration is perfectly fine
by Linux.
The current TCP implementation for host_traddr performs a
bind()-before-connect(). This is a common construct to set the source
IP address on a TCP socket before connecting. This has no effect on how
Linux selects the interface for the connection. That's because Linux
uses the Weak End System model as described in RFC1122 [2]. On the other
hand, setting the Source IP Address has benefits and should be supported
by linux-nvme. In fact, setting the Source IP Address is a mandatory
FedGov requirement (e.g. connection to a RADIUS/TACACS+ server).
Consider the following configuration.
$ ip addr list dev enp0s8
3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc ...
link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff
inet 192.168.56.101/24 brd 192.168.56.255 scope global enp0s8
valid_lft 426sec preferred_lft 426sec
inet 192.168.56.102/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
inet 192.168.56.103/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
inet 192.168.56.104/24 scope global secondary enp0s8
valid_lft forever preferred_lft forever
Here we can see that several addresses are associated with interface
enp0s8. By default, Linux always selects the default IP address,
192.168.56.101, as the source address when connecting over interface
enp0s8. Some users, however, want the ability to specify a different
source address (e.g., 192.168.56.102, 192.168.56.103, ...). The option
host_traddr can be used as-is to perform this function.
In conclusion, I believe that we need 2 options for TCP connections.
One that can be used to specify an interface (host-iface). And one that
can be used to set the source address (host-traddr). Users should be
allowed to use one or the other, or both, or none. Of course, the
documentation for host_traddr will need some clarification. It should
state that when used for TCP connection, this option only sets the
source address. And the documentation for host_iface should say that
this option is only available for TCP connections.
References:
[1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/
[2] https://tools.ietf.org/html/rfc1122
Tested both IPv4 and IPv6 connections.
Signed-off-by: Martin Belanger <martin.belanger@dell.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-05-21 03:09:34 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
|
|
|
|
opts->host_iface ?: "none");
|
2019-09-05 05:29:48 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-02-09 03:33:45 +08:00
|
|
|
static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
|
|
|
|
{
|
|
|
|
char *envp[2] = { envdata, NULL };
|
|
|
|
|
|
|
|
kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
|
|
|
|
}
|
|
|
|
|
2017-11-08 06:13:14 +08:00
|
|
|
static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
char *envp[2] = { NULL, NULL };
|
|
|
|
u32 aen_result = ctrl->aen_result;
|
|
|
|
|
|
|
|
ctrl->aen_result = 0;
|
|
|
|
if (!aen_result)
|
|
|
|
return;
|
|
|
|
|
|
|
|
envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
|
|
|
|
if (!envp[0])
|
|
|
|
return;
|
|
|
|
kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
|
|
|
|
kfree(envp[0]);
|
|
|
|
}
|
|
|
|
|
2016-04-26 19:52:00 +08:00
|
|
|
static void nvme_async_event_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(work, struct nvme_ctrl, async_event_work);
|
|
|
|
|
2017-11-08 06:13:14 +08:00
|
|
|
nvme_aen_uevent(ctrl);
|
2017-11-08 06:13:12 +08:00
|
|
|
ctrl->ops->submit_async_event(ctrl);
|
2016-04-26 19:52:00 +08:00
|
|
|
}
|
|
|
|
|
2017-07-12 18:40:40 +08:00
|
|
|
static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
|
|
|
|
u32 csts;
|
|
|
|
|
|
|
|
if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (csts == ~0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_fw_slot_info_log *log;
|
|
|
|
|
|
|
|
log = kmalloc(sizeof(*log), GFP_KERNEL);
|
|
|
|
if (!log)
|
|
|
|
return;
|
|
|
|
|
2020-06-30 03:06:40 +08:00
|
|
|
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
|
|
|
|
log, sizeof(*log), 0))
|
2018-06-06 20:39:00 +08:00
|
|
|
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
|
2017-07-12 18:40:40 +08:00
|
|
|
kfree(log);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_fw_act_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_ctrl *ctrl = container_of(work,
|
|
|
|
struct nvme_ctrl, fw_act_work);
|
|
|
|
unsigned long fw_act_timeout;
|
|
|
|
|
|
|
|
if (ctrl->mtfa)
|
|
|
|
fw_act_timeout = jiffies +
|
|
|
|
msecs_to_jiffies(ctrl->mtfa * 100);
|
|
|
|
else
|
|
|
|
fw_act_timeout = jiffies +
|
|
|
|
msecs_to_jiffies(admin_timeout * 1000);
|
|
|
|
|
|
|
|
nvme_stop_queues(ctrl);
|
|
|
|
while (nvme_ctrl_pp_status(ctrl)) {
|
|
|
|
if (time_after(jiffies, fw_act_timeout)) {
|
|
|
|
dev_warn(ctrl->device,
|
|
|
|
"Fw activation timeout, reset controller\n");
|
2019-09-07 01:23:08 +08:00
|
|
|
nvme_try_sched_reset(ctrl);
|
|
|
|
return;
|
2017-07-12 18:40:40 +08:00
|
|
|
}
|
|
|
|
msleep(100);
|
|
|
|
}
|
|
|
|
|
2019-09-07 01:23:08 +08:00
|
|
|
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
|
2017-07-12 18:40:40 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
nvme_start_queues(ctrl);
|
2017-11-02 17:07:44 +08:00
|
|
|
/* read FW slot information to clear the AER */
|
2017-07-12 18:40:40 +08:00
|
|
|
nvme_get_fw_slot_info(ctrl);
|
|
|
|
}
|
|
|
|
|
2018-05-22 17:09:54 +08:00
|
|
|
/*
 * Dispatch a "notice" type asynchronous event.
 *
 * The notice sub-type is taken from bits 15:8 of @result.  Unknown
 * sub-types are only logged.
 */
static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
	u32 notice = (result >> 8) & 0xff;

	trace_nvme_async_event(ctrl, notice);

	switch (notice) {
	case NVME_AER_NOTICE_NS_CHANGED:
		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
		nvme_queue_scan(ctrl);
		break;
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		/*
		 * The RESETTING state is (ab)used here so that subsequent
		 * recovery actions do not interfere with the controller's
		 * firmware activation.
		 */
		if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
			break;
		queue_work(nvme_wq, &ctrl->fw_act_work);
		break;
#ifdef CONFIG_NVME_MULTIPATH
	case NVME_AER_NOTICE_ANA:
		/* Nothing to refresh without an ANA log buffer. */
		if (ctrl->ana_log_buf)
			queue_work(nvme_wq, &ctrl->ana_work);
		break;
#endif
	case NVME_AER_NOTICE_DISC_CHANGED:
		/* Latched for the uevent sent from the async event work. */
		ctrl->aen_result = result;
		break;
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
}
|
|
|
|
|
2016-11-10 23:32:34 +08:00
|
|
|
/*
 * Handle completion of an asynchronous event request.
 *
 * @status: completion status field as received (little endian)
 * @res:    completion result; its low three bits select the event type
 *
 * Unsuccessful completions are ignored.  Notices are dispatched to
 * nvme_handle_aen_notice(); error, SMART, command-set-specific and
 * vendor-specific events are traced and latched in ctrl->aen_result.
 * For every successful completion the async event work is queued.
 */
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		volatile union nvme_result *res)
{
	u32 event = le32_to_cpu(res->u32);
	u32 type = event & 0x07;

	if ((le16_to_cpu(status) >> 1) != NVME_SC_SUCCESS)
		return;

	switch (type) {
	case NVME_AER_NOTICE:
		nvme_handle_aen_notice(ctrl, event);
		break;
	case NVME_AER_ERROR:
	case NVME_AER_SMART:
	case NVME_AER_CSS:
	case NVME_AER_VS:
		trace_nvme_async_event(ctrl, type);
		ctrl->aen_result = event;
		break;
	default:
		break;
	}

	queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2017-07-02 15:56:43 +08:00
|
|
|
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
|
2016-02-11 02:03:32 +08:00
|
|
|
{
|
2018-05-14 14:48:54 +08:00
|
|
|
nvme_mpath_stop(ctrl);
|
2017-07-02 15:56:43 +08:00
|
|
|
nvme_stop_keep_alive(ctrl);
|
2020-11-25 02:34:59 +08:00
|
|
|
nvme_stop_failfast_work(ctrl);
|
2016-04-26 19:52:00 +08:00
|
|
|
flush_work(&ctrl->async_event_work);
|
2017-07-12 18:40:40 +08:00
|
|
|
cancel_work_sync(&ctrl->fw_act_work);
|
2017-07-02 15:56:43 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
|
|
|
|
|
|
|
|
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2020-07-13 14:25:21 +08:00
|
|
|
nvme_start_keep_alive(ctrl);
|
2017-07-02 15:56:43 +08:00
|
|
|
|
2019-08-23 02:25:46 +08:00
|
|
|
nvme_enable_aen(ctrl);
|
|
|
|
|
2017-07-02 15:56:43 +08:00
|
|
|
if (ctrl->queue_count > 1) {
|
|
|
|
nvme_queue_scan(ctrl);
|
|
|
|
nvme_start_queues(ctrl);
|
|
|
|
}
|
2022-02-09 03:33:45 +08:00
|
|
|
|
|
|
|
nvme_change_uevent(ctrl, "NVME_EVENT=connected");
|
2017-07-02 15:56:43 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
|
2016-04-26 19:51:59 +08:00
|
|
|
|
2017-07-02 15:56:43 +08:00
|
|
|
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2021-01-19 14:43:18 +08:00
|
|
|
nvme_hwmon_exit(ctrl);
|
2019-06-09 22:17:01 +08:00
|
|
|
nvme_fault_inject_fini(&ctrl->fault_inject);
|
2019-05-17 10:30:07 +08:00
|
|
|
dev_pm_qos_hide_latency_tolerance(ctrl->device);
|
2017-10-18 22:59:25 +08:00
|
|
|
cdev_device_del(&ctrl->cdev, ctrl->device);
|
2020-03-24 23:29:42 +08:00
|
|
|
nvme_put_ctrl(ctrl);
|
2015-11-28 22:41:02 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
|
2015-11-28 22:41:02 +08:00
|
|
|
|
2020-11-14 02:45:45 +08:00
|
|
|
static void nvme_free_cels(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_effects_log *cel;
|
|
|
|
unsigned long i;
|
|
|
|
|
2021-01-27 03:47:52 +08:00
|
|
|
xa_for_each(&ctrl->cels, i, cel) {
|
2020-11-14 02:45:45 +08:00
|
|
|
xa_erase(&ctrl->cels, i);
|
|
|
|
kfree(cel);
|
|
|
|
}
|
|
|
|
|
|
|
|
xa_destroy(&ctrl->cels);
|
|
|
|
}
|
|
|
|
|
2017-10-18 19:25:42 +08:00
|
|
|
static void nvme_free_ctrl(struct device *dev)
|
2015-11-28 22:41:02 +08:00
|
|
|
{
|
2017-10-18 19:25:42 +08:00
|
|
|
struct nvme_ctrl *ctrl =
|
|
|
|
container_of(dev, struct nvme_ctrl, ctrl_device);
|
2017-11-09 20:48:55 +08:00
|
|
|
struct nvme_subsystem *subsys = ctrl->subsys;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2020-08-27 01:53:04 +08:00
|
|
|
if (!subsys || ctrl->instance != subsys->instance)
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_instance_ida, ctrl->instance);
|
2019-09-06 00:33:54 +08:00
|
|
|
|
2020-11-14 02:45:45 +08:00
|
|
|
nvme_free_cels(ctrl);
|
2018-05-14 14:48:54 +08:00
|
|
|
nvme_mpath_uninit(ctrl);
|
2018-12-14 04:34:07 +08:00
|
|
|
__free_page(ctrl->discard_page);
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
if (subsys) {
|
2019-05-08 15:48:27 +08:00
|
|
|
mutex_lock(&nvme_subsystems_lock);
|
2017-11-09 20:48:55 +08:00
|
|
|
list_del(&ctrl->subsys_entry);
|
|
|
|
sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
|
2019-05-08 15:48:27 +08:00
|
|
|
mutex_unlock(&nvme_subsystems_lock);
|
2017-11-09 20:48:55 +08:00
|
|
|
}
|
2015-11-28 22:40:19 +08:00
|
|
|
|
|
|
|
ctrl->ops->free_ctrl(ctrl);
|
|
|
|
|
2017-11-09 20:48:55 +08:00
|
|
|
if (subsys)
|
|
|
|
nvme_put_subsystem(subsys);
|
2015-11-28 22:40:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize a NVMe controller structures. This needs to be called during
|
|
|
|
* earliest initialization so that we have the initialized structured around
|
|
|
|
* during probing.
|
|
|
|
*/
|
|
|
|
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
|
|
|
|
const struct nvme_ctrl_ops *ops, unsigned long quirks)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2016-04-26 19:51:57 +08:00
|
|
|
ctrl->state = NVME_CTRL_NEW;
|
2020-11-25 02:34:59 +08:00
|
|
|
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
|
2016-04-26 19:51:57 +08:00
|
|
|
spin_lock_init(&ctrl->lock);
|
2019-01-29 00:46:07 +08:00
|
|
|
mutex_init(&ctrl->scan_lock);
|
2015-11-28 22:40:19 +08:00
|
|
|
INIT_LIST_HEAD(&ctrl->namespaces);
|
2020-09-23 05:05:29 +08:00
|
|
|
xa_init(&ctrl->cels);
|
2018-02-12 20:54:46 +08:00
|
|
|
init_rwsem(&ctrl->namespaces_rwsem);
|
2015-11-28 22:40:19 +08:00
|
|
|
ctrl->dev = dev;
|
|
|
|
ctrl->ops = ops;
|
|
|
|
ctrl->quirks = quirks;
|
2020-06-16 17:34:21 +08:00
|
|
|
ctrl->numa_node = NUMA_NO_NODE;
|
2016-04-26 19:51:59 +08:00
|
|
|
INIT_WORK(&ctrl->scan_work, nvme_scan_work);
|
2016-04-26 19:52:00 +08:00
|
|
|
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
|
2017-07-12 18:40:40 +08:00
|
|
|
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
|
2017-10-29 16:44:29 +08:00
|
|
|
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
|
2019-09-05 00:06:11 +08:00
|
|
|
init_waitqueue_head(&ctrl->state_wq);
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2018-06-13 07:28:24 +08:00
|
|
|
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
|
2020-11-25 02:34:59 +08:00
|
|
|
INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
|
2018-06-13 07:28:24 +08:00
|
|
|
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
|
|
|
|
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
|
|
|
|
|
2018-12-13 00:18:11 +08:00
|
|
|
BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
|
|
|
|
PAGE_SIZE);
|
|
|
|
ctrl->discard_page = alloc_page(GFP_KERNEL);
|
|
|
|
if (!ctrl->discard_page) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2022-02-14 17:07:27 +08:00
|
|
|
ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
|
2017-10-18 19:10:01 +08:00
|
|
|
if (ret < 0)
|
2015-11-28 22:40:19 +08:00
|
|
|
goto out;
|
2017-10-18 19:10:01 +08:00
|
|
|
ctrl->instance = ret;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2017-10-18 19:25:42 +08:00
|
|
|
device_initialize(&ctrl->ctrl_device);
|
|
|
|
ctrl->device = &ctrl->ctrl_device;
|
2020-12-01 20:56:08 +08:00
|
|
|
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
|
|
|
|
ctrl->instance);
|
2017-10-18 19:25:42 +08:00
|
|
|
ctrl->device->class = nvme_class;
|
|
|
|
ctrl->device->parent = ctrl->dev;
|
|
|
|
ctrl->device->groups = nvme_dev_attr_groups;
|
|
|
|
ctrl->device->release = nvme_free_ctrl;
|
|
|
|
dev_set_drvdata(ctrl->device, ctrl);
|
|
|
|
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
|
|
|
|
if (ret)
|
2015-11-28 22:40:19 +08:00
|
|
|
goto out_release_instance;
|
|
|
|
|
2020-03-24 23:29:41 +08:00
|
|
|
nvme_get_ctrl(ctrl);
|
2017-10-18 22:59:25 +08:00
|
|
|
cdev_init(&ctrl->cdev, &nvme_dev_fops);
|
|
|
|
ctrl->cdev.owner = ops->module;
|
|
|
|
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
|
2017-10-18 19:25:42 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_free_name;
|
2015-11-28 22:40:19 +08:00
|
|
|
|
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.
The hardware configuration is a table. For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.
This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely. On hardware without APST support, the sysfs file will
not be exposed.
The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.
In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful. There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset. This can be configured from
userspace, but it doesn't seem useful to support in the driver.
On my laptop, enabling APST seems to save nearly 1W.
The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.
This feature is quirked off on a known-buggy Samsung device.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
|
|
|
/*
|
|
|
|
* Initialize latency tolerance controls. The sysfs files won't
|
|
|
|
* be visible to userspace unless the device actually supports APST.
|
|
|
|
*/
|
|
|
|
ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
|
|
|
|
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
|
|
|
|
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
|
|
|
|
|
2019-06-09 22:17:01 +08:00
|
|
|
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
|
2021-04-29 20:18:53 +08:00
|
|
|
nvme_mpath_init_ctrl(ctrl);
|
2019-06-09 22:17:01 +08:00
|
|
|
|
2015-11-28 22:40:19 +08:00
|
|
|
return 0;
|
2017-10-18 19:25:42 +08:00
|
|
|
out_free_name:
|
2020-03-24 23:29:41 +08:00
|
|
|
nvme_put_ctrl(ctrl);
|
2018-11-27 07:39:47 +08:00
|
|
|
kfree_const(ctrl->device->kobj.name);
|
2015-11-28 22:40:19 +08:00
|
|
|
out_release_instance:
|
2022-02-14 17:07:27 +08:00
|
|
|
ida_free(&nvme_instance_ida, ctrl->instance);
|
2015-11-28 22:40:19 +08:00
|
|
|
out:
|
2018-12-13 00:18:11 +08:00
|
|
|
if (ctrl->discard_page)
|
|
|
|
__free_page(ctrl->discard_page);
|
2015-11-28 22:40:19 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
|
2015-11-28 22:40:19 +08:00
|
|
|
|
2021-10-14 16:17:07 +08:00
|
|
|
static void nvme_start_ns_queue(struct nvme_ns *ns)
|
|
|
|
{
|
2021-10-14 16:17:08 +08:00
|
|
|
if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
|
|
|
|
blk_mq_unquiesce_queue(ns->queue);
|
2021-10-14 16:17:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_stop_ns_queue(struct nvme_ns *ns)
|
|
|
|
{
|
2021-10-14 16:17:08 +08:00
|
|
|
if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
|
|
|
|
blk_mq_quiesce_queue(ns->queue);
|
2021-11-09 15:11:44 +08:00
|
|
|
else
|
|
|
|
blk_mq_wait_quiesce_done(ns->queue);
|
2021-10-14 16:17:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare a queue for teardown.
|
|
|
|
*
|
|
|
|
* This must forcibly unquiesce queues to avoid blocking dispatch, and only set
|
|
|
|
* the capacity to 0 after that to avoid blocking dispatchers that may be
|
|
|
|
* holding bd_butex. This will end buffered writers dirtying pages that can't
|
|
|
|
* be synced.
|
|
|
|
*/
|
|
|
|
static void nvme_set_queue_dying(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
|
|
|
|
return;
|
|
|
|
|
|
|
|
blk_set_queue_dying(ns->queue);
|
|
|
|
nvme_start_ns_queue(ns);
|
|
|
|
|
|
|
|
set_capacity_and_notify(ns->disk, 0);
|
|
|
|
}
|
|
|
|
|
2016-02-25 00:15:56 +08:00
|
|
|
/**
|
|
|
|
* nvme_kill_queues(): Ends all namespace queues
|
|
|
|
* @ctrl: the dead controller that needs to end
|
|
|
|
*
|
|
|
|
* Call this function when the driver determines it is unable to get the
|
|
|
|
* controller in a state capable of servicing IO.
|
|
|
|
*/
|
|
|
|
void nvme_kill_queues(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-06-02 16:32:08 +08:00
|
|
|
|
2017-06-19 10:21:08 +08:00
|
|
|
/* Forcibly unquiesce queues to avoid blocking dispatch */
|
2018-11-23 23:58:10 +08:00
|
|
|
if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
|
2021-10-14 16:17:06 +08:00
|
|
|
nvme_start_admin_queue(ctrl);
|
2017-06-19 10:21:08 +08:00
|
|
|
|
2018-06-30 03:03:28 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
|
|
|
nvme_set_queue_dying(ns);
|
2017-05-22 23:05:03 +08:00
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2016-02-25 00:15:56 +08:00
|
|
|
}
|
Merge branch 'for-4.6/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"This is the block driver pull request for this merge window. It sits
on top of for-4.6/core, that was just sent out.
This contains:
- A set of fixes for lightnvm. One from Alan, fixing an overflow,
and the rest from the usual suspects, Javier and Matias.
- A set of fixes for nbd from Markus and Dan, and a fixup from Arnd
for correct usage of the signed 64-bit divider.
- A set of bug fixes for the Micron mtip32xx, from Asai.
- A fix for the brd discard handling from Bart.
- Update the maintainers entry for cciss, since that hardware has
transferred ownership.
- Three bug fixes for bcache from Eric Wheeler.
- Set of fixes for xen-blk{back,front} from Jan and Konrad.
- Removal of the cpqarray driver. It has been disabled in Kconfig
since 2013, and we were initially scheduled to remove it in 3.15.
- Various updates and fixes for NVMe, with the most important being:
- Removal of the per-device NVMe thread, replacing that with a
watchdog timer instead. From Christoph.
- Exposing the namespace WWID through sysfs, from Keith.
- Set of cleanups from Ming Lin.
- Logging the controller device name instead of the underlying
PCI device name, from Sagi.
- And a bunch of fixes and optimizations from the usual suspects
in this area"
* 'for-4.6/drivers' of git://git.kernel.dk/linux-block: (49 commits)
NVMe: Expose ns wwid through single sysfs entry
drivers:block: cpqarray clean up
brd: Fix discard request processing
cpqarray: remove it from the kernel
cciss: update MAINTAINERS
NVMe: Remove unused sq_head read in completion path
bcache: fix cache_set_flush() NULL pointer dereference on OOM
bcache: cleaned up error handling around register_cache()
bcache: fix race of writeback thread starting before complete initialization
NVMe: Create discard zero quirk white list
nbd: use correct div_s64 helper
mtip32xx: remove unneeded variable in mtip_cmd_timeout()
lightnvm: generalize rrpc ppa calculations
lightnvm: remove struct nvm_dev->total_blocks
lightnvm: rename ->nr_pages to ->nr_sects
lightnvm: update closed list outside of intr context
xen/blback: Fit the important information of the thread in 17 characters
lightnvm: fold get bb tbl when using dual/quad plane mode
lightnvm: fix up nonsensical configure overrun checking
xen-blkback: advertise indirect segment support earlier
...
2016-03-19 08:13:31 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_kill_queues);
|
2016-02-25 00:15:56 +08:00
|
|
|
|
2017-03-02 03:22:12 +08:00
|
|
|
void nvme_unfreeze(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
|
|
|
blk_mq_unfreeze_queue(ns->queue);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_unfreeze);
|
|
|
|
|
2020-07-31 04:24:45 +08:00
|
|
|
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
|
2017-03-02 03:22:12 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list) {
|
|
|
|
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
|
|
|
|
if (timeout <= 0)
|
|
|
|
break;
|
|
|
|
}
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2020-07-31 04:24:45 +08:00
|
|
|
return timeout;
|
2017-03-02 03:22:12 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
|
|
|
|
|
|
|
|
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
|
|
|
blk_mq_freeze_queue_wait(ns->queue);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
|
|
|
|
|
|
|
|
void nvme_start_freeze(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
2017-03-27 20:06:57 +08:00
|
|
|
blk_freeze_queue_start(ns->queue);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2017-03-02 03:22:12 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_start_freeze);
|
|
|
|
|
2016-01-05 00:10:57 +08:00
|
|
|
void nvme_stop_queues(struct nvme_ctrl *ctrl)
|
2015-12-24 22:26:59 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2016-10-29 08:23:40 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
2021-10-14 16:17:07 +08:00
|
|
|
nvme_stop_ns_queue(ns);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2015-12-24 22:26:59 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_stop_queues);
|
2015-12-24 22:26:59 +08:00
|
|
|
|
2016-01-05 00:10:57 +08:00
|
|
|
void nvme_start_queues(struct nvme_ctrl *ctrl)
|
2015-12-24 22:26:59 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2018-02-12 20:54:46 +08:00
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
2017-07-04 23:16:58 +08:00
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
2021-10-14 16:17:07 +08:00
|
|
|
nvme_start_ns_queue(ns);
|
2018-02-12 20:54:46 +08:00
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2015-12-24 22:26:59 +08:00
|
|
|
}
|
2016-02-11 02:03:32 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nvme_start_queues);
|
2015-12-24 22:26:59 +08:00
|
|
|
|
2021-10-14 16:17:05 +08:00
|
|
|
void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2021-10-14 16:17:08 +08:00
|
|
|
if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
|
|
|
|
blk_mq_quiesce_queue(ctrl->admin_q);
|
2021-11-09 15:11:44 +08:00
|
|
|
else
|
|
|
|
blk_mq_wait_quiesce_done(ctrl->admin_q);
|
2021-10-14 16:17:05 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
|
|
|
|
|
|
|
|
void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
2021-10-14 16:17:08 +08:00
|
|
|
if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
|
|
|
|
blk_mq_unquiesce_queue(ctrl->admin_q);
|
2021-10-14 16:17:05 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
|
|
|
|
|
2020-10-22 10:15:00 +08:00
|
|
|
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
|
2019-05-15 04:46:09 +08:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
|
|
|
down_read(&ctrl->namespaces_rwsem);
|
|
|
|
list_for_each_entry(ns, &ctrl->namespaces, list)
|
|
|
|
blk_sync_queue(ns->queue);
|
|
|
|
up_read(&ctrl->namespaces_rwsem);
|
2020-10-22 10:15:00 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
|
2019-09-04 04:08:47 +08:00
|
|
|
|
2020-10-22 10:15:00 +08:00
|
|
|
void nvme_sync_queues(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
nvme_sync_io_queues(ctrl);
|
2019-09-04 04:08:47 +08:00
|
|
|
if (ctrl->admin_q)
|
|
|
|
blk_sync_queue(ctrl->admin_q);
|
2019-05-15 04:46:09 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(nvme_sync_queues);
|
|
|
|
|
2020-09-17 09:11:02 +08:00
|
|
|
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
|
2020-07-25 01:25:15 +08:00
|
|
|
{
|
2020-09-17 09:11:02 +08:00
|
|
|
if (file->f_op != &nvme_dev_fops)
|
|
|
|
return NULL;
|
|
|
|
return file->private_data;
|
2020-07-25 01:25:15 +08:00
|
|
|
}
|
2020-09-17 09:11:02 +08:00
|
|
|
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
|
2020-07-25 01:25:15 +08:00
|
|
|
|
2019-04-30 23:36:52 +08:00
|
|
|
/*
|
|
|
|
* Check we didn't inadvertently grow the command structure sizes:
|
|
|
|
*/
|
|
|
|
static inline void _nvme_check_size(void)
|
|
|
|
{
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
|
2020-06-30 03:06:41 +08:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
|
2021-03-25 07:18:05 +08:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
|
2019-04-30 23:36:52 +08:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-04-30 23:37:43 +08:00
|
|
|
/*
 * Module init: create the three driver workqueues, reserve the character
 * device regions for controllers and generic namespace devices, and create
 * the "nvme", "nvme-subsystem" and "nvme-generic" classes.
 *
 * Returns 0 on success.  On failure everything set up so far is torn down
 * in reverse order and a negative errno is returned (-ENOMEM when a
 * workqueue cannot be allocated).
 */
static int __init nvme_core_init(void)
{
	const unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
	int err = -ENOMEM;

	_nvme_check_size();

	nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
	if (!nvme_wq)
		goto out;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	err = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
			NVME_MINORS, "nvme");
	if (err < 0)
		goto destroy_delete_wq;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		err = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}
	nvme_class->dev_uevent = nvme_class_uevent;

	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
	if (IS_ERR(nvme_subsys_class)) {
		err = PTR_ERR(nvme_subsys_class);
		goto destroy_class;
	}

	err = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
			"nvme-generic");
	if (err < 0)
		goto destroy_subsys_class;

	nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
	if (IS_ERR(nvme_ns_chr_class)) {
		err = PTR_ERR(nvme_ns_chr_class);
		goto unregister_generic_ns;
	}

	return 0;

	/* Error unwind: strictly the reverse of the setup order above. */
unregister_generic_ns:
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
	class_destroy(nvme_subsys_class);
destroy_class:
	class_destroy(nvme_class);
unregister_chrdev:
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
	destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
out:
	return err;
}
|
|
|
|
|
2019-04-30 23:37:43 +08:00
|
|
|
/*
 * Module exit: destroy the device classes, release both character device
 * regions, destroy the workqueues, and finally destroy the IDAs used for
 * namespace char-device minors and controller instances.
 */
static void __exit nvme_core_exit(void)
{
	/* Device classes. */
	class_destroy(nvme_ns_chr_class);
	class_destroy(nvme_subsys_class);
	class_destroy(nvme_class);

	/* Character device number regions. */
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);

	/* Workqueues. */
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);

	/* ID allocators. */
	ida_destroy(&nvme_ns_chr_minor_ida);
	ida_destroy(&nvme_instance_ida);
}
|
2016-02-11 02:03:32 +08:00
|
|
|
|
|
|
|
/* Module metadata and init/exit entry points. */
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);
|