OpenCloudOS-Kernel/include/linux/nvme.h

1069 lines
24 KiB
C
Raw Normal View History

/*
* Definitions for the NVM Express interface
* Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _LINUX_NVME_H
#define _LINUX_NVME_H
#include <linux/types.h>
#include <linux/uuid.h>
/* NQN names in commands fields specified one size */
#define NVMF_NQN_FIELD_LEN 256
/* However the max length of a qualified name is another size */
#define NVMF_NQN_SIZE 223
#define NVMF_TRSVCID_SIZE 32
#define NVMF_TRADDR_SIZE 256
#define NVMF_TSAS_SIZE 256
#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
#define NVME_RDMA_IP_PORT 4420
enum nvme_subsys_type {
NVME_NQN_DISC = 1, /* Discovery type target subsystem */
NVME_NQN_NVME = 2, /* NVME type target subsystem */
};
/* Address Family codes for Discovery Log Page entry ADRFAM field */
enum {
NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */
NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */
NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */
NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */
NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */
};
/* Transport Type codes for Discovery Log Page entry TRTYPE field */
enum {
NVMF_TRTYPE_RDMA = 1, /* RDMA */
NVMF_TRTYPE_FC = 2, /* Fibre Channel */
NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */
NVMF_TRTYPE_MAX,
};
/* Transport Requirements codes for Discovery Log Page entry TREQ field */
enum {
NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */
NVMF_TREQ_REQUIRED = 1, /* Required */
NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */
};
/* RDMA QP Service Type codes for Discovery Log Page entry TSAS
* RDMA_QPTYPE field
*/
enum {
NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */
NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */
};
/* RDMA QP Service Type codes for Discovery Log Page entry TSAS
* RDMA_QPTYPE field
*/
enum {
NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */
NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */
NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */
NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */
NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */
};
/* RDMA Connection Management Service Type codes for Discovery Log Page
* entry TSAS RDMA_CMS field
*/
enum {
NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */
};
#define NVMF_AQ_DEPTH 32
enum {
NVME_REG_CAP = 0x0000, /* Controller Capabilities */
NVME_REG_VS = 0x0008, /* Version */
NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */
NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */
NVME_REG_CC = 0x0014, /* Controller Configuration */
NVME_REG_CSTS = 0x001c, /* Controller Status */
NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */
NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */
NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */
NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */
NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */
NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */
};
#define NVME_CAP_MQES(cap) ((cap) & 0xffff)
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)
#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf)
#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7)
#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff)
#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff)
#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf)
#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10)
#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8)
#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4)
#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2)
#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1)
/*
* Submission and Completion Queue Entry Sizes for the NVM command set.
* (In bytes and specified as a power of two (2^n)).
*/
#define NVME_NVM_IOSQES 6
#define NVME_NVM_IOCQES 4
enum {
NVME_CC_ENABLE = 1 << 0,
NVME_CC_CSS_NVM = 0 << 4,
NVME_CC_MPS_SHIFT = 7,
NVME_CC_ARB_RR = 0 << 11,
NVME_CC_ARB_WRRU = 1 << 11,
NVME_CC_ARB_VS = 7 << 11,
NVME_CC_SHN_NONE = 0 << 14,
NVME_CC_SHN_NORMAL = 1 << 14,
NVME_CC_SHN_ABRUPT = 2 << 14,
NVME_CC_SHN_MASK = 3 << 14,
NVME_CC_IOSQES = NVME_NVM_IOSQES << 16,
NVME_CC_IOCQES = NVME_NVM_IOCQES << 20,
NVME_CSTS_RDY = 1 << 0,
NVME_CSTS_CFS = 1 << 1,
NVME_CSTS_NSSRO = 1 << 4,
NVME_CSTS_SHST_NORMAL = 0 << 2,
NVME_CSTS_SHST_OCCUR = 1 << 2,
NVME_CSTS_SHST_CMPLT = 2 << 2,
NVME_CSTS_SHST_MASK = 3 << 2,
};
struct nvme_id_power_state {
__le16 max_power; /* centiwatts */
__u8 rsvd2;
__u8 flags;
__le32 entry_lat; /* microseconds */
__le32 exit_lat; /* microseconds */
__u8 read_tput;
__u8 read_lat;
__u8 write_tput;
__u8 write_lat;
__le16 idle_power;
__u8 idle_scale;
__u8 rsvd19;
__le16 active_power;
__u8 active_work_scale;
__u8 rsvd23[9];
};
enum {
NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0,
NVME_PS_FLAGS_NON_OP_STATE = 1 << 1,
};
struct nvme_id_ctrl {
__le16 vid;
__le16 ssvid;
char sn[20];
char mn[40];
char fr[8];
__u8 rab;
__u8 ieee[3];
__u8 cmic;
__u8 mdts;
__le16 cntlid;
__le32 ver;
__le32 rtd3r;
__le32 rtd3e;
__le32 oaes;
__le32 ctratt;
__u8 rsvd100[156];
__le16 oacs;
__u8 acl;
__u8 aerl;
__u8 frmw;
__u8 lpa;
__u8 elpe;
__u8 npss;
__u8 avscc;
__u8 apsta;
__le16 wctemp;
__le16 cctemp;
__le16 mtfa;
__le32 hmpre;
__le32 hmmin;
__u8 tnvmcap[16];
__u8 unvmcap[16];
__le32 rpmbs;
__u8 rsvd316[4];
__le16 kas;
__u8 rsvd322[190];
__u8 sqes;
__u8 cqes;
__le16 maxcmd;
__le32 nn;
__le16 oncs;
__le16 fuses;
__u8 fna;
__u8 vwc;
__le16 awun;
__le16 awupf;
__u8 nvscc;
__u8 rsvd531;
__le16 acwu;
__u8 rsvd534[2];
__le32 sgls;
__u8 rsvd540[228];
char subnqn[256];
__u8 rsvd1024[768];
__le32 ioccsz;
__le32 iorcsz;
__le16 icdoff;
__u8 ctrattr;
__u8 msdbd;
__u8 rsvd1804[244];
struct nvme_id_power_state psd[32];
__u8 vs[1024];
};
enum {
NVME_CTRL_ONCS_COMPARE = 1 << 0,
NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1,
NVME_CTRL_ONCS_DSM = 1 << 2,
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
nvme: improve performance for virtual NVMe devices This change provides a mechanism to reduce the number of MMIO doorbell writes for the NVMe driver. When running in a virtualized environment like QEMU, the cost of an MMIO is quite hefy here. The main idea for the patch is provide the device two memory location locations: 1) to store the doorbell values so they can be lookup without the doorbell MMIO write 2) to store an event index. I believe the doorbell value is obvious, the event index not so much. Similar to the virtio specification, the virtual device can tell the driver (guest OS) not to write MMIO unless you are writing past this value. FYI: doorbell values are written by the nvme driver (guest OS) and the event index is written by the virtual device (host OS). The patch implements a new admin command that will communicate where these two memory locations reside. If the command fails, the nvme driver will work as before without any optimizations. Contributions: Eric Northup <digitaleric@google.com> Frank Swiderski <fes@google.com> Ted Tso <tytso@mit.edu> Keith Busch <keith.busch@intel.com> Just to give an idea on the performance boost with the vendor extension: Running fio [1], a stock NVMe driver I get about 200K read IOPs with my vendor patch I get about 1000K read IOPs. This was running with a null device i.e. the backing device simply returned success on every read IO request. [1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson <rlnelson@google.com> [mlin: port for upstream] Signed-off-by: Ming Lin <mlin@kernel.org> [koike: updated for upstream] Signed-off-by: Helen Koike <helen.koike@collabora.co.uk> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <keith.busch@intel.com>
2017-04-10 23:51:07 +08:00
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
struct nvme_lbaf {
__le16 ms;
__u8 ds;
__u8 rp;
};
struct nvme_id_ns {
__le64 nsze;
__le64 ncap;
__le64 nuse;
__u8 nsfeat;
__u8 nlbaf;
__u8 flbas;
__u8 mc;
__u8 dpc;
__u8 dps;
__u8 nmic;
__u8 rescap;
__u8 fpi;
__u8 rsvd33;
__le16 nawun;
__le16 nawupf;
__le16 nacwu;
__le16 nabsn;
__le16 nabo;
__le16 nabspf;
__u16 rsvd46;
__u8 nvmcap[16];
__u8 rsvd64[40];
__u8 nguid[16];
__u8 eui64[8];
struct nvme_lbaf lbaf[16];
__u8 rsvd192[192];
__u8 vs[3712];
};
enum {
NVME_ID_CNS_NS = 0x00,
NVME_ID_CNS_CTRL = 0x01,
NVME_ID_CNS_NS_ACTIVE_LIST = 0x02,
NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
NVME_ID_CNS_NS_PRESENT = 0x11,
NVME_ID_CNS_CTRL_NS_LIST = 0x12,
NVME_ID_CNS_CTRL_LIST = 0x13,
};
enum {
NVME_NS_FEAT_THIN = 1 << 0,
NVME_NS_FLBAS_LBA_MASK = 0xf,
NVME_NS_FLBAS_META_EXT = 0x10,
NVME_LBAF_RP_BEST = 0,
NVME_LBAF_RP_BETTER = 1,
NVME_LBAF_RP_GOOD = 2,
NVME_LBAF_RP_DEGRADED = 3,
NVME_NS_DPC_PI_LAST = 1 << 4,
NVME_NS_DPC_PI_FIRST = 1 << 3,
NVME_NS_DPC_PI_TYPE3 = 1 << 2,
NVME_NS_DPC_PI_TYPE2 = 1 << 1,
NVME_NS_DPC_PI_TYPE1 = 1 << 0,
NVME_NS_DPS_PI_FIRST = 1 << 3,
NVME_NS_DPS_PI_MASK = 0x7,
NVME_NS_DPS_PI_TYPE1 = 1,
NVME_NS_DPS_PI_TYPE2 = 2,
NVME_NS_DPS_PI_TYPE3 = 3,
};
struct nvme_smart_log {
__u8 critical_warning;
__u8 temperature[2];
__u8 avail_spare;
__u8 spare_thresh;
__u8 percent_used;
__u8 rsvd6[26];
__u8 data_units_read[16];
__u8 data_units_written[16];
__u8 host_reads[16];
__u8 host_writes[16];
__u8 ctrl_busy_time[16];
__u8 power_cycles[16];
__u8 power_on_hours[16];
__u8 unsafe_shutdowns[16];
__u8 media_errors[16];
__u8 num_err_log_entries[16];
__le32 warning_temp_time;
__le32 critical_comp_time;
__le16 temp_sensor[8];
__u8 rsvd216[296];
};
enum {
NVME_SMART_CRIT_SPARE = 1 << 0,
NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
NVME_SMART_CRIT_RELIABILITY = 1 << 2,
NVME_SMART_CRIT_MEDIA = 1 << 3,
NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4,
};
enum {
NVME_AER_NOTICE_NS_CHANGED = 0x0002,
};
struct nvme_lba_range_type {
__u8 type;
__u8 attributes;
__u8 rsvd2[14];
__u64 slba;
__u64 nlb;
__u8 guid[16];
__u8 rsvd48[16];
};
enum {
NVME_LBART_TYPE_FS = 0x01,
NVME_LBART_TYPE_RAID = 0x02,
NVME_LBART_TYPE_CACHE = 0x03,
NVME_LBART_TYPE_SWAP = 0x04,
NVME_LBART_ATTRIB_TEMP = 1 << 0,
NVME_LBART_ATTRIB_HIDE = 1 << 1,
};
struct nvme_reservation_status {
__le32 gen;
__u8 rtype;
__u8 regctl[2];
__u8 resv5[2];
__u8 ptpls;
__u8 resv10[13];
struct {
__le16 cntlid;
__u8 rcsts;
__u8 resv3[5];
__le64 hostid;
__le64 rkey;
} regctl_ds[];
};
enum nvme_async_event_type {
NVME_AER_TYPE_ERROR = 0,
NVME_AER_TYPE_SMART = 1,
NVME_AER_TYPE_NOTICE = 2,
};
/* I/O commands */
enum nvme_opcode {
nvme_cmd_flush = 0x00,
nvme_cmd_write = 0x01,
nvme_cmd_read = 0x02,
nvme_cmd_write_uncor = 0x04,
nvme_cmd_compare = 0x05,
nvme_cmd_write_zeroes = 0x08,
nvme_cmd_dsm = 0x09,
nvme_cmd_resv_register = 0x0d,
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
nvme_cmd_resv_release = 0x15,
};
/*
* Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier
*
* @NVME_SGL_FMT_ADDRESS: absolute address of the data block
* @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block
* @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation
* request subtype
*/
enum {
NVME_SGL_FMT_ADDRESS = 0x00,
NVME_SGL_FMT_OFFSET = 0x01,
NVME_SGL_FMT_INVALIDATE = 0x0f,
};
/*
* Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier
*
* For struct nvme_sgl_desc:
* @NVME_SGL_FMT_DATA_DESC: data block descriptor
* @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor
* @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor
*
* For struct nvme_keyed_sgl_desc:
* @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor
*/
enum {
NVME_SGL_FMT_DATA_DESC = 0x00,
NVME_SGL_FMT_SEG_DESC = 0x02,
NVME_SGL_FMT_LAST_SEG_DESC = 0x03,
NVME_KEY_SGL_FMT_DATA_DESC = 0x04,
};
struct nvme_sgl_desc {
__le64 addr;
__le32 length;
__u8 rsvd[3];
__u8 type;
};
struct nvme_keyed_sgl_desc {
__le64 addr;
__u8 length[3];
__u8 key[4];
__u8 type;
};
union nvme_data_ptr {
struct {
__le64 prp1;
__le64 prp2;
};
struct nvme_sgl_desc sgl;
struct nvme_keyed_sgl_desc ksgl;
};
/*
* Lowest two bits of our flags field (FUSE field in the spec):
*
* @NVME_CMD_FUSE_FIRST: Fused Operation, first command
* @NVME_CMD_FUSE_SECOND: Fused Operation, second command
*
* Highest two bits in our flags field (PSDT field in the spec):
*
* @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer,
* If used, MPTR contains addr of single physical buffer (byte aligned).
* @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer,
* If used, MPTR contains an address of an SGL segment containing
* exactly 1 SGL descriptor (qword aligned).
*/
enum {
NVME_CMD_FUSE_FIRST = (1 << 0),
NVME_CMD_FUSE_SECOND = (1 << 1),
NVME_CMD_SGL_METABUF = (1 << 6),
NVME_CMD_SGL_METASEG = (1 << 7),
NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG,
};
struct nvme_common_command {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__le32 cdw2[2];
__le64 metadata;
union nvme_data_ptr dptr;
__le32 cdw10[6];
};
struct nvme_rw_command {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2;
__le64 metadata;
union nvme_data_ptr dptr;
__le64 slba;
__le16 length;
__le16 control;
__le32 dsmgmt;
__le32 reftag;
__le16 apptag;
__le16 appmask;
};
enum {
NVME_RW_LR = 1 << 15,
NVME_RW_FUA = 1 << 14,
NVME_RW_DSM_FREQ_UNSPEC = 0,
NVME_RW_DSM_FREQ_TYPICAL = 1,
NVME_RW_DSM_FREQ_RARE = 2,
NVME_RW_DSM_FREQ_READS = 3,
NVME_RW_DSM_FREQ_WRITES = 4,
NVME_RW_DSM_FREQ_RW = 5,
NVME_RW_DSM_FREQ_ONCE = 6,
NVME_RW_DSM_FREQ_PREFETCH = 7,
NVME_RW_DSM_FREQ_TEMP = 8,
NVME_RW_DSM_LATENCY_NONE = 0 << 4,
NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
NVME_RW_DSM_LATENCY_NORM = 2 << 4,
NVME_RW_DSM_LATENCY_LOW = 3 << 4,
NVME_RW_DSM_SEQ_REQ = 1 << 6,
NVME_RW_DSM_COMPRESSED = 1 << 7,
NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
};
struct nvme_dsm_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__le32 nr;
__le32 attributes;
__u32 rsvd12[4];
};
enum {
NVME_DSMGMT_IDR = 1 << 0,
NVME_DSMGMT_IDW = 1 << 1,
NVME_DSMGMT_AD = 1 << 2,
};
#define NVME_DSM_MAX_RANGES 256
struct nvme_dsm_range {
__le32 cattr;
__le32 nlb;
__le64 slba;
};
struct nvme_write_zeroes_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2;
__le64 metadata;
union nvme_data_ptr dptr;
__le64 slba;
__le16 length;
__le16 control;
__le32 dsmgmt;
__le32 reftag;
__le16 apptag;
__le16 appmask;
};
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-08 02:08:45 +08:00
/* Features */
struct nvme_feat_auto_pst {
__le64 entries[32];
};
enum {
NVME_HOST_MEM_ENABLE = (1 << 0),
NVME_HOST_MEM_RETURN = (1 << 1),
};
/* Admin commands */
enum nvme_admin_opcode {
nvme_admin_delete_sq = 0x00,
nvme_admin_create_sq = 0x01,
nvme_admin_get_log_page = 0x02,
nvme_admin_delete_cq = 0x04,
nvme_admin_create_cq = 0x05,
nvme_admin_identify = 0x06,
nvme_admin_abort_cmd = 0x08,
nvme_admin_set_features = 0x09,
nvme_admin_get_features = 0x0a,
nvme_admin_async_event = 0x0c,
nvme_admin_ns_mgmt = 0x0d,
nvme_admin_activate_fw = 0x10,
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
nvme: improve performance for virtual NVMe devices This change provides a mechanism to reduce the number of MMIO doorbell writes for the NVMe driver. When running in a virtualized environment like QEMU, the cost of an MMIO is quite hefy here. The main idea for the patch is provide the device two memory location locations: 1) to store the doorbell values so they can be lookup without the doorbell MMIO write 2) to store an event index. I believe the doorbell value is obvious, the event index not so much. Similar to the virtio specification, the virtual device can tell the driver (guest OS) not to write MMIO unless you are writing past this value. FYI: doorbell values are written by the nvme driver (guest OS) and the event index is written by the virtual device (host OS). The patch implements a new admin command that will communicate where these two memory locations reside. If the command fails, the nvme driver will work as before without any optimizations. Contributions: Eric Northup <digitaleric@google.com> Frank Swiderski <fes@google.com> Ted Tso <tytso@mit.edu> Keith Busch <keith.busch@intel.com> Just to give an idea on the performance boost with the vendor extension: Running fio [1], a stock NVMe driver I get about 200K read IOPs with my vendor patch I get about 1000K read IOPs. This was running with a null device i.e. the backing device simply returned success on every read IO request. [1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson <rlnelson@google.com> [mlin: port for upstream] Signed-off-by: Ming Lin <mlin@kernel.org> [koike: updated for upstream] Signed-off-by: Helen Koike <helen.koike@collabora.co.uk> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <keith.busch@intel.com>
2017-04-10 23:51:07 +08:00
nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
};
enum {
NVME_QUEUE_PHYS_CONTIG = (1 << 0),
NVME_CQ_IRQ_ENABLED = (1 << 1),
NVME_SQ_PRIO_URGENT = (0 << 1),
NVME_SQ_PRIO_HIGH = (1 << 1),
NVME_SQ_PRIO_MEDIUM = (2 << 1),
NVME_SQ_PRIO_LOW = (3 << 1),
NVME_FEAT_ARBITRATION = 0x01,
NVME_FEAT_POWER_MGMT = 0x02,
NVME_FEAT_LBA_RANGE = 0x03,
NVME_FEAT_TEMP_THRESH = 0x04,
NVME_FEAT_ERR_RECOVERY = 0x05,
NVME_FEAT_VOLATILE_WC = 0x06,
NVME_FEAT_NUM_QUEUES = 0x07,
NVME_FEAT_IRQ_COALESCE = 0x08,
NVME_FEAT_IRQ_CONFIG = 0x09,
NVME_FEAT_WRITE_ATOMIC = 0x0a,
NVME_FEAT_ASYNC_EVENT = 0x0b,
NVME_FEAT_AUTO_PST = 0x0c,
NVME_FEAT_HOST_MEM_BUF = 0x0d,
NVME_FEAT_KATO = 0x0f,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
NVME_FEAT_RESV_PERSIST = 0x83,
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
NVME_FWACT_REPL_ACTV = (1 << 3),
NVME_FWACT_ACTV = (2 << 3),
};
struct nvme_identify {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__u8 cns;
__u8 rsvd3;
__le16 ctrlid;
__u32 rsvd11[5];
};
struct nvme_features {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__le32 fid;
__le32 dword11;
__le32 dword12;
__le32 dword13;
__le32 dword14;
__le32 dword15;
};
struct nvme_host_mem_buf_desc {
__le64 addr;
__le32 size;
__u32 rsvd;
};
struct nvme_create_cq {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
__le64 prp1;
__u64 rsvd8;
__le16 cqid;
__le16 qsize;
__le16 cq_flags;
__le16 irq_vector;
__u32 rsvd12[4];
};
struct nvme_create_sq {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
__le64 prp1;
__u64 rsvd8;
__le16 sqid;
__le16 qsize;
__le16 sq_flags;
__le16 cqid;
__u32 rsvd12[4];
};
struct nvme_delete_queue {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[9];
__le16 qid;
__u16 rsvd10;
__u32 rsvd11[5];
};
struct nvme_abort_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[9];
__le16 sqid;
__u16 cid;
__u32 rsvd11[5];
};
struct nvme_download_firmware {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
union nvme_data_ptr dptr;
__le32 numd;
__le32 offset;
__u32 rsvd12[4];
};
struct nvme_format_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[4];
__le32 cdw10;
__u32 rsvd11[5];
};
struct nvme_get_log_page_command {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__u8 lid;
__u8 rsvd10;
__le16 numdl;
__le16 numdu;
__u16 rsvd11;
__le32 lpol;
__le32 lpou;
__u32 rsvd14[2];
};
/*
* Fabrics subcommands.
*/
enum nvmf_fabrics_opcode {
nvme_fabrics_command = 0x7f,
};
enum nvmf_capsule_command {
nvme_fabrics_type_property_set = 0x00,
nvme_fabrics_type_connect = 0x01,
nvme_fabrics_type_property_get = 0x04,
};
struct nvmf_common_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[35];
__u8 ts[24];
};
/*
* The legal cntlid range a NVMe Target will provide.
* Note that cntlid of value 0 is considered illegal in the fabrics world.
* Devices based on earlier specs did not have the subsystem concept;
* therefore, those devices had their cntlid value set to 0 as a result.
*/
#define NVME_CNTLID_MIN 1
#define NVME_CNTLID_MAX 0xffef
#define NVME_CNTLID_DYNAMIC 0xffff
#define MAX_DISC_LOGS 255
/* Discovery log page entry */
struct nvmf_disc_rsp_page_entry {
__u8 trtype;
__u8 adrfam;
__u8 subtype;
__u8 treq;
__le16 portid;
__le16 cntlid;
__le16 asqsz;
__u8 resv8[22];
char trsvcid[NVMF_TRSVCID_SIZE];
__u8 resv64[192];
char subnqn[NVMF_NQN_FIELD_LEN];
char traddr[NVMF_TRADDR_SIZE];
union tsas {
char common[NVMF_TSAS_SIZE];
struct rdma {
__u8 qptype;
__u8 prtype;
__u8 cms;
__u8 resv3[5];
__u16 pkey;
__u8 resv10[246];
} rdma;
} tsas;
};
/* Discovery log page header */
struct nvmf_disc_rsp_page_hdr {
__le64 genctr;
__le64 numrec;
__le16 recfmt;
__u8 resv14[1006];
struct nvmf_disc_rsp_page_entry entries[0];
};
struct nvmf_connect_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[19];
union nvme_data_ptr dptr;
__le16 recfmt;
__le16 qid;
__le16 sqsize;
__u8 cattr;
__u8 resv3;
__le32 kato;
__u8 resv4[12];
};
struct nvmf_connect_data {
uuid_t hostid;
__le16 cntlid;
char resv4[238];
char subsysnqn[NVMF_NQN_FIELD_LEN];
char hostnqn[NVMF_NQN_FIELD_LEN];
char resv5[256];
};
struct nvmf_property_set_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[35];
__u8 attrib;
__u8 resv3[3];
__le32 offset;
__le64 value;
__u8 resv4[8];
};
struct nvmf_property_get_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[35];
__u8 attrib;
__u8 resv3[3];
__le32 offset;
__u8 resv4[16];
};
nvme: improve performance for virtual NVMe devices This change provides a mechanism to reduce the number of MMIO doorbell writes for the NVMe driver. When running in a virtualized environment like QEMU, the cost of an MMIO is quite hefy here. The main idea for the patch is provide the device two memory location locations: 1) to store the doorbell values so they can be lookup without the doorbell MMIO write 2) to store an event index. I believe the doorbell value is obvious, the event index not so much. Similar to the virtio specification, the virtual device can tell the driver (guest OS) not to write MMIO unless you are writing past this value. FYI: doorbell values are written by the nvme driver (guest OS) and the event index is written by the virtual device (host OS). The patch implements a new admin command that will communicate where these two memory locations reside. If the command fails, the nvme driver will work as before without any optimizations. Contributions: Eric Northup <digitaleric@google.com> Frank Swiderski <fes@google.com> Ted Tso <tytso@mit.edu> Keith Busch <keith.busch@intel.com> Just to give an idea on the performance boost with the vendor extension: Running fio [1], a stock NVMe driver I get about 200K read IOPs with my vendor patch I get about 1000K read IOPs. This was running with a null device i.e. the backing device simply returned success on every read IO request. [1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson <rlnelson@google.com> [mlin: port for upstream] Signed-off-by: Ming Lin <mlin@kernel.org> [koike: updated for upstream] Signed-off-by: Helen Koike <helen.koike@collabora.co.uk> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <keith.busch@intel.com>
2017-04-10 23:51:07 +08:00
struct nvme_dbbuf {
__u8 opcode;
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
__le64 prp1;
__le64 prp2;
__u32 rsvd12[6];
};
struct nvme_command {
union {
struct nvme_common_command common;
struct nvme_rw_command rw;
struct nvme_identify identify;
struct nvme_features features;
struct nvme_create_cq create_cq;
struct nvme_create_sq create_sq;
struct nvme_delete_queue delete_queue;
struct nvme_download_firmware dlfw;
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
struct nvme_write_zeroes_cmd write_zeroes;
struct nvme_abort_cmd abort;
struct nvme_get_log_page_command get_log_page;
struct nvmf_common_command fabrics;
struct nvmf_connect_command connect;
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
nvme: improve performance for virtual NVMe devices This change provides a mechanism to reduce the number of MMIO doorbell writes for the NVMe driver. When running in a virtualized environment like QEMU, the cost of an MMIO is quite hefy here. The main idea for the patch is provide the device two memory location locations: 1) to store the doorbell values so they can be lookup without the doorbell MMIO write 2) to store an event index. I believe the doorbell value is obvious, the event index not so much. Similar to the virtio specification, the virtual device can tell the driver (guest OS) not to write MMIO unless you are writing past this value. FYI: doorbell values are written by the nvme driver (guest OS) and the event index is written by the virtual device (host OS). The patch implements a new admin command that will communicate where these two memory locations reside. If the command fails, the nvme driver will work as before without any optimizations. Contributions: Eric Northup <digitaleric@google.com> Frank Swiderski <fes@google.com> Ted Tso <tytso@mit.edu> Keith Busch <keith.busch@intel.com> Just to give an idea on the performance boost with the vendor extension: Running fio [1], a stock NVMe driver I get about 200K read IOPs with my vendor patch I get about 1000K read IOPs. This was running with a null device i.e. the backing device simply returned success on every read IO request. [1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson <rlnelson@google.com> [mlin: port for upstream] Signed-off-by: Ming Lin <mlin@kernel.org> [koike: updated for upstream] Signed-off-by: Helen Koike <helen.koike@collabora.co.uk> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <keith.busch@intel.com>
2017-04-10 23:51:07 +08:00
struct nvme_dbbuf dbbuf;
};
};
static inline bool nvme_is_write(struct nvme_command *cmd)
{
/*
* What a mess...
*
* Why can't we simply have a Fabrics In and Fabrics out command?
*/
if (unlikely(cmd->common.opcode == nvme_fabrics_command))
return cmd->fabrics.opcode & 1;
return cmd->common.opcode & 1;
}
enum {
/*
* Generic Command Status:
*/
NVME_SC_SUCCESS = 0x0,
NVME_SC_INVALID_OPCODE = 0x1,
NVME_SC_INVALID_FIELD = 0x2,
NVME_SC_CMDID_CONFLICT = 0x3,
NVME_SC_DATA_XFER_ERROR = 0x4,
NVME_SC_POWER_LOSS = 0x5,
NVME_SC_INTERNAL = 0x6,
NVME_SC_ABORT_REQ = 0x7,
NVME_SC_ABORT_QUEUE = 0x8,
NVME_SC_FUSED_FAIL = 0x9,
NVME_SC_FUSED_MISSING = 0xa,
NVME_SC_INVALID_NS = 0xb,
NVME_SC_CMD_SEQ_ERROR = 0xc,
NVME_SC_SGL_INVALID_LAST = 0xd,
NVME_SC_SGL_INVALID_COUNT = 0xe,
NVME_SC_SGL_INVALID_DATA = 0xf,
NVME_SC_SGL_INVALID_METADATA = 0x10,
NVME_SC_SGL_INVALID_TYPE = 0x11,
NVME_SC_SGL_INVALID_OFFSET = 0x16,
NVME_SC_SGL_INVALID_SUBTYPE = 0x17,
NVME_SC_LBA_RANGE = 0x80,
NVME_SC_CAP_EXCEEDED = 0x81,
NVME_SC_NS_NOT_READY = 0x82,
NVME_SC_RESERVATION_CONFLICT = 0x83,
/*
* Command Specific Status:
*/
NVME_SC_CQ_INVALID = 0x100,
NVME_SC_QID_INVALID = 0x101,
NVME_SC_QUEUE_SIZE = 0x102,
NVME_SC_ABORT_LIMIT = 0x103,
NVME_SC_ABORT_MISSING = 0x104,
NVME_SC_ASYNC_LIMIT = 0x105,
NVME_SC_FIRMWARE_SLOT = 0x106,
NVME_SC_FIRMWARE_IMAGE = 0x107,
NVME_SC_INVALID_VECTOR = 0x108,
NVME_SC_INVALID_LOG_PAGE = 0x109,
NVME_SC_INVALID_FORMAT = 0x10a,
NVME_SC_FW_NEEDS_CONV_RESET = 0x10b,
NVME_SC_INVALID_QUEUE = 0x10c,
NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d,
NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e,
NVME_SC_FEATURE_NOT_PER_NS = 0x10f,
NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110,
NVME_SC_FW_NEEDS_RESET = 0x111,
NVME_SC_FW_NEEDS_MAX_TIME = 0x112,
NVME_SC_FW_ACIVATE_PROHIBITED = 0x113,
NVME_SC_OVERLAPPING_RANGE = 0x114,
NVME_SC_NS_INSUFFICENT_CAP = 0x115,
NVME_SC_NS_ID_UNAVAILABLE = 0x116,
NVME_SC_NS_ALREADY_ATTACHED = 0x118,
NVME_SC_NS_IS_PRIVATE = 0x119,
NVME_SC_NS_NOT_ATTACHED = 0x11a,
NVME_SC_THIN_PROV_NOT_SUPP = 0x11b,
NVME_SC_CTRL_LIST_INVALID = 0x11c,
/*
* I/O Command Set Specific - NVM commands:
*/
NVME_SC_BAD_ATTRIBUTES = 0x180,
NVME_SC_INVALID_PI = 0x181,
NVME_SC_READ_ONLY = 0x182,
NVME_SC_ONCS_NOT_SUPPORTED = 0x183,
/*
* I/O Command Set Specific - Fabrics commands:
*/
NVME_SC_CONNECT_FORMAT = 0x180,
NVME_SC_CONNECT_CTRL_BUSY = 0x181,
NVME_SC_CONNECT_INVALID_PARAM = 0x182,
NVME_SC_CONNECT_RESTART_DISC = 0x183,
NVME_SC_CONNECT_INVALID_HOST = 0x184,
NVME_SC_DISCOVERY_RESTART = 0x190,
NVME_SC_AUTH_REQUIRED = 0x191,
/*
* Media and Data Integrity Errors:
*/
NVME_SC_WRITE_FAULT = 0x280,
NVME_SC_READ_ERROR = 0x281,
NVME_SC_GUARD_CHECK = 0x282,
NVME_SC_APPTAG_CHECK = 0x283,
NVME_SC_REFTAG_CHECK = 0x284,
NVME_SC_COMPARE_FAILED = 0x285,
NVME_SC_ACCESS_DENIED = 0x286,
NVME_SC_UNWRITTEN_BLOCK = 0x287,
NVME_SC_DNR = 0x4000,
/*
* FC Transport-specific error status values for NVME commands
*
* Transport-specific status code values must be in the range 0xB0..0xBF
*/
/* Generic FC failure - catchall */
NVME_SC_FC_TRANSPORT_ERROR = 0x00B0,
/* I/O failure due to FC ABTS'd */
NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1,
};
struct nvme_completion {
/*
* Used by Admin and Fabrics commands to return data:
*/
union nvme_result {
__le16 u16;
__le32 u32;
__le64 u64;
} result;
__le16 sq_head; /* how much of this queue may be reclaimed */
__le16 sq_id; /* submission queue that generated this entry */
__u16 command_id; /* of the command which completed */
__le16 status; /* did the command fail, and if so, why? */
};
#define NVME_VS(major, minor, tertiary) \
(((major) << 16) | ((minor) << 8) | (tertiary))
#endif /* _LINUX_NVME_H */