OpenCloudOS-Kernel/drivers/nvdimm/label.c

1326 lines
35 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
*/
#include <linux/device.h>
#include <linux/ndctl.h>
#include <linux/uuid.h>
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/nd.h>
#include "nd-core.h"
#include "label.h"
#include "nd.h"
static guid_t nvdimm_btt_guid;
static guid_t nvdimm_btt2_guid;
static guid_t nvdimm_pfn_guid;
static guid_t nvdimm_dax_guid;
static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
static u32 best_seq(u32 a, u32 b)
{
a &= NSINDEX_SEQ_MASK;
b &= NSINDEX_SEQ_MASK;
if (a == 0 || a == b)
return b;
else if (b == 0)
return a;
else if (nd_inc_seq(a) == b)
return b;
else
return a;
}
unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd)
{
return ndd->nslabel_size;
}
static size_t __sizeof_namespace_index(u32 nslot)
{
return ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8),
NSINDEX_ALIGN);
}
static int __nvdimm_num_label_slots(struct nvdimm_drvdata *ndd,
size_t index_size)
{
return (ndd->nsarea.config_size - index_size * 2) /
sizeof_namespace_label(ndd);
}
int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
{
u32 tmp_nslot, n;
tmp_nslot = ndd->nsarea.config_size / sizeof_namespace_label(ndd);
n = __sizeof_namespace_index(tmp_nslot) / NSINDEX_ALIGN;
return __nvdimm_num_label_slots(ndd, NSINDEX_ALIGN * n);
}
size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
{
u32 nslot, space, size;
/*
* Per UEFI 2.7, the minimum size of the Label Storage Area is large
* enough to hold 2 index blocks and 2 labels. The minimum index
* block size is 256 bytes. The label size is 128 for namespaces
* prior to version 1.2 and at minimum 256 for version 1.2 and later.
*/
nslot = nvdimm_num_label_slots(ndd);
space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd);
size = __sizeof_namespace_index(nslot) * 2;
if (size <= space && nslot >= 2)
return size / 2;
dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n",
ndd->nsarea.config_size, sizeof_namespace_label(ndd));
return 0;
}
static int __nd_label_validate(struct nvdimm_drvdata *ndd)
{
/*
* On media label format consists of two index blocks followed
* by an array of labels. None of these structures are ever
* updated in place. A sequence number tracks the current
* active index and the next one to write, while labels are
* written to free slots.
*
* +------------+
* | |
* | nsindex0 |
* | |
* +------------+
* | |
* | nsindex1 |
* | |
* +------------+
* | label0 |
* +------------+
* | label1 |
* +------------+
* | |
* ....nslot...
* | |
* +------------+
* | labelN |
* +------------+
*/
struct nd_namespace_index *nsindex[] = {
to_namespace_index(ndd, 0),
to_namespace_index(ndd, 1),
};
const int num_index = ARRAY_SIZE(nsindex);
struct device *dev = ndd->dev;
bool valid[2] = { 0 };
int i, num_valid = 0;
u32 seq;
for (i = 0; i < num_index; i++) {
u32 nslot;
u8 sig[NSINDEX_SIG_LEN];
u64 sum_save, sum, size;
unsigned int version, labelsize;
memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN);
if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) {
dev_dbg(dev, "nsindex%d signature invalid\n", i);
continue;
}
/* label sizes larger than 128 arrived with v1.2 */
version = __le16_to_cpu(nsindex[i]->major) * 100
+ __le16_to_cpu(nsindex[i]->minor);
if (version >= 102)
labelsize = 1 << (7 + nsindex[i]->labelsize);
else
labelsize = 128;
if (labelsize != sizeof_namespace_label(ndd)) {
dev_dbg(dev, "nsindex%d labelsize %d invalid\n",
i, nsindex[i]->labelsize);
continue;
}
sum_save = __le64_to_cpu(nsindex[i]->checksum);
nsindex[i]->checksum = __cpu_to_le64(0);
sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1);
nsindex[i]->checksum = __cpu_to_le64(sum_save);
if (sum != sum_save) {
dev_dbg(dev, "nsindex%d checksum invalid\n", i);
continue;
}
seq = __le32_to_cpu(nsindex[i]->seq);
if ((seq & NSINDEX_SEQ_MASK) == 0) {
dev_dbg(dev, "nsindex%d sequence: %#x invalid\n", i, seq);
continue;
}
/* sanity check the index against expected values */
if (__le64_to_cpu(nsindex[i]->myoff)
!= i * sizeof_namespace_index(ndd)) {
dev_dbg(dev, "nsindex%d myoff: %#llx invalid\n",
i, (unsigned long long)
__le64_to_cpu(nsindex[i]->myoff));
continue;
}
if (__le64_to_cpu(nsindex[i]->otheroff)
!= (!i) * sizeof_namespace_index(ndd)) {
dev_dbg(dev, "nsindex%d otheroff: %#llx invalid\n",
i, (unsigned long long)
__le64_to_cpu(nsindex[i]->otheroff));
continue;
}
if (__le64_to_cpu(nsindex[i]->labeloff)
!= 2 * sizeof_namespace_index(ndd)) {
dev_dbg(dev, "nsindex%d labeloff: %#llx invalid\n",
i, (unsigned long long)
__le64_to_cpu(nsindex[i]->labeloff));
continue;
}
size = __le64_to_cpu(nsindex[i]->mysize);
if (size > sizeof_namespace_index(ndd)
|| size < sizeof(struct nd_namespace_index)) {
dev_dbg(dev, "nsindex%d mysize: %#llx invalid\n", i, size);
continue;
}
nslot = __le32_to_cpu(nsindex[i]->nslot);
if (nslot * sizeof_namespace_label(ndd)
+ 2 * sizeof_namespace_index(ndd)
> ndd->nsarea.config_size) {
dev_dbg(dev, "nsindex%d nslot: %u invalid, config_size: %#x\n",
i, nslot, ndd->nsarea.config_size);
continue;
}
valid[i] = true;
num_valid++;
}
switch (num_valid) {
case 0:
break;
case 1:
for (i = 0; i < num_index; i++)
if (valid[i])
return i;
/* can't have num_valid > 0 but valid[] = { false, false } */
WARN_ON(1);
break;
default:
/* pick the best index... */
seq = best_seq(__le32_to_cpu(nsindex[0]->seq),
__le32_to_cpu(nsindex[1]->seq));
if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK))
return 1;
else
return 0;
break;
}
return -1;
}
static int nd_label_validate(struct nvdimm_drvdata *ndd)
{
/*
* In order to probe for and validate namespace index blocks we
* need to know the size of the labels, and we can't trust the
* size of the labels until we validate the index blocks.
* Resolve this dependency loop by probing for known label
* sizes, but default to v1.2 256-byte namespace labels if
* discovery fails.
*/
int label_size[] = { 128, 256 };
int i, rc;
for (i = 0; i < ARRAY_SIZE(label_size); i++) {
ndd->nslabel_size = label_size[i];
rc = __nd_label_validate(ndd);
if (rc >= 0)
return rc;
}
return -1;
}
static void nd_label_copy(struct nvdimm_drvdata *ndd,
struct nd_namespace_index *dst,
struct nd_namespace_index *src)
{
/* just exit if either destination or source is NULL */
if (!dst || !src)
return;
memcpy(dst, src, sizeof_namespace_index(ndd));
}
static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd)
{
void *base = to_namespace_index(ndd, 0);
return base + 2 * sizeof_namespace_index(ndd);
}
static int to_slot(struct nvdimm_drvdata *ndd,
struct nd_namespace_label *nd_label)
{
unsigned long label, base;
label = (unsigned long) nd_label;
base = (unsigned long) nd_label_base(ndd);
return (label - base) / sizeof_namespace_label(ndd);
}
static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot)
{
unsigned long label, base;
base = (unsigned long) nd_label_base(ndd);
label = base + sizeof_namespace_label(ndd) * slot;
return (struct nd_namespace_label *) label;
}
#define for_each_clear_bit_le(bit, addr, size) \
for ((bit) = find_next_zero_bit_le((addr), (size), 0); \
(bit) < (size); \
(bit) = find_next_zero_bit_le((addr), (size), (bit) + 1))
/**
* preamble_index - common variable initialization for nd_label_* routines
* @ndd: dimm container for the relevant label set
* @idx: namespace_index index
* @nsindex_out: on return set to the currently active namespace index
* @free: on return set to the free label bitmap in the index
* @nslot: on return set to the number of slots in the label space
*/
static bool preamble_index(struct nvdimm_drvdata *ndd, int idx,
struct nd_namespace_index **nsindex_out,
unsigned long **free, u32 *nslot)
{
struct nd_namespace_index *nsindex;
nsindex = to_namespace_index(ndd, idx);
if (nsindex == NULL)
return false;
*free = (unsigned long *) nsindex->free;
*nslot = __le32_to_cpu(nsindex->nslot);
*nsindex_out = nsindex;
return true;
}
char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
{
if (!label_id || !uuid)
return NULL;
snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb",
flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid);
return label_id->id;
}
static bool preamble_current(struct nvdimm_drvdata *ndd,
struct nd_namespace_index **nsindex,
unsigned long **free, u32 *nslot)
{
return preamble_index(ndd, ndd->ns_current, nsindex,
free, nslot);
}
static bool preamble_next(struct nvdimm_drvdata *ndd,
struct nd_namespace_index **nsindex,
unsigned long **free, u32 *nslot)
{
return preamble_index(ndd, ndd->ns_next, nsindex,
free, nslot);
}
static bool slot_valid(struct nvdimm_drvdata *ndd,
struct nd_namespace_label *nd_label, u32 slot)
{
/* check that we are written where we expect to be written */
if (slot != __le32_to_cpu(nd_label->slot))
return false;
/* check that DPA allocations are page aligned */
if ((__le64_to_cpu(nd_label->dpa)
| __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
return false;
/* check checksum */
if (namespace_label_has(ndd, checksum)) {
u64 sum, sum_save;
sum_save = __le64_to_cpu(nd_label->checksum);
nd_label->checksum = __cpu_to_le64(0);
sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
nd_label->checksum = __cpu_to_le64(sum_save);
if (sum != sum_save) {
dev_dbg(ndd->dev, "fail checksum. slot: %d expect: %#llx\n",
slot, sum);
return false;
}
}
return true;
}
int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot, slot;
if (!preamble_current(ndd, &nsindex, &free, &nslot))
return 0; /* no label, nothing to reserve */
for_each_clear_bit_le(slot, free, nslot) {
struct nvdimm *nvdimm = to_nvdimm(ndd->dev);
struct nd_namespace_label *nd_label;
struct nd_region *nd_region = NULL;
u8 label_uuid[NSLABEL_UUID_LEN];
struct nd_label_id label_id;
struct resource *res;
u32 flags;
nd_label = to_label(ndd, slot);
if (!slot_valid(ndd, nd_label, slot))
continue;
memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
flags = __le32_to_cpu(nd_label->flags);
if (test_bit(NDD_NOBLK, &nvdimm->flags))
flags &= ~NSLABEL_FLAG_LOCAL;
nd_label_gen_id(&label_id, label_uuid, flags);
res = nvdimm_allocate_dpa(ndd, &label_id,
__le64_to_cpu(nd_label->dpa),
__le64_to_cpu(nd_label->rawsize));
nd_dbg_dpa(nd_region, ndd, res, "reserve\n");
if (!res)
return -EBUSY;
}
return 0;
}
int nd_label_data_init(struct nvdimm_drvdata *ndd)
{
size_t config_size, read_size, max_xfer, offset;
struct nd_namespace_index *nsindex;
unsigned int i;
int rc = 0;
u32 nslot;
if (ndd->data)
return 0;
if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) {
dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n",
ndd->nsarea.max_xfer, ndd->nsarea.config_size);
return -ENXIO;
}
/*
* We need to determine the maximum index area as this is the section
* we must read and validate before we can start processing labels.
*
* If the area is too small to contain the two indexes and 2 labels
* then we abort.
*
* Start at a label size of 128 as this should result in the largest
* possible namespace index size.
*/
ndd->nslabel_size = 128;
read_size = sizeof_namespace_index(ndd) * 2;
if (!read_size)
return -ENXIO;
/* Allocate config data */
config_size = ndd->nsarea.config_size;
ndd->data = kvzalloc(config_size, GFP_KERNEL);
if (!ndd->data)
return -ENOMEM;
/*
* We want to guarantee as few reads as possible while conserving
* memory. To do that we figure out how much unused space will be left
* in the last read, divide that by the total number of reads it is
* going to take given our maximum transfer size, and then reduce our
* maximum transfer size based on that result.
*/
max_xfer = min_t(size_t, ndd->nsarea.max_xfer, config_size);
if (read_size < max_xfer) {
/* trim waste */
max_xfer -= ((max_xfer - 1) - (config_size - 1) % max_xfer) /
DIV_ROUND_UP(config_size, max_xfer);
/* make certain we read indexes in exactly 1 read */
if (max_xfer < read_size)
max_xfer = read_size;
}
/* Make our initial read size a multiple of max_xfer size */
read_size = min(DIV_ROUND_UP(read_size, max_xfer) * max_xfer,
config_size);
/* Read the index data */
rc = nvdimm_get_config_data(ndd, ndd->data, 0, read_size);
if (rc)
goto out_err;
/* Validate index data, if not valid assume all labels are invalid */
ndd->ns_current = nd_label_validate(ndd);
if (ndd->ns_current < 0)
return 0;
/* Record our index values */
ndd->ns_next = nd_label_next_nsindex(ndd->ns_current);
/* Copy "current" index on top of the "next" index */
nsindex = to_current_namespace_index(ndd);
nd_label_copy(ndd, to_next_namespace_index(ndd), nsindex);
/* Determine starting offset for label data */
offset = __le64_to_cpu(nsindex->labeloff);
nslot = __le32_to_cpu(nsindex->nslot);
/* Loop through the free list pulling in any active labels */
for (i = 0; i < nslot; i++, offset += ndd->nslabel_size) {
size_t label_read_size;
/* zero out the unused labels */
if (test_bit_le(i, nsindex->free)) {
memset(ndd->data + offset, 0, ndd->nslabel_size);
continue;
}
/* if we already read past here then just continue */
if (offset + ndd->nslabel_size <= read_size)
continue;
/* if we haven't read in a while reset our read_size offset */
if (read_size < offset)
read_size = offset;
/* determine how much more will be read after this next call. */
label_read_size = offset + ndd->nslabel_size - read_size;
label_read_size = DIV_ROUND_UP(label_read_size, max_xfer) *
max_xfer;
/* truncate last read if needed */
if (read_size + label_read_size > config_size)
label_read_size = config_size - read_size;
/* Read the label data */
rc = nvdimm_get_config_data(ndd, ndd->data + read_size,
read_size, label_read_size);
if (rc)
goto out_err;
/* push read_size to next read offset */
read_size += label_read_size;
}
dev_dbg(ndd->dev, "len: %zu rc: %d\n", offset, rc);
out_err:
return rc;
}
int nd_label_active_count(struct nvdimm_drvdata *ndd)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot, slot;
int count = 0;
if (!preamble_current(ndd, &nsindex, &free, &nslot))
return 0;
for_each_clear_bit_le(slot, free, nslot) {
struct nd_namespace_label *nd_label;
nd_label = to_label(ndd, slot);
if (!slot_valid(ndd, nd_label, slot)) {
u32 label_slot = __le32_to_cpu(nd_label->slot);
u64 size = __le64_to_cpu(nd_label->rawsize);
u64 dpa = __le64_to_cpu(nd_label->dpa);
dev_dbg(ndd->dev,
"slot%d invalid slot: %d dpa: %llx size: %llx\n",
slot, label_slot, dpa, size);
continue;
}
count++;
}
return count;
}
struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot, slot;
if (!preamble_current(ndd, &nsindex, &free, &nslot))
return NULL;
for_each_clear_bit_le(slot, free, nslot) {
struct nd_namespace_label *nd_label;
nd_label = to_label(ndd, slot);
if (!slot_valid(ndd, nd_label, slot))
continue;
if (n-- == 0)
return to_label(ndd, slot);
}
return NULL;
}
u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot, slot;
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return UINT_MAX;
WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
slot = find_next_bit_le(free, nslot, 0);
if (slot == nslot)
return UINT_MAX;
clear_bit_le(slot, free);
return slot;
}
bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot;
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return false;
WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
if (slot < nslot)
return !test_and_set_bit_le(slot, free);
return false;
}
u32 nd_label_nfree(struct nvdimm_drvdata *ndd)
{
struct nd_namespace_index *nsindex;
unsigned long *free;
u32 nslot;
WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return nvdimm_num_label_slots(ndd);
return bitmap_weight(free, nslot);
}
static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
unsigned long flags)
{
struct nd_namespace_index *nsindex;
unsigned long offset;
u64 checksum;
u32 nslot;
int rc;
nsindex = to_namespace_index(ndd, index);
if (flags & ND_NSINDEX_INIT)
nslot = nvdimm_num_label_slots(ndd);
else
nslot = __le32_to_cpu(nsindex->nslot);
memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN);
memset(&nsindex->flags, 0, 3);
nsindex->labelsize = sizeof_namespace_label(ndd) >> 8;
nsindex->seq = __cpu_to_le32(seq);
offset = (unsigned long) nsindex
- (unsigned long) to_namespace_index(ndd, 0);
nsindex->myoff = __cpu_to_le64(offset);
nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd));
offset = (unsigned long) to_namespace_index(ndd,
nd_label_next_nsindex(index))
- (unsigned long) to_namespace_index(ndd, 0);
nsindex->otheroff = __cpu_to_le64(offset);
offset = (unsigned long) nd_label_base(ndd)
- (unsigned long) to_namespace_index(ndd, 0);
nsindex->labeloff = __cpu_to_le64(offset);
nsindex->nslot = __cpu_to_le32(nslot);
nsindex->major = __cpu_to_le16(1);
if (sizeof_namespace_label(ndd) < 256)
nsindex->minor = __cpu_to_le16(1);
else
nsindex->minor = __cpu_to_le16(2);
nsindex->checksum = __cpu_to_le64(0);
if (flags & ND_NSINDEX_INIT) {
unsigned long *free = (unsigned long *) nsindex->free;
u32 nfree = ALIGN(nslot, BITS_PER_LONG);
int last_bits, i;
memset(nsindex->free, 0xff, nfree / 8);
for (i = 0, last_bits = nfree - nslot; i < last_bits; i++)
clear_bit_le(nslot + i, free);
}
checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1);
nsindex->checksum = __cpu_to_le64(checksum);
rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff),
nsindex, sizeof_namespace_index(ndd));
if (rc < 0)
return rc;
if (flags & ND_NSINDEX_INIT)
return 0;
/* copy the index we just wrote to the new 'next' */
WARN_ON(index != ndd->ns_next);
nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex);
ndd->ns_current = nd_label_next_nsindex(ndd->ns_current);
ndd->ns_next = nd_label_next_nsindex(ndd->ns_next);
WARN_ON(ndd->ns_current == ndd->ns_next);
return 0;
}
static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
struct nd_namespace_label *nd_label)
{
return (unsigned long) nd_label
- (unsigned long) to_namespace_index(ndd, 0);
}
enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid)
{
if (guid_equal(guid, &nvdimm_btt_guid))
return NVDIMM_CCLASS_BTT;
else if (guid_equal(guid, &nvdimm_btt2_guid))
return NVDIMM_CCLASS_BTT2;
else if (guid_equal(guid, &nvdimm_pfn_guid))
return NVDIMM_CCLASS_PFN;
else if (guid_equal(guid, &nvdimm_dax_guid))
return NVDIMM_CCLASS_DAX;
else if (guid_equal(guid, &guid_null))
return NVDIMM_CCLASS_NONE;
return NVDIMM_CCLASS_UNKNOWN;
}
static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class,
guid_t *target)
{
if (claim_class == NVDIMM_CCLASS_BTT)
return &nvdimm_btt_guid;
else if (claim_class == NVDIMM_CCLASS_BTT2)
return &nvdimm_btt2_guid;
else if (claim_class == NVDIMM_CCLASS_PFN)
return &nvdimm_pfn_guid;
else if (claim_class == NVDIMM_CCLASS_DAX)
return &nvdimm_dax_guid;
else if (claim_class == NVDIMM_CCLASS_UNKNOWN) {
/*
* If we're modifying a namespace for which we don't
* know the claim_class, don't touch the existing guid.
*/
return target;
} else
return &guid_null;
}
libnvdimm/namespace: Fix label tracking error Users have reported intermittent occurrences of DIMM initialization failures due to duplicate allocations of address capacity detected in the labels, or errors of the form below, both have the same root cause. nd namespace1.4: failed to track label: 0 WARNING: CPU: 17 PID: 1381 at drivers/nvdimm/label.c:863 RIP: 0010:__pmem_label_update+0x56c/0x590 [libnvdimm] Call Trace: ? nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] uuid_store+0x17e/0x190 [libnvdimm] kernfs_fop_write+0xf0/0x1a0 vfs_write+0xb7/0x1b0 ksys_write+0x57/0xd0 do_syscall_64+0x60/0x210 Unfortunately those reports were typically with a busy parallel namespace creation / destruction loop making it difficult to see the components of the bug. However, Jane provided a simple reproducer using the work-in-progress sub-section implementation. When ndctl is reconfiguring a namespace it may take an existing defunct / disabled namespace and reconfigure it with a new uuid and other parameters. Critically namespace_update_uuid() takes existing address resources and renames them for the new namespace to use / reconfigure as it sees fit. The bug is that this rename only happens in the resource tracking tree. Existing labels with the old uuid are not reaped leading to a scenario where multiple active labels reference the same span of address range. Teach namespace_update_uuid() to flag any references to the old uuid for reaping at the next label update attempt. Cc: <stable@vger.kernel.org> Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation") Link: https://github.com/pmem/ndctl/issues/91 Reported-by: Jane Chu <jane.chu@oracle.com> Reported-by: Jeff Moyer <jmoyer@redhat.com> Reported-by: Erwin Tsaur <erwin.tsaur@oracle.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2019-05-01 12:51:21 +08:00
static void reap_victim(struct nd_mapping *nd_mapping,
struct nd_label_ent *victim)
{
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
u32 slot = to_slot(ndd, victim->label);
dev_dbg(ndd->dev, "free: %d\n", slot);
nd_label_free_slot(ndd, slot);
victim->label = NULL;
}
static int __pmem_label_update(struct nd_region *nd_region,
struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
int pos, unsigned long flags)
{
struct nd_namespace_common *ndns = &nspm->nsio.common;
struct nd_interleave_set *nd_set = nd_region->nd_set;
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
struct nd_namespace_label *nd_label;
struct nd_namespace_index *nsindex;
libnvdimm/namespace: Fix label tracking error Users have reported intermittent occurrences of DIMM initialization failures due to duplicate allocations of address capacity detected in the labels, or errors of the form below, both have the same root cause. nd namespace1.4: failed to track label: 0 WARNING: CPU: 17 PID: 1381 at drivers/nvdimm/label.c:863 RIP: 0010:__pmem_label_update+0x56c/0x590 [libnvdimm] Call Trace: ? nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] uuid_store+0x17e/0x190 [libnvdimm] kernfs_fop_write+0xf0/0x1a0 vfs_write+0xb7/0x1b0 ksys_write+0x57/0xd0 do_syscall_64+0x60/0x210 Unfortunately those reports were typically with a busy parallel namespace creation / destruction loop making it difficult to see the components of the bug. However, Jane provided a simple reproducer using the work-in-progress sub-section implementation. When ndctl is reconfiguring a namespace it may take an existing defunct / disabled namespace and reconfigure it with a new uuid and other parameters. Critically namespace_update_uuid() takes existing address resources and renames them for the new namespace to use / reconfigure as it sees fit. The bug is that this rename only happens in the resource tracking tree. Existing labels with the old uuid are not reaped leading to a scenario where multiple active labels reference the same span of address range. Teach namespace_update_uuid() to flag any references to the old uuid for reaping at the next label update attempt. Cc: <stable@vger.kernel.org> Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation") Link: https://github.com/pmem/ndctl/issues/91 Reported-by: Jane Chu <jane.chu@oracle.com> Reported-by: Jeff Moyer <jmoyer@redhat.com> Reported-by: Erwin Tsaur <erwin.tsaur@oracle.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2019-05-01 12:51:21 +08:00
struct nd_label_ent *label_ent;
struct nd_label_id label_id;
struct resource *res;
unsigned long *free;
u32 nslot, slot;
size_t offset;
u64 cookie;
int rc;
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return -ENXIO;
cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
nd_label_gen_id(&label_id, nspm->uuid, 0);
for_each_dpa_resource(ndd, res)
if (strcmp(res->name, label_id.id) == 0)
break;
if (!res) {
WARN_ON_ONCE(1);
return -ENXIO;
}
/* allocate and write the label to the staging (next) index */
slot = nd_label_alloc_slot(ndd);
if (slot == UINT_MAX)
return -ENXIO;
dev_dbg(ndd->dev, "allocated: %d\n", slot);
nd_label = to_label(ndd, slot);
memset(nd_label, 0, sizeof_namespace_label(ndd));
memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN);
if (nspm->alt_name)
memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN);
nd_label->flags = __cpu_to_le32(flags);
nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings);
nd_label->position = __cpu_to_le16(pos);
nd_label->isetcookie = __cpu_to_le64(cookie);
nd_label->rawsize = __cpu_to_le64(resource_size(res));
nd_label->lbasize = __cpu_to_le64(nspm->lbasize);
nd_label->dpa = __cpu_to_le64(res->start);
nd_label->slot = __cpu_to_le32(slot);
if (namespace_label_has(ndd, type_guid))
guid_copy(&nd_label->type_guid, &nd_set->type_guid);
if (namespace_label_has(ndd, abstraction_guid))
guid_copy(&nd_label->abstraction_guid,
to_abstraction_guid(ndns->claim_class,
&nd_label->abstraction_guid));
if (namespace_label_has(ndd, checksum)) {
u64 sum;
nd_label->checksum = __cpu_to_le64(0);
sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
nd_label->checksum = __cpu_to_le64(sum);
}
nd_dbg_dpa(nd_region, ndd, res, "\n");
/* update label */
offset = nd_label_offset(ndd, nd_label);
rc = nvdimm_set_config_data(ndd, offset, nd_label,
sizeof_namespace_label(ndd));
if (rc < 0)
return rc;
/* Garbage collect the previous label */
mutex_lock(&nd_mapping->lock);
list_for_each_entry(label_ent, &nd_mapping->labels, list) {
if (!label_ent->label)
continue;
libnvdimm/namespace: Fix label tracking error Users have reported intermittent occurrences of DIMM initialization failures due to duplicate allocations of address capacity detected in the labels, or errors of the form below, both have the same root cause. nd namespace1.4: failed to track label: 0 WARNING: CPU: 17 PID: 1381 at drivers/nvdimm/label.c:863 RIP: 0010:__pmem_label_update+0x56c/0x590 [libnvdimm] Call Trace: ? nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm] uuid_store+0x17e/0x190 [libnvdimm] kernfs_fop_write+0xf0/0x1a0 vfs_write+0xb7/0x1b0 ksys_write+0x57/0xd0 do_syscall_64+0x60/0x210 Unfortunately those reports were typically with a busy parallel namespace creation / destruction loop making it difficult to see the components of the bug. However, Jane provided a simple reproducer using the work-in-progress sub-section implementation. When ndctl is reconfiguring a namespace it may take an existing defunct / disabled namespace and reconfigure it with a new uuid and other parameters. Critically namespace_update_uuid() takes existing address resources and renames them for the new namespace to use / reconfigure as it sees fit. The bug is that this rename only happens in the resource tracking tree. Existing labels with the old uuid are not reaped leading to a scenario where multiple active labels reference the same span of address range. Teach namespace_update_uuid() to flag any references to the old uuid for reaping at the next label update attempt. Cc: <stable@vger.kernel.org> Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation") Link: https://github.com/pmem/ndctl/issues/91 Reported-by: Jane Chu <jane.chu@oracle.com> Reported-by: Jeff Moyer <jmoyer@redhat.com> Reported-by: Erwin Tsaur <erwin.tsaur@oracle.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2019-05-01 12:51:21 +08:00
if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)
|| memcmp(nspm->uuid, label_ent->label->uuid,
NSLABEL_UUID_LEN) == 0)
reap_victim(nd_mapping, label_ent);
}
/* update index */
rc = nd_label_write_index(ndd, ndd->ns_next,
nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
if (rc == 0) {
list_for_each_entry(label_ent, &nd_mapping->labels, list)
if (!label_ent->label) {
label_ent->label = nd_label;
nd_label = NULL;
break;
}
dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label,
"failed to track label: %d\n",
to_slot(ndd, nd_label));
if (nd_label)
rc = -ENXIO;
}
mutex_unlock(&nd_mapping->lock);
return rc;
}
static bool is_old_resource(struct resource *res, struct resource **list, int n)
{
int i;
if (res->flags & DPA_RESOURCE_ADJUSTED)
return false;
for (i = 0; i < n; i++)
if (res == list[i])
return true;
return false;
}
static struct resource *to_resource(struct nvdimm_drvdata *ndd,
struct nd_namespace_label *nd_label)
{
struct resource *res;
for_each_dpa_resource(ndd, res) {
if (res->start != __le64_to_cpu(nd_label->dpa))
continue;
if (resource_size(res) != __le64_to_cpu(nd_label->rawsize))
continue;
return res;
}
return NULL;
}
/*
* 1/ Account all the labels that can be freed after this update
* 2/ Allocate and write the label to the staging (next) index
* 3/ Record the resources in the namespace device
*/
static int __blk_label_update(struct nd_region *nd_region,
struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk,
int num_labels)
{
int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
struct nd_interleave_set *nd_set = nd_region->nd_set;
struct nd_namespace_common *ndns = &nsblk->common;
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
struct nd_namespace_label *nd_label;
struct nd_label_ent *label_ent, *e;
struct nd_namespace_index *nsindex;
unsigned long *free, *victim_map = NULL;
struct resource *res, **old_res_list;
struct nd_label_id label_id;
u8 uuid[NSLABEL_UUID_LEN];
int min_dpa_idx = 0;
LIST_HEAD(list);
u32 nslot, slot;
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return -ENXIO;
old_res_list = nsblk->res;
nfree = nd_label_nfree(ndd);
old_num_resources = nsblk->num_resources;
nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
/*
* We need to loop over the old resources a few times, which seems a
* bit inefficient, but we need to know that we have the label
* space before we start mutating the tracking structures.
* Otherwise the recovery method of last resort for userspace is
* disable and re-enable the parent region.
*/
alloc = 0;
for_each_dpa_resource(ndd, res) {
if (strcmp(res->name, label_id.id) != 0)
continue;
if (!is_old_resource(res, old_res_list, old_num_resources))
alloc++;
}
victims = 0;
if (old_num_resources) {
/* convert old local-label-map to dimm-slot victim-map */
victim_map = bitmap_zalloc(nslot, GFP_KERNEL);
if (!victim_map)
return -ENOMEM;
/* mark unused labels for garbage collection */
for_each_clear_bit_le(slot, free, nslot) {
nd_label = to_label(ndd, slot);
memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
continue;
res = to_resource(ndd, nd_label);
if (res && is_old_resource(res, old_res_list,
old_num_resources))
continue;
slot = to_slot(ndd, nd_label);
set_bit(slot, victim_map);
victims++;
}
}
/* don't allow updates that consume the last label */
if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
dev_info(&nsblk->common.dev, "insufficient label space\n");
bitmap_free(victim_map);
return -ENOSPC;
}
/* from here on we need to abort on error */
/* assign all resources to the namespace before writing the labels */
nsblk->res = NULL;
nsblk->num_resources = 0;
for_each_dpa_resource(ndd, res) {
if (strcmp(res->name, label_id.id) != 0)
continue;
if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) {
rc = -ENOMEM;
goto abort;
}
}
/*
* Find the resource associated with the first label in the set
* per the v1.2 namespace specification.
*/
for (i = 0; i < nsblk->num_resources; i++) {
struct resource *min = nsblk->res[min_dpa_idx];
res = nsblk->res[i];
if (res->start < min->start)
min_dpa_idx = i;
}
for (i = 0; i < nsblk->num_resources; i++) {
size_t offset;
res = nsblk->res[i];
if (is_old_resource(res, old_res_list, old_num_resources))
continue; /* carry-over */
slot = nd_label_alloc_slot(ndd);
if (slot == UINT_MAX)
goto abort;
dev_dbg(ndd->dev, "allocated: %d\n", slot);
nd_label = to_label(ndd, slot);
memset(nd_label, 0, sizeof_namespace_label(ndd));
memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
if (nsblk->alt_name)
memcpy(nd_label->name, nsblk->alt_name,
NSLABEL_NAME_LEN);
nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
/*
* Use the presence of the type_guid as a flag to
* determine isetcookie usage and nlabel + position
* policy for blk-aperture namespaces.
*/
if (namespace_label_has(ndd, type_guid)) {
if (i == min_dpa_idx) {
nd_label->nlabel = __cpu_to_le16(nsblk->num_resources);
nd_label->position = __cpu_to_le16(0);
} else {
nd_label->nlabel = __cpu_to_le16(0xffff);
nd_label->position = __cpu_to_le16(0xffff);
}
nd_label->isetcookie = __cpu_to_le64(nd_set->cookie2);
} else {
nd_label->nlabel = __cpu_to_le16(0); /* N/A */
nd_label->position = __cpu_to_le16(0); /* N/A */
nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
}
nd_label->dpa = __cpu_to_le64(res->start);
nd_label->rawsize = __cpu_to_le64(resource_size(res));
nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
nd_label->slot = __cpu_to_le32(slot);
if (namespace_label_has(ndd, type_guid))
guid_copy(&nd_label->type_guid, &nd_set->type_guid);
if (namespace_label_has(ndd, abstraction_guid))
guid_copy(&nd_label->abstraction_guid,
to_abstraction_guid(ndns->claim_class,
&nd_label->abstraction_guid));
if (namespace_label_has(ndd, checksum)) {
u64 sum;
nd_label->checksum = __cpu_to_le64(0);
sum = nd_fletcher64(nd_label,
sizeof_namespace_label(ndd), 1);
nd_label->checksum = __cpu_to_le64(sum);
}
/* update label */
offset = nd_label_offset(ndd, nd_label);
rc = nvdimm_set_config_data(ndd, offset, nd_label,
sizeof_namespace_label(ndd));
if (rc < 0)
goto abort;
}
/* free up now unused slots in the new index */
for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) {
dev_dbg(ndd->dev, "free: %d\n", slot);
nd_label_free_slot(ndd, slot);
}
/* update index */
rc = nd_label_write_index(ndd, ndd->ns_next,
nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
if (rc)
goto abort;
/*
* Now that the on-dimm labels are up to date, fix up the tracking
* entries in nd_mapping->labels
*/
nlabel = 0;
mutex_lock(&nd_mapping->lock);
list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
nd_label = label_ent->label;
if (!nd_label)
continue;
nlabel++;
memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
continue;
nlabel--;
list_move(&label_ent->list, &list);
label_ent->label = NULL;
}
list_splice_tail_init(&list, &nd_mapping->labels);
mutex_unlock(&nd_mapping->lock);
if (nlabel + nsblk->num_resources > num_labels) {
/*
* Bug, we can't end up with more resources than
* available labels
*/
WARN_ON_ONCE(1);
rc = -ENXIO;
goto out;
}
mutex_lock(&nd_mapping->lock);
label_ent = list_first_entry_or_null(&nd_mapping->labels,
typeof(*label_ent), list);
if (!label_ent) {
WARN_ON(1);
mutex_unlock(&nd_mapping->lock);
rc = -ENXIO;
goto out;
}
for_each_clear_bit_le(slot, free, nslot) {
nd_label = to_label(ndd, slot);
memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
continue;
res = to_resource(ndd, nd_label);
res->flags &= ~DPA_RESOURCE_ADJUSTED;
dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot);
list_for_each_entry_from(label_ent, &nd_mapping->labels, list) {
if (label_ent->label)
continue;
label_ent->label = nd_label;
nd_label = NULL;
break;
}
if (nd_label)
dev_WARN(&nsblk->common.dev,
"failed to track label slot%d\n", slot);
}
mutex_unlock(&nd_mapping->lock);
out:
kfree(old_res_list);
bitmap_free(victim_map);
return rc;
abort:
/*
* 1/ repair the allocated label bitmap in the index
* 2/ restore the resource list
*/
nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
kfree(nsblk->res);
nsblk->res = old_res_list;
nsblk->num_resources = old_num_resources;
old_res_list = NULL;
goto out;
}
static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
{
int i, old_num_labels = 0;
struct nd_label_ent *label_ent;
struct nd_namespace_index *nsindex;
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
mutex_lock(&nd_mapping->lock);
list_for_each_entry(label_ent, &nd_mapping->labels, list)
old_num_labels++;
mutex_unlock(&nd_mapping->lock);
/*
* We need to preserve all the old labels for the mapping so
* they can be garbage collected after writing the new labels.
*/
for (i = old_num_labels; i < num_labels; i++) {
label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL);
if (!label_ent)
return -ENOMEM;
mutex_lock(&nd_mapping->lock);
list_add_tail(&label_ent->list, &nd_mapping->labels);
mutex_unlock(&nd_mapping->lock);
}
if (ndd->ns_current == -1 || ndd->ns_next == -1)
/* pass */;
else
return max(num_labels, old_num_labels);
nsindex = to_namespace_index(ndd, 0);
memset(nsindex, 0, ndd->nsarea.config_size);
for (i = 0; i < 2; i++) {
int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
if (rc)
return rc;
}
ndd->ns_next = 1;
ndd->ns_current = 0;
return max(num_labels, old_num_labels);
}
static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid)
{
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
struct nd_label_ent *label_ent, *e;
struct nd_namespace_index *nsindex;
u8 label_uuid[NSLABEL_UUID_LEN];
unsigned long *free;
LIST_HEAD(list);
u32 nslot, slot;
int active = 0;
if (!uuid)
return 0;
/* no index || no labels == nothing to delete */
if (!preamble_next(ndd, &nsindex, &free, &nslot))
return 0;
mutex_lock(&nd_mapping->lock);
list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
struct nd_namespace_label *nd_label = label_ent->label;
if (!nd_label)
continue;
active++;
memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0)
continue;
active--;
slot = to_slot(ndd, nd_label);
nd_label_free_slot(ndd, slot);
dev_dbg(ndd->dev, "free: %d\n", slot);
list_move_tail(&label_ent->list, &list);
label_ent->label = NULL;
}
list_splice_tail_init(&list, &nd_mapping->labels);
if (active == 0) {
nd_mapping_free_labels(nd_mapping);
dev_dbg(ndd->dev, "no more active labels\n");
}
mutex_unlock(&nd_mapping->lock);
return nd_label_write_index(ndd, ndd->ns_next,
nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
}
int nd_pmem_namespace_label_update(struct nd_region *nd_region,
struct nd_namespace_pmem *nspm, resource_size_t size)
{
int i, rc;
for (i = 0; i < nd_region->ndr_mappings; i++) {
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
struct resource *res;
int count = 0;
if (size == 0) {
rc = del_labels(nd_mapping, nspm->uuid);
if (rc)
return rc;
continue;
}
for_each_dpa_resource(ndd, res)
if (strncmp(res->name, "pmem", 4) == 0)
count++;
WARN_ON_ONCE(!count);
rc = init_labels(nd_mapping, count);
if (rc < 0)
return rc;
rc = __pmem_label_update(nd_region, nd_mapping, nspm, i,
NSLABEL_FLAG_UPDATING);
if (rc)
return rc;
}
if (size == 0)
return 0;
/* Clear the UPDATING flag per UEFI 2.7 expectations */
for (i = 0; i < nd_region->ndr_mappings; i++) {
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, 0);
if (rc)
return rc;
}
return 0;
}
int nd_blk_namespace_label_update(struct nd_region *nd_region,
struct nd_namespace_blk *nsblk, resource_size_t size)
{
struct nd_mapping *nd_mapping = &nd_region->mapping[0];
struct resource *res;
int count = 0;
if (size == 0)
return del_labels(nd_mapping, nsblk->uuid);
for_each_dpa_resource(to_ndd(nd_mapping), res)
count++;
count = init_labels(nd_mapping, count);
if (count < 0)
return count;
return __blk_label_update(nd_region, nd_mapping, nsblk, count);
}
int __init nd_label_init(void)
{
WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid));
WARN_ON(guid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_guid));
WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid));
WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid));
return 0;
}