OpenCloudOS-Kernel/drivers/md/dm-ps-io-affinity.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Oracle Corporation
 *
 * Module Author: Mike Christie
 */
#include "dm-path-selector.h"

#include <linux/device-mapper.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "multipath io-affinity"

/*
 * Per-path selector context. ioa_add_path() allocates one of these for each
 * path, stores it in dm_path->pscontext and points every CPU slot of
 * selector->path_map[] that the path serves at it.
 */
struct path_info {
	/* The multipath path this context was created for. */
	struct dm_path *path;
	/* CPU mask parsed from the path's single table argument. */
	cpumask_var_t cpumask;
	/*
	 * After ioa_add_path() succeeds this equals the number of path_map[]
	 * slots that point at this structure; ioa_free_path() drops one
	 * reference per slot.
	 */
	refcount_t refcount;
	/* Set by ioa_fail_path(), cleared by ioa_reinstate_path(). */
	bool failed;
};

/*
 * Selector instance state, allocated by ioa_create() and stored in
 * path_selector->context.
 */
struct selector {
	/*
	 * Array of nr_cpu_ids entries indexed by CPU number; an entry is NULL
	 * when no path has been mapped to that CPU.
	 */
	struct path_info **path_map;
	/* CPUs for which ioa_add_path() installed a path_map[] entry. */
	cpumask_var_t path_mask;
	/*
	 * Incremented by ioa_select_path() whenever the submitting CPU has no
	 * path_map[] entry; reported by ioa_status() for STATUSTYPE_INFO.
	 */
	atomic_t map_misses;
};

/*
 * Drop the reference that @cpu's path_map[] slot holds on its path context.
 *
 * Nothing happens if the slot is empty. When the dropped reference was not
 * the last one, the slot and the path_mask bit are left untouched. When it
 * was the last one, the CPU is removed from path_mask, the context and its
 * cpumask are freed, and the slot is reset to NULL.
 */
static void ioa_free_path(struct selector *s, unsigned int cpu)
{
	struct path_info *info = s->path_map[cpu];

	if (!info || !refcount_dec_and_test(&info->refcount))
		return;

	/* That was the final reference: tear the context down. */
	cpumask_clear_cpu(cpu, s->path_mask);
	free_cpumask_var(info->cpumask);
	kfree(info);
	s->path_map[cpu] = NULL;
}

/*
 * Register @path with the selector and map it to the CPUs named in argv[0].
 *
 * @ps:    path selector whose context is the struct selector to update
 * @path:  path being added; on success path->pscontext points at the newly
 *         allocated struct path_info
 * @argc:  number of per-path arguments, must be exactly 1
 * @argv:  argv[0] is a CPU mask string handed to cpumask_parse()
 * @error: set to a static message describing the failure
 *
 * Every CPU in the mask that does not already have a path_map[] entry is
 * pointed at the new context and takes one reference on it. CPUs that are
 * already mapped are skipped with a warning.
 *
 * Returns 0 on success, -EINVAL for a wrong argument count, an unparsable
 * mask or a mask that yields no new CPU mapping, and -ENOMEM when an
 * allocation fails. On any failure the context is freed and
 * path->pscontext is reset to NULL so it never refers to freed memory.
 */
static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
			int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi = NULL;
	unsigned int cpu;
	int ret;

	if (argc != 1) {
		*error = "io-affinity ps: invalid number of arguments";
		return -EINVAL;
	}

	pi = kzalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "io-affinity ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	path->pscontext = pi;
	/* Temporary construction reference, dropped after the mapping loop. */
	refcount_set(&pi->refcount, 1);

	if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) {
		*error = "io-affinity ps: Error allocating cpumask context";
		ret = -ENOMEM;
		goto free_pi;
	}

	ret = cpumask_parse(argv[0], pi->cpumask);
	if (ret) {
		*error = "io-affinity ps: invalid cpumask";
		ret = -EINVAL;
		goto free_mask;
	}

	for_each_cpu(cpu, pi->cpumask) {
		if (cpu >= nr_cpu_ids) {
			DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u",
				     cpu, nr_cpu_ids);
			break;
		}

		/* First path to claim a CPU wins; later claims are ignored. */
		if (s->path_map[cpu]) {
			DMWARN("CPU mapping for %u exists. Ignoring.", cpu);
			continue;
		}

		cpumask_set_cpu(cpu, s->path_mask);
		s->path_map[cpu] = pi;
		refcount_inc(&pi->refcount);
	}

	/*
	 * Drop the construction reference. Reaching zero means no CPU slot
	 * was claimed, so nothing else points at pi and it can be freed.
	 */
	if (refcount_dec_and_test(&pi->refcount)) {
		*error = "io-affinity ps: No new/valid CPU mapping found";
		ret = -EINVAL;
		goto free_mask;
	}

	return 0;

free_mask:
	free_cpumask_var(pi->cpumask);
free_pi:
	/* Do not leave the path pointing at the context we are about to free. */
	path->pscontext = NULL;
	kfree(pi);
	return ret;
}

/*
 * Allocate and initialise the selector state for @ps.
 *
 * @ps:   path selector to attach the new struct selector to (ps->context)
 * @argc: unused
 * @argv: unused
 *
 * The CPU-to-path table has nr_cpu_ids zeroed entries and the path mask
 * starts out empty. Returns 0 on success or -ENOMEM if any allocation
 * fails, in which case everything allocated so far is released and
 * ps->context is not modified.
 */
static int ioa_create(struct path_selector *ps, unsigned argc, char **argv)
{
	struct selector *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	/* kcalloc() zeroes the array and checks the size multiplication. */
	s->path_map = kcalloc(nr_cpu_ids, sizeof(*s->path_map), GFP_KERNEL);
	if (!s->path_map)
		goto free_selector;

	if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL))
		goto free_map;

	atomic_set(&s->map_misses, 0);
	ps->context = s;
	return 0;

free_map:
	kfree(s->path_map);
free_selector:
	kfree(s);
	return -ENOMEM;
}

static void ioa_destroy(struct path_selector *ps)
{
	struct selector *s = ps->context;
	unsigned cpu;

	for_each_cpu(cpu, s->path_mask)
		ioa_free_path(s, cpu);

	free_cpumask_var(s->path_mask);
	kfree(s->path_map);
	kfree(s);

	ps->context = NULL;
}

/*
 * Emit selector or per-path status text into @result.
 *
 * @ps:     path selector whose context holds the map_misses counter
 * @path:   path to report on, or NULL for the selector-level status
 * @type:   which kind of status is requested
 * @result: output buffer
 * @maxlen: size of @result
 *
 * With no @path, "0 " is emitted. Otherwise STATUSTYPE_INFO emits the
 * map_misses counter, STATUSTYPE_TABLE emits the path's CPU mask using the
 * "%*pb" format, and STATUSTYPE_IMA stores an empty string in @result.
 * Returns the value of sz after the emits.
 *
 * NOTE(review): DMEMIT() is defined outside this file and appears to rely
 * on locals named sz, result and maxlen - keep those names.
 */
static int ioa_status(struct path_selector *ps, struct dm_path *path,
		      status_type_t type, char *result, unsigned int maxlen)
{
	struct selector *sel = ps->context;
	int sz = 0;

	if (!path) {
		DMEMIT("0 ");
		return sz;
	}

	if (type == STATUSTYPE_INFO) {
		DMEMIT("%d ", atomic_read(&sel->map_misses));
	} else if (type == STATUSTYPE_TABLE) {
		struct path_info *info = path->pscontext;

		DMEMIT("%*pb ", cpumask_pr_args(info->cpumask));
	} else if (type == STATUSTYPE_IMA) {
		*result = '\0';
	}

	return sz;
}

/*
 * Mark path @p as failed so that ioa_select_path() skips it.
 * @ps is not used.
 */
static void ioa_fail_path(struct path_selector *ps, struct dm_path *p)
{
	struct path_info *info;

	info = p->pscontext;
	info->failed = true;
}

/*
 * Clear the failed flag on path @p so that ioa_select_path() may pick it
 * again. @ps is not used. Always returns 0.
 */
static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
	struct path_info *info;

	info = p->pscontext;
	info->failed = false;

	return 0;
}

/*
 * Choose a path for an I/O submitted on the current CPU.
 *
 * @ps:       path selector whose context holds the CPU-to-path table
 * @nr_bytes: unused
 *
 * Lookup order:
 *   1. the path mapped to the current CPU, if it exists and has not failed;
 *   2. the first non-failed path mapped to any CPU of the current CPU's node;
 *   3. the first non-failed path mapped to any CPU in path_mask.
 *
 * map_misses is bumped when the current CPU has no mapping at all (but not
 * when its mapped path has merely failed). The whole lookup runs between
 * get_cpu() and put_cpu(). Returns the chosen path, or NULL if every mapped
 * path has failed.
 */
static struct dm_path *ioa_select_path(struct path_selector *ps,
				       size_t nr_bytes)
{
	struct selector *sel = ps->context;
	const struct cpumask *node_cpus;
	struct path_info *cand;
	unsigned int this_cpu, nid;
	int idx;

	this_cpu = get_cpu();

	cand = sel->path_map[this_cpu];
	if (cand) {
		if (!cand->failed)
			goto out;
	} else {
		atomic_inc(&sel->map_misses);
	}

	/*
	 * Not the ideal path any more, but prefer something mapped on the
	 * local node before falling back to any usable path.
	 */
	nid = cpu_to_node(this_cpu);
	node_cpus = cpumask_of_node(nid);
	for_each_cpu(idx, node_cpus) {
		cand = sel->path_map[idx];
		if (cand && !cand->failed)
			goto out;
	}

	for_each_cpu(idx, sel->path_mask) {
		cand = sel->path_map[idx];
		if (cand && !cand->failed)
			goto out;
	}
	cand = NULL;

out:
	put_cpu();
	if (!cand)
		return NULL;
	return cand->path;
}

/*
 * Path selector type descriptor for "io-affinity"; registered in
 * dm_ioa_init() and unregistered in dm_ioa_exit().
 */
static struct path_selector_type ioa_ps = {
	.name		= "io-affinity",
	.module		= THIS_MODULE,
	/* ioa_add_path() requires exactly one per-path argument (the CPU mask). */
	.table_args	= 1,
	/* NOTE(review): presumably the one value ioa_status() emits per path for STATUSTYPE_INFO - confirm. */
	.info_args	= 1,
	.create		= ioa_create,
	.destroy	= ioa_destroy,
	.status		= ioa_status,
	.add_path	= ioa_add_path,
	.fail_path	= ioa_fail_path,
	.reinstate_path	= ioa_reinstate_path,
	.select_path	= ioa_select_path,
};

/*
 * Module init: register the io-affinity path selector type.
 * Logs an error if registration returns a negative value and passes the
 * registration result back to the caller either way.
 */
static int __init dm_ioa_init(void)
{
	int err;

	err = dm_register_path_selector(&ioa_ps);
	if (err >= 0)
		return err;

	DMERR("register failed %d", err);
	return err;
}

/*
 * Module exit: unregister the io-affinity path selector type and log an
 * error if unregistration returns a negative value.
 */
static void __exit dm_ioa_exit(void)
{
	int err;

	err = dm_unregister_path_selector(&ioa_ps);
	if (err >= 0)
		return;

	DMERR("unregister failed %d", err);
}

/* Hook the init/exit handlers into module load and unload. */
module_init(dm_ioa_init);
module_exit(dm_ioa_exit);

/* Module metadata. */
MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on");
MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>");
MODULE_LICENSE("GPL");
dm mpath: add IO affinity path selector This patch adds a path selector that selects paths based on a CPU to path mapping the user passes in and what CPU we are executing on. The primary user for this PS is where the app is optimized to use specific CPUs so other PSs undo the apps handy work, and the storage and it's transport are not a bottlneck. For these io-affinity PS setups a path's transport/interconnect perf is not going to flucuate a lot and there is no major differences between paths, so QL/HST smarts do not help and RR always messes up what the app is trying to do. On a system with 16 cores, where you have a job per CPU: fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=4k \ --ioengine=libaio --iodepth=128 --numjobs=16 and a dm-multipath device setup where each CPU is mapped to one path: // When in mq mode I had to set dm_mq_nr_hw_queues=$NUM_PATHS. // Bio mode also showed similar results. 0 16777216 multipath 0 0 1 1 io-affinity 0 16 1 8:16 1 8:32 2 8:64 4 8:48 8 8:80 10 8:96 20 8:112 40 8:128 80 8:144 100 8:160 200 8:176 400 8:192 800 8:208 1000 8:224 2000 8:240 4000 65:0 8000 we can see a IOPs increase of 25%. The percent increase depends on the device and interconnect. For a slower/medium speed path/device that can do around 180K IOPs a path if you ran that fio command to it directly we saw a 25% increase like above. Slower path'd devices that could do around 90K per path showed maybe around a 2 - 5% increase. If you use something like null_blk or scsi_debug which can multi-million IOPs and hack it up so each device they export shows up as a path then you see 50%+ increases. Signed-off-by: Mike Christie <michael.christie@oracle.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> 2020-10-23 08:27:50 +08:00			`// SPDX-License-Identifier: GPL-2.0-only`
			`/*`
			`* Copyright (C) 2020 Oracle Corporation`
			`*`
			`* Module Author: Mike Christie`
			`*/`
			`#include "dm-path-selector.h"`

			`#include <linux/device-mapper.h>`
			`#include <linux/module.h>`

			`#define DM_MSG_PREFIX "multipath io-affinity"`

			`struct path_info {`
			`struct dm_path *path;`
			`cpumask_var_t cpumask;`
			`refcount_t refcount;`
			`bool failed;`
			`};`

			`struct selector {`
			`struct path_info **path_map;`
			`cpumask_var_t path_mask;`
			`atomic_t map_misses;`
			`};`

			`static void ioa_free_path(struct selector *s, unsigned int cpu)`
			`{`
			`struct path_info *pi = s->path_map[cpu];`

			`if (!pi)`
			`return;`

			`if (refcount_dec_and_test(&pi->refcount)) {`
			`cpumask_clear_cpu(cpu, s->path_mask);`
			`free_cpumask_var(pi->cpumask);`
			`kfree(pi);`

			`s->path_map[cpu] = NULL;`
			`}`
			`}`

			`static int ioa_add_path(struct path_selector ps, struct dm_path path,`
			`int argc, char argv, char error)`
			`{`
			`struct selector *s = ps->context;`
			`struct path_info *pi = NULL;`
			`unsigned int cpu;`
			`int ret;`

			`if (argc != 1) {`
			`*error = "io-affinity ps: invalid number of arguments";`
			`return -EINVAL;`
			`}`

			`pi = kzalloc(sizeof(*pi), GFP_KERNEL);`
			`if (!pi) {`
			`*error = "io-affinity ps: Error allocating path context";`
			`return -ENOMEM;`
			`}`

			`pi->path = path;`
			`path->pscontext = pi;`
			`refcount_set(&pi->refcount, 1);`

			`if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) {`
			`*error = "io-affinity ps: Error allocating cpumask context";`
			`ret = -ENOMEM;`
			`goto free_pi;`
			`}`

			`ret = cpumask_parse(argv[0], pi->cpumask);`
			`if (ret) {`
			`*error = "io-affinity ps: invalid cpumask";`
			`ret = -EINVAL;`
			`goto free_mask;`
			`}`

			`for_each_cpu(cpu, pi->cpumask) {`
			`if (cpu >= nr_cpu_ids) {`
			`DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u",`
			`cpu, nr_cpu_ids);`
			`break;`
			`}`

			`if (s->path_map[cpu]) {`
			`DMWARN("CPU mapping for %u exists. Ignoring.", cpu);`
			`continue;`
			`}`

			`cpumask_set_cpu(cpu, s->path_mask);`
			`s->path_map[cpu] = pi;`
			`refcount_inc(&pi->refcount);`
			`}`

			`if (refcount_dec_and_test(&pi->refcount)) {`
			`*error = "io-affinity ps: No new/valid CPU mapping found";`
			`ret = -EINVAL;`
			`goto free_mask;`
			`}`

			`return 0;`

			`free_mask:`
			`free_cpumask_var(pi->cpumask);`
			`free_pi:`
			`kfree(pi);`
			`return ret;`
			`}`

			`static int ioa_create(struct path_selector ps, unsigned argc, char *argv)`
			`{`
			`struct selector *s;`

			`s = kmalloc(sizeof(*s), GFP_KERNEL);`
			`if (!s)`
			`return -ENOMEM;`

			`s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *),`
			`GFP_KERNEL);`
			`if (!s->path_map)`
			`goto free_selector;`

			`if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL))`
			`goto free_map;`

			`atomic_set(&s->map_misses, 0);`
			`ps->context = s;`
			`return 0;`

			`free_map:`
			`kfree(s->path_map);`
			`free_selector:`
			`kfree(s);`
			`return -ENOMEM;`
			`}`

			`static void ioa_destroy(struct path_selector *ps)`
			`{`
			`struct selector *s = ps->context;`
			`unsigned cpu;`

			`for_each_cpu(cpu, s->path_mask)`
			`ioa_free_path(s, cpu);`

			`free_cpumask_var(s->path_mask);`
			`kfree(s->path_map);`
			`kfree(s);`

			`ps->context = NULL;`
			`}`

			`static int ioa_status(struct path_selector ps, struct dm_path path,`
			`status_type_t type, char *result, unsigned int maxlen)`
			`{`
			`struct selector *s = ps->context;`
			`struct path_info *pi;`
			`int sz = 0;`

			`if (!path) {`
			`DMEMIT("0 ");`
			`return sz;`
			`}`

			`switch(type) {`
			`case STATUSTYPE_INFO:`
			`DMEMIT("%d ", atomic_read(&s->map_misses));`
			`break;`
			`case STATUSTYPE_TABLE:`
			`pi = path->pscontext;`
			`DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask));`
			`break;`
dm: update target status functions to support IMA measurement For device mapper targets to take advantage of IMA's measurement capabilities, the status functions for the individual targets need to be updated to handle the status_type_t case for value STATUSTYPE_IMA. Update status functions for the following target types, to log their respective attributes to be measured using IMA. 01. cache 02. crypt 03. integrity 04. linear 05. mirror 06. multipath 07. raid 08. snapshot 09. striped 10. verity For rest of the targets, handle the STATUSTYPE_IMA case by setting the measurement buffer to NULL. For IMA to measure the data on a given system, the IMA policy on the system needs to be updated to have the following line, and the system needs to be restarted for the measurements to take effect. /etc/ima/ima-policy measure func=CRITICAL_DATA label=device-mapper template=ima-buf The measurements will be reflected in the IMA logs, which are located at: /sys/kernel/security/integrity/ima/ascii_runtime_measurements /sys/kernel/security/integrity/ima/binary_runtime_measurements These IMA logs can later be consumed by various attestation clients running on the system, and send them to external services for attesting the system. The DM target data measured by IMA subsystem can alternatively be queried from userspace by setting DM_IMA_MEASUREMENT_FLAG with DM_TABLE_STATUS_CMD. Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> 2021-07-13 08:49:03 +08:00			`case STATUSTYPE_IMA:`
			`*result = '\0';`
			`break;`
dm mpath: add IO affinity path selector This patch adds a path selector that selects paths based on a CPU to path mapping the user passes in and what CPU we are executing on. The primary user for this PS is where the app is optimized to use specific CPUs so other PSs undo the apps handy work, and the storage and it's transport are not a bottlneck. For these io-affinity PS setups a path's transport/interconnect perf is not going to flucuate a lot and there is no major differences between paths, so QL/HST smarts do not help and RR always messes up what the app is trying to do. On a system with 16 cores, where you have a job per CPU: fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=4k \ --ioengine=libaio --iodepth=128 --numjobs=16 and a dm-multipath device setup where each CPU is mapped to one path: // When in mq mode I had to set dm_mq_nr_hw_queues=$NUM_PATHS. // Bio mode also showed similar results. 0 16777216 multipath 0 0 1 1 io-affinity 0 16 1 8:16 1 8:32 2 8:64 4 8:48 8 8:80 10 8:96 20 8:112 40 8:128 80 8:144 100 8:160 200 8:176 400 8:192 800 8:208 1000 8:224 2000 8:240 4000 65:0 8000 we can see a IOPs increase of 25%. The percent increase depends on the device and interconnect. For a slower/medium speed path/device that can do around 180K IOPs a path if you ran that fio command to it directly we saw a 25% increase like above. Slower path'd devices that could do around 90K per path showed maybe around a 2 - 5% increase. If you use something like null_blk or scsi_debug which can multi-million IOPs and hack it up so each device they export shows up as a path then you see 50%+ increases. Signed-off-by: Mike Christie <michael.christie@oracle.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> 2020-10-23 08:27:50 +08:00			`}`

			`return sz;`
			`}`

			`static void ioa_fail_path(struct path_selector ps, struct dm_path p)`
			`{`
			`struct path_info *pi = p->pscontext;`

			`pi->failed = true;`
			`}`

			`static int ioa_reinstate_path(struct path_selector ps, struct dm_path p)`
			`{`
			`struct path_info *pi = p->pscontext;`

			`pi->failed = false;`
			`return 0;`
			`}`

			`static struct dm_path ioa_select_path(struct path_selector ps,`
			`size_t nr_bytes)`
			`{`
			`unsigned int cpu, node;`
			`struct selector *s = ps->context;`
			`const struct cpumask *cpumask;`
			`struct path_info *pi;`
			`int i;`

			`cpu = get_cpu();`

			`pi = s->path_map[cpu];`
			`if (pi && !pi->failed)`
			`goto done;`

			`/*`
			`* Perf is not optimal, but we at least try the local node then just`
			`* try not to fail.`
			`*/`
			`if (!pi)`
			`atomic_inc(&s->map_misses);`

			`node = cpu_to_node(cpu);`
			`cpumask = cpumask_of_node(node);`
			`for_each_cpu(i, cpumask) {`
			`pi = s->path_map[i];`
			`if (pi && !pi->failed)`
			`goto done;`
			`}`

			`for_each_cpu(i, s->path_mask) {`
			`pi = s->path_map[i];`
			`if (pi && !pi->failed)`
			`goto done;`
			`}`
			`pi = NULL;`

			`done:`
			`put_cpu();`
			`return pi ? pi->path : NULL;`
			`}`

			`static struct path_selector_type ioa_ps = {`
			`.name = "io-affinity",`
			`.module = THIS_MODULE,`
			`.table_args = 1,`
			`.info_args = 1,`
			`.create = ioa_create,`
			`.destroy = ioa_destroy,`
			`.status = ioa_status,`
			`.add_path = ioa_add_path,`
			`.fail_path = ioa_fail_path,`
			`.reinstate_path = ioa_reinstate_path,`
			`.select_path = ioa_select_path,`
			`};`

			`static int __init dm_ioa_init(void)`
			`{`
			`int ret = dm_register_path_selector(&ioa_ps);`

			`if (ret < 0)`
			`DMERR("register failed %d", ret);`
			`return ret;`
			`}`

			`static void __exit dm_ioa_exit(void)`
			`{`
			`int ret = dm_unregister_path_selector(&ioa_ps);`

			`if (ret < 0)`
			`DMERR("unregister failed %d", ret);`
			`}`

			`module_init(dm_ioa_init);`
			`module_exit(dm_ioa_exit);`

			`MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on");`
			`MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>");`
			`MODULE_LICENSE("GPL");`