566 lines
14 KiB
C
566 lines
14 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Historical Service Time
|
|
*
|
|
* Keeps a time-weighted exponential moving average of the historical
|
|
* service time. Estimates future service time based on the historical
|
|
* service time and the number of outstanding requests.
|
|
*
|
|
* Marks paths stale if they have not finished within hst *
|
|
* num_paths. If a path is stale and unused, we will send a single
|
|
* request to probe in case the path has improved. This situation
|
|
* generally arises if the path is so much worse than others that it
|
|
* will never have the best estimated service time, or if the entire
|
|
* multipath device is unused. If a path is stale and in use, limit the
|
|
* number of requests it can receive with the assumption that the path
|
|
* has become degraded.
|
|
*
|
|
* To avoid repeatedly calculating exponents for time weighting, times
|
|
* are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
|
|
* ns, and the weighting is pre-calculated.
|
|
*
|
|
*/
|
|
|
|
#include "dm.h"
|
|
#include "dm-path-selector.h"
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/module.h>
|
|
#include <linux/sched/clock.h>
|
|
|
|
|
|
#define DM_MSG_PREFIX "multipath historical-service-time"
|
|
#define HST_MIN_IO 1
|
|
#define HST_VERSION "0.1.1"
|
|
|
|
#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */
|
|
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
|
|
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
|
|
#define HST_FIXED_95 972
|
|
#define HST_MAX_INFLIGHT HST_FIXED_1
|
|
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
|
|
#define HST_WEIGHT_COUNT 64ULL
|
|
|
|
struct selector {
|
|
struct list_head valid_paths;
|
|
struct list_head failed_paths;
|
|
int valid_count;
|
|
spinlock_t lock;
|
|
|
|
unsigned int weights[HST_WEIGHT_COUNT];
|
|
unsigned int threshold_multiplier;
|
|
};
|
|
|
|
struct path_info {
|
|
struct list_head list;
|
|
struct dm_path *path;
|
|
unsigned int repeat_count;
|
|
|
|
spinlock_t lock;
|
|
|
|
u64 historical_service_time; /* Fixed point */
|
|
|
|
u64 stale_after;
|
|
u64 last_finish;
|
|
|
|
u64 outstanding;
|
|
};
|
|
|
|
/**
|
|
* fixed_power - compute: x^n, in O(log n) time
|
|
*
|
|
* @x: base of the power
|
|
* @frac_bits: fractional bits of @x
|
|
* @n: power to raise @x to.
|
|
*
|
|
* By exploiting the relation between the definition of the natural power
|
|
* function: x^n := x*x*...*x (x multiplied by itself for n times), and
|
|
* the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
|
|
* (where: n_i \elem {0, 1}, the binary vector representing n),
|
|
* we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
|
|
* of course trivially computable in O(log_2 n), the length of our binary
|
|
* vector.
|
|
*
|
|
* (see: kernel/sched/loadavg.c)
|
|
*/
|
|
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
|
|
{
|
|
unsigned long result = 1UL << frac_bits;
|
|
|
|
if (n) {
|
|
for (;;) {
|
|
if (n & 1) {
|
|
result *= x;
|
|
result += 1UL << (frac_bits - 1);
|
|
result >>= frac_bits;
|
|
}
|
|
n >>= 1;
|
|
if (!n)
|
|
break;
|
|
x *= x;
|
|
x += 1UL << (frac_bits - 1);
|
|
x >>= frac_bits;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Calculate the next value of an exponential moving average
|
|
* a_1 = a_0 * e + a * (1 - e)
|
|
*
|
|
* @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
|
|
* @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
|
|
* @weight: [0, HST_FIXED_1]
|
|
*
|
|
* Note:
|
|
* To account for multiple periods in the same calculation,
|
|
* a_n = a_0 * e^n + a * (1 - e^n),
|
|
* so call fixed_ema(last, next, pow(weight, N))
|
|
*/
|
|
static u64 fixed_ema(u64 last, u64 next, u64 weight)
|
|
{
|
|
last *= weight;
|
|
last += next * (HST_FIXED_1 - weight);
|
|
last += 1ULL << (HST_FIXED_SHIFT - 1);
|
|
return last >> HST_FIXED_SHIFT;
|
|
}
|
|
|
|
static struct selector *alloc_selector(void)
|
|
{
|
|
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
|
|
|
|
if (s) {
|
|
INIT_LIST_HEAD(&s->valid_paths);
|
|
INIT_LIST_HEAD(&s->failed_paths);
|
|
spin_lock_init(&s->lock);
|
|
s->valid_count = 0;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
/*
|
|
* Get the weight for a given time span.
|
|
*/
|
|
static u64 hst_weight(struct path_selector *ps, u64 delta)
|
|
{
|
|
struct selector *s = ps->context;
|
|
int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
|
|
HST_WEIGHT_COUNT - 1);
|
|
|
|
return s->weights[bucket];
|
|
}
|
|
|
|
/*
|
|
* Set up the weights array.
|
|
*
|
|
* weights[len-1] = 0
|
|
* weights[n] = base ^ (n + 1)
|
|
*/
|
|
static void hst_set_weights(struct path_selector *ps, unsigned int base)
|
|
{
|
|
struct selector *s = ps->context;
|
|
int i;
|
|
|
|
if (base >= HST_FIXED_1)
|
|
return;
|
|
|
|
for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
|
|
s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
|
|
s->weights[HST_WEIGHT_COUNT - 1] = 0;
|
|
}
|
|
|
|
static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
|
|
{
|
|
struct selector *s;
|
|
unsigned int base_weight = HST_FIXED_95;
|
|
unsigned int threshold_multiplier = 0;
|
|
char dummy;
|
|
|
|
/*
|
|
* Arguments: [<base_weight> [<threshold_multiplier>]]
|
|
* <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
|
|
* value of 0 will completely ignore any history.
|
|
* If not given, default (HST_FIXED_95) is used.
|
|
* <threshold_multiplier>: Minimum threshold multiplier for paths to
|
|
* be considered different. That is, a path is
|
|
* considered different iff (p1 > N * p2) where p1
|
|
* is the path with higher service time. A threshold
|
|
* of 1 or 0 has no effect. Defaults to 0.
|
|
*/
|
|
if (argc > 2)
|
|
return -EINVAL;
|
|
|
|
if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
|
|
base_weight >= HST_FIXED_1)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (argc > 1 && (sscanf(argv[1], "%u%c",
|
|
&threshold_multiplier, &dummy) != 1)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
s = alloc_selector();
|
|
if (!s)
|
|
return -ENOMEM;
|
|
|
|
ps->context = s;
|
|
|
|
hst_set_weights(ps, base_weight);
|
|
s->threshold_multiplier = threshold_multiplier;
|
|
return 0;
|
|
}
|
|
|
|
static void free_paths(struct list_head *paths)
|
|
{
|
|
struct path_info *pi, *next;
|
|
|
|
list_for_each_entry_safe(pi, next, paths, list) {
|
|
list_del(&pi->list);
|
|
kfree(pi);
|
|
}
|
|
}
|
|
|
|
static void hst_destroy(struct path_selector *ps)
|
|
{
|
|
struct selector *s = ps->context;
|
|
|
|
free_paths(&s->valid_paths);
|
|
free_paths(&s->failed_paths);
|
|
kfree(s);
|
|
ps->context = NULL;
|
|
}
|
|
|
|
static int hst_status(struct path_selector *ps, struct dm_path *path,
|
|
status_type_t type, char *result, unsigned int maxlen)
|
|
{
|
|
unsigned int sz = 0;
|
|
struct path_info *pi;
|
|
|
|
if (!path) {
|
|
struct selector *s = ps->context;
|
|
|
|
DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
|
|
} else {
|
|
pi = path->pscontext;
|
|
|
|
switch (type) {
|
|
case STATUSTYPE_INFO:
|
|
DMEMIT("%llu %llu %llu ", pi->historical_service_time,
|
|
pi->outstanding, pi->stale_after);
|
|
break;
|
|
case STATUSTYPE_TABLE:
|
|
DMEMIT("0 ");
|
|
break;
|
|
case STATUSTYPE_IMA:
|
|
*result = '\0';
|
|
break;
|
|
}
|
|
}
|
|
|
|
return sz;
|
|
}
|
|
|
|
static int hst_add_path(struct path_selector *ps, struct dm_path *path,
|
|
int argc, char **argv, char **error)
|
|
{
|
|
struct selector *s = ps->context;
|
|
struct path_info *pi;
|
|
unsigned int repeat_count = HST_MIN_IO;
|
|
char dummy;
|
|
unsigned long flags;
|
|
|
|
/*
|
|
* Arguments: [<repeat_count>]
|
|
* <repeat_count>: The number of I/Os before switching path.
|
|
* If not given, default (HST_MIN_IO) is used.
|
|
*/
|
|
if (argc > 1) {
|
|
*error = "historical-service-time ps: incorrect number of arguments";
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
|
|
*error = "historical-service-time ps: invalid repeat count";
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* allocate the path */
|
|
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
|
|
if (!pi) {
|
|
*error = "historical-service-time ps: Error allocating path context";
|
|
return -ENOMEM;
|
|
}
|
|
|
|
pi->path = path;
|
|
pi->repeat_count = repeat_count;
|
|
|
|
pi->historical_service_time = HST_FIXED_1;
|
|
|
|
spin_lock_init(&pi->lock);
|
|
pi->outstanding = 0;
|
|
|
|
pi->stale_after = 0;
|
|
pi->last_finish = 0;
|
|
|
|
path->pscontext = pi;
|
|
|
|
spin_lock_irqsave(&s->lock, flags);
|
|
list_add_tail(&pi->list, &s->valid_paths);
|
|
s->valid_count++;
|
|
spin_unlock_irqrestore(&s->lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
|
|
{
|
|
struct selector *s = ps->context;
|
|
struct path_info *pi = path->pscontext;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&s->lock, flags);
|
|
list_move(&pi->list, &s->failed_paths);
|
|
s->valid_count--;
|
|
spin_unlock_irqrestore(&s->lock, flags);
|
|
}
|
|
|
|
static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
|
|
{
|
|
struct selector *s = ps->context;
|
|
struct path_info *pi = path->pscontext;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&s->lock, flags);
|
|
list_move_tail(&pi->list, &s->valid_paths);
|
|
s->valid_count++;
|
|
spin_unlock_irqrestore(&s->lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void hst_fill_compare(struct path_info *pi, u64 *hst,
|
|
u64 *out, u64 *stale)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pi->lock, flags);
|
|
*hst = pi->historical_service_time;
|
|
*out = pi->outstanding;
|
|
*stale = pi->stale_after;
|
|
spin_unlock_irqrestore(&pi->lock, flags);
|
|
}
|
|
|
|
/*
|
|
* Compare the estimated service time of 2 paths, pi1 and pi2,
|
|
* for the incoming I/O.
|
|
*
|
|
* Returns:
|
|
* < 0 : pi1 is better
|
|
* 0 : no difference between pi1 and pi2
|
|
* > 0 : pi2 is better
|
|
*
|
|
*/
|
|
static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
|
|
u64 time_now, struct path_selector *ps)
|
|
{
|
|
struct selector *s = ps->context;
|
|
u64 hst1, hst2;
|
|
long long out1, out2, stale1, stale2;
|
|
int pi2_better, over_threshold;
|
|
|
|
hst_fill_compare(pi1, &hst1, &out1, &stale1);
|
|
hst_fill_compare(pi2, &hst2, &out2, &stale2);
|
|
|
|
/* Check here if estimated latency for two paths are too similar.
|
|
* If this is the case, we skip extra calculation and just compare
|
|
* outstanding requests. In this case, any unloaded paths will
|
|
* be preferred.
|
|
*/
|
|
if (hst1 > hst2)
|
|
over_threshold = hst1 > (s->threshold_multiplier * hst2);
|
|
else
|
|
over_threshold = hst2 > (s->threshold_multiplier * hst1);
|
|
|
|
if (!over_threshold)
|
|
return out1 - out2;
|
|
|
|
/*
|
|
* If an unloaded path is stale, choose it. If both paths are unloaded,
|
|
* choose path that is the most stale.
|
|
* (If one path is loaded, choose the other)
|
|
*/
|
|
if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
|
|
(!out1 && !out2))
|
|
return (!out2 * stale1) - (!out1 * stale2);
|
|
|
|
/* Compare estimated service time. If outstanding is the same, we
|
|
* don't need to multiply
|
|
*/
|
|
if (out1 == out2) {
|
|
pi2_better = hst1 > hst2;
|
|
} else {
|
|
/* Potential overflow with out >= 1024 */
|
|
if (unlikely(out1 >= HST_MAX_INFLIGHT ||
|
|
out2 >= HST_MAX_INFLIGHT)) {
|
|
/* If over 1023 in-flights, we may overflow if hst
|
|
* is at max. (With this shift we still overflow at
|
|
* 1048576 in-flights, which is high enough).
|
|
*/
|
|
hst1 >>= HST_FIXED_SHIFT;
|
|
hst2 >>= HST_FIXED_SHIFT;
|
|
}
|
|
pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
|
|
}
|
|
|
|
/* In the case that the 'winner' is stale, limit to equal usage. */
|
|
if (pi2_better) {
|
|
if (stale2 < time_now)
|
|
return out1 - out2;
|
|
return 1;
|
|
}
|
|
if (stale1 < time_now)
|
|
return out1 - out2;
|
|
return -1;
|
|
}
|
|
|
|
static struct dm_path *hst_select_path(struct path_selector *ps,
|
|
size_t nr_bytes)
|
|
{
|
|
struct selector *s = ps->context;
|
|
struct path_info *pi = NULL, *best = NULL;
|
|
u64 time_now = sched_clock();
|
|
struct dm_path *ret = NULL;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&s->lock, flags);
|
|
if (list_empty(&s->valid_paths))
|
|
goto out;
|
|
|
|
list_for_each_entry(pi, &s->valid_paths, list) {
|
|
if (!best || (hst_compare(pi, best, time_now, ps) < 0))
|
|
best = pi;
|
|
}
|
|
|
|
if (!best)
|
|
goto out;
|
|
|
|
/* Move last used path to end (least preferred in case of ties) */
|
|
list_move_tail(&best->list, &s->valid_paths);
|
|
|
|
ret = best->path;
|
|
|
|
out:
|
|
spin_unlock_irqrestore(&s->lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
static int hst_start_io(struct path_selector *ps, struct dm_path *path,
|
|
size_t nr_bytes)
|
|
{
|
|
struct path_info *pi = path->pscontext;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pi->lock, flags);
|
|
pi->outstanding++;
|
|
spin_unlock_irqrestore(&pi->lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static u64 path_service_time(struct path_info *pi, u64 start_time)
|
|
{
|
|
u64 sched_now = ktime_get_ns();
|
|
|
|
/* if a previous disk request has finished after this IO was
|
|
* sent to the hardware, pretend the submission happened
|
|
* serially.
|
|
*/
|
|
if (time_after64(pi->last_finish, start_time))
|
|
start_time = pi->last_finish;
|
|
|
|
pi->last_finish = sched_now;
|
|
if (time_before64(sched_now, start_time))
|
|
return 0;
|
|
|
|
return sched_now - start_time;
|
|
}
|
|
|
|
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
|
|
size_t nr_bytes, u64 start_time)
|
|
{
|
|
struct path_info *pi = path->pscontext;
|
|
struct selector *s = ps->context;
|
|
unsigned long flags;
|
|
u64 st;
|
|
|
|
spin_lock_irqsave(&pi->lock, flags);
|
|
|
|
st = path_service_time(pi, start_time);
|
|
pi->outstanding--;
|
|
pi->historical_service_time =
|
|
fixed_ema(pi->historical_service_time,
|
|
min(st * HST_FIXED_1, HST_FIXED_MAX),
|
|
hst_weight(ps, st));
|
|
|
|
/*
|
|
* On request end, mark path as fresh. If a path hasn't
|
|
* finished any requests within the fresh period, the estimated
|
|
* service time is considered too optimistic and we limit the
|
|
* maximum requests on that path.
|
|
*/
|
|
pi->stale_after = pi->last_finish +
|
|
(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
|
|
|
|
spin_unlock_irqrestore(&pi->lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static struct path_selector_type hst_ps = {
|
|
.name = "historical-service-time",
|
|
.module = THIS_MODULE,
|
|
.table_args = 1,
|
|
.info_args = 3,
|
|
.create = hst_create,
|
|
.destroy = hst_destroy,
|
|
.status = hst_status,
|
|
.add_path = hst_add_path,
|
|
.fail_path = hst_fail_path,
|
|
.reinstate_path = hst_reinstate_path,
|
|
.select_path = hst_select_path,
|
|
.start_io = hst_start_io,
|
|
.end_io = hst_end_io,
|
|
};
|
|
|
|
static int __init dm_hst_init(void)
|
|
{
|
|
int r = dm_register_path_selector(&hst_ps);
|
|
|
|
if (r < 0)
|
|
DMERR("register failed %d", r);
|
|
|
|
DMINFO("version " HST_VERSION " loaded");
|
|
|
|
return r;
|
|
}
|
|
|
|
static void __exit dm_hst_exit(void)
|
|
{
|
|
int r = dm_unregister_path_selector(&hst_ps);
|
|
|
|
if (r < 0)
|
|
DMERR("unregister failed %d", r);
|
|
}
|
|
|
|
module_init(dm_hst_init);
|
|
module_exit(dm_hst_exit);
|
|
|
|
MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
|
|
MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
|
|
MODULE_LICENSE("GPL");
|