Merge tag 'ipvs-for-v4.18' of http://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next
Simon Horman says: ==================== IPVS Updates for v4.18 please consider these IPVS enhancements for v4.18. * Whitespace cleanup * Add Maglev hashing algorithm as an IPVS scheduler Inju Song says "Implements Google's Maglev hashing algorithm as an IPVS scheduler. Basically it provides consistent hashing but offers some special features about disruption and load balancing. 1) minimal disruption: when the set of destinations changes, a connection will likely be sent to the same destination as it was before. 2) load balancing: each destination will receive an almost equal number of connections. See also: [3.4 Consistent Hashing] in https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf " * Fix to correct implementation of Knuth's multiplicative hashing which is used in sh/dh/lblc/lblcr algorithms. Instead the implementation provided by the hash_32() macro is used. ==================== Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
This commit is contained in:
commit
146cd6b5d5
|
@ -668,6 +668,7 @@ struct ip_vs_dest {
|
||||||
volatile unsigned int flags; /* dest status flags */
|
volatile unsigned int flags; /* dest status flags */
|
||||||
atomic_t conn_flags; /* flags to copy to conn */
|
atomic_t conn_flags; /* flags to copy to conn */
|
||||||
atomic_t weight; /* server weight */
|
atomic_t weight; /* server weight */
|
||||||
|
atomic_t last_weight; /* server latest weight */
|
||||||
|
|
||||||
refcount_t refcnt; /* reference counter */
|
refcount_t refcnt; /* reference counter */
|
||||||
struct ip_vs_stats stats; /* statistics */
|
struct ip_vs_stats stats; /* statistics */
|
||||||
|
|
|
@ -225,6 +225,25 @@ config IP_VS_SH
|
||||||
If you want to compile it in kernel, say Y. To compile it as a
|
If you want to compile it in kernel, say Y. To compile it as a
|
||||||
module, choose M here. If unsure, say N.
|
module, choose M here. If unsure, say N.
|
||||||
|
|
||||||
|
config IP_VS_MH
|
||||||
|
tristate "maglev hashing scheduling"
|
||||||
|
---help---
|
||||||
|
The maglev consistent hashing scheduling algorithm provides the
|
||||||
|
Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
|
||||||
|
network connections to the servers through looking up a statically
|
||||||
|
assigned special hash table called the lookup table. Maglev hashing
|
||||||
|
is to assign a preference list of all the lookup table positions
|
||||||
|
to each destination.
|
||||||
|
|
||||||
|
Through this operation, the maglev hashing gives an almost equal
|
||||||
|
share of the lookup table to each of the destinations and provides
|
||||||
|
minimal disruption by using the lookup table. When the set of
|
||||||
|
destinations changes, a connection will likely be sent to the same
|
||||||
|
destination as it was before.
|
||||||
|
|
||||||
|
If you want to compile it in kernel, say Y. To compile it as a
|
||||||
|
module, choose M here. If unsure, say N.
|
||||||
|
|
||||||
config IP_VS_SED
|
config IP_VS_SED
|
||||||
tristate "shortest expected delay scheduling"
|
tristate "shortest expected delay scheduling"
|
||||||
---help---
|
---help---
|
||||||
|
@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
|
||||||
needs to be large enough to effectively fit all the destinations
|
needs to be large enough to effectively fit all the destinations
|
||||||
multiplied by their respective weights.
|
multiplied by their respective weights.
|
||||||
|
|
||||||
|
comment 'IPVS MH scheduler'
|
||||||
|
|
||||||
|
config IP_VS_MH_TAB_INDEX
|
||||||
|
int "IPVS maglev hashing table index of size (the prime numbers)"
|
||||||
|
range 8 17
|
||||||
|
default 12
|
||||||
|
---help---
|
||||||
|
The maglev hashing scheduler maps source IPs to destinations
|
||||||
|
stored in a hash table. This table is assigned by a preference
|
||||||
|
list of the positions to each destination until all slots in
|
||||||
|
the table are filled. The index determines the prime for size of
|
||||||
|
the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
|
||||||
|
65521 or 131071. When using weights to allow destinations to
|
||||||
|
receive more connections, the table is assigned an amount
|
||||||
|
proportional to the weights specified. The table needs to be large
|
||||||
|
enough to effectively fit all the destinations multiplied by their
|
||||||
|
respective weights.
|
||||||
|
|
||||||
comment 'IPVS application helper'
|
comment 'IPVS application helper'
|
||||||
|
|
||||||
config IP_VS_FTP
|
config IP_VS_FTP
|
||||||
|
|
|
@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
|
||||||
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
|
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
|
||||||
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
|
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
|
||||||
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
|
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
|
||||||
|
obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
|
||||||
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
|
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
|
||||||
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
|
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
|
||||||
|
|
||||||
|
|
|
@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
|
||||||
if (add && udest->af != svc->af)
|
if (add && udest->af != svc->af)
|
||||||
ipvs->mixed_address_family_dests++;
|
ipvs->mixed_address_family_dests++;
|
||||||
|
|
||||||
|
/* keep the last_weight with latest non-0 weight */
|
||||||
|
if (add || udest->weight != 0)
|
||||||
|
atomic_set(&dest->last_weight, udest->weight);
|
||||||
|
|
||||||
/* set the weight and the flags */
|
/* set the weight and the flags */
|
||||||
atomic_set(&dest->weight, udest->weight);
|
atomic_set(&dest->weight, udest->weight);
|
||||||
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
|
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
|
||||||
|
|
|
@ -43,6 +43,7 @@
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/skbuff.h>
|
#include <linux/skbuff.h>
|
||||||
|
#include <linux/hash.h>
|
||||||
|
|
||||||
#include <net/ip_vs.h>
|
#include <net/ip_vs.h>
|
||||||
|
|
||||||
|
@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
|
||||||
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
||||||
addr->ip6[2]^addr->ip6[3];
|
addr->ip6[2]^addr->ip6[3];
|
||||||
#endif
|
#endif
|
||||||
return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
|
return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,7 @@
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/skbuff.h>
|
#include <linux/skbuff.h>
|
||||||
#include <linux/jiffies.h>
|
#include <linux/jiffies.h>
|
||||||
|
#include <linux/hash.h>
|
||||||
|
|
||||||
/* for sysctl */
|
/* for sysctl */
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
|
||||||
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
||||||
addr->ip6[2]^addr->ip6[3];
|
addr->ip6[2]^addr->ip6[3];
|
||||||
#endif
|
#endif
|
||||||
return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
|
return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,7 @@
|
||||||
#include <linux/jiffies.h>
|
#include <linux/jiffies.h>
|
||||||
#include <linux/list.h>
|
#include <linux/list.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
|
#include <linux/hash.h>
|
||||||
|
|
||||||
/* for sysctl */
|
/* for sysctl */
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
|
||||||
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
||||||
addr->ip6[2]^addr->ip6[3];
|
addr->ip6[2]^addr->ip6[3];
|
||||||
#endif
|
#endif
|
||||||
return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
|
return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,540 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/* IPVS: Maglev Hashing scheduling module
|
||||||
|
*
|
||||||
|
* Authors: Inju Song <inju.song@navercorp.com>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* The mh algorithm is to assign a preference list of all the lookup
|
||||||
|
* table positions to each destination and populate the table with
|
||||||
|
* the most-preferred position of destinations. Then it is to select
|
||||||
|
* destination with the hash key of source IP address through looking
|
||||||
|
* up the lookup table.
|
||||||
|
*
|
||||||
|
* The algorithm is detailed in:
|
||||||
|
* [3.4 Consistent Hashing]
|
||||||
|
https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define KMSG_COMPONENT "IPVS"
|
||||||
|
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
|
||||||
|
|
||||||
|
#include <linux/ip.h>
|
||||||
|
#include <linux/slab.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/skbuff.h>
|
||||||
|
|
||||||
|
#include <net/ip_vs.h>
|
||||||
|
|
||||||
|
#include <linux/siphash.h>
|
||||||
|
#include <linux/bitops.h>
|
||||||
|
#include <linux/gcd.h>
|
||||||
|
|
||||||
|
#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
|
||||||
|
#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
|
||||||
|
|
||||||
|
struct ip_vs_mh_lookup {
|
||||||
|
struct ip_vs_dest __rcu *dest; /* real server (cache) */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Per-destination scratch data used while the lookup table is rebuilt. */
struct ip_vs_mh_dest_setup {
	unsigned int	offset;	/* first table position to try */
	unsigned int	skip;	/* step added to perm when a position is taken */
	unsigned int	perm;	/* next table position to try */
	int		turns;	/* slots claimed per round: weight / gcd, right-shifted */
};
|
||||||
|
|
||||||
|
/* Available prime numbers for the MH table size.
 *
 * The table is only ever read, so it is const.
 */
static const int primes[] = {251, 509, 1021, 2039, 4093,
			     8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
/* Number of weight bits kept when weights are scaled down */
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
/* Configured index rebased so that 8 selects primes[0] */
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
/* Parenthesized so the expansion is safe inside larger expressions */
#define IP_VS_MH_TAB_SIZE		(primes[IP_VS_MH_TAB_INDEX])
|
||||||
|
|
||||||
|
struct ip_vs_mh_state {
|
||||||
|
struct rcu_head rcu_head;
|
||||||
|
struct ip_vs_mh_lookup *lookup;
|
||||||
|
struct ip_vs_mh_dest_setup *dest_setup;
|
||||||
|
hsiphash_key_t hash1, hash2;
|
||||||
|
int gcd;
|
||||||
|
int rshift;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Fill both hsiphash keys with fixed constants.
 *
 * @hash1: key whose two words are set to 2654435761
 * @hash2: key whose two words are set to 2654446892
 */
static inline void generate_hash_secret(hsiphash_key_t *hash1,
					hsiphash_key_t *hash2)
{
	int word;

	for (word = 0; word < 2; word++) {
		hash1->key[word] = 2654435761UL;
		hash2->key[word] = 2654446892UL;
	}
}
|
||||||
|
|
||||||
|
/* Helper function to determine if server is unavailable */
|
||||||
|
static inline bool is_unavailable(struct ip_vs_dest *dest)
|
||||||
|
{
|
||||||
|
return atomic_read(&dest->weight) <= 0 ||
|
||||||
|
dest->flags & IP_VS_DEST_F_OVERLOAD;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns hash value for IPVS MH entry */
|
||||||
|
static inline unsigned int
|
||||||
|
ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
|
||||||
|
__be16 port, hsiphash_key_t *key, unsigned int offset)
|
||||||
|
{
|
||||||
|
unsigned int v;
|
||||||
|
__be32 addr_fold = addr->ip;
|
||||||
|
|
||||||
|
#ifdef CONFIG_IP_VS_IPV6
|
||||||
|
if (af == AF_INET6)
|
||||||
|
addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
|
||||||
|
addr->ip6[2] ^ addr->ip6[3];
|
||||||
|
#endif
|
||||||
|
v = (offset + ntohs(port) + ntohl(addr_fold));
|
||||||
|
return hsiphash(&v, sizeof(v), key);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reset all the hash buckets of the specified table. */
|
||||||
|
static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct ip_vs_mh_lookup *l;
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
|
||||||
|
l = &s->lookup[0];
|
||||||
|
for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
|
||||||
|
dest = rcu_dereference_protected(l->dest, 1);
|
||||||
|
if (dest) {
|
||||||
|
ip_vs_dest_put(dest);
|
||||||
|
RCU_INIT_POINTER(l->dest, NULL);
|
||||||
|
}
|
||||||
|
l++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
|
||||||
|
struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
struct list_head *p;
|
||||||
|
struct ip_vs_mh_dest_setup *ds;
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
int lw;
|
||||||
|
|
||||||
|
/* If gcd is smaller then 1, number of dests or
|
||||||
|
* all last_weight of dests are zero. So, skip
|
||||||
|
* permutation for the dests.
|
||||||
|
*/
|
||||||
|
if (s->gcd < 1)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Set dest_setup for the dests permutation */
|
||||||
|
p = &svc->destinations;
|
||||||
|
ds = &s->dest_setup[0];
|
||||||
|
while ((p = p->next) != &svc->destinations) {
|
||||||
|
dest = list_entry(p, struct ip_vs_dest, n_list);
|
||||||
|
|
||||||
|
ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
|
||||||
|
dest->port, &s->hash1, 0) %
|
||||||
|
IP_VS_MH_TAB_SIZE;
|
||||||
|
ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
|
||||||
|
dest->port, &s->hash2, 0) %
|
||||||
|
(IP_VS_MH_TAB_SIZE - 1) + 1;
|
||||||
|
ds->perm = ds->offset;
|
||||||
|
|
||||||
|
lw = atomic_read(&dest->last_weight);
|
||||||
|
ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
|
||||||
|
ds++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
|
||||||
|
struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
int n, c, dt_count;
|
||||||
|
unsigned long *table;
|
||||||
|
struct list_head *p;
|
||||||
|
struct ip_vs_mh_dest_setup *ds;
|
||||||
|
struct ip_vs_dest *dest, *new_dest;
|
||||||
|
|
||||||
|
/* If gcd is smaller then 1, number of dests or
|
||||||
|
* all last_weight of dests are zero. So, skip
|
||||||
|
* the population for the dests and reset lookup table.
|
||||||
|
*/
|
||||||
|
if (s->gcd < 1) {
|
||||||
|
ip_vs_mh_reset(s);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
|
||||||
|
sizeof(unsigned long), GFP_KERNEL);
|
||||||
|
if (!table)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
p = &svc->destinations;
|
||||||
|
n = 0;
|
||||||
|
dt_count = 0;
|
||||||
|
while (n < IP_VS_MH_TAB_SIZE) {
|
||||||
|
if (p == &svc->destinations)
|
||||||
|
p = p->next;
|
||||||
|
|
||||||
|
ds = &s->dest_setup[0];
|
||||||
|
while (p != &svc->destinations) {
|
||||||
|
/* Ignore added server with zero weight */
|
||||||
|
if (ds->turns < 1) {
|
||||||
|
p = p->next;
|
||||||
|
ds++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
c = ds->perm;
|
||||||
|
while (test_bit(c, table)) {
|
||||||
|
/* Add skip, mod IP_VS_MH_TAB_SIZE */
|
||||||
|
ds->perm += ds->skip;
|
||||||
|
if (ds->perm >= IP_VS_MH_TAB_SIZE)
|
||||||
|
ds->perm -= IP_VS_MH_TAB_SIZE;
|
||||||
|
c = ds->perm;
|
||||||
|
}
|
||||||
|
|
||||||
|
__set_bit(c, table);
|
||||||
|
|
||||||
|
dest = rcu_dereference_protected(s->lookup[c].dest, 1);
|
||||||
|
new_dest = list_entry(p, struct ip_vs_dest, n_list);
|
||||||
|
if (dest != new_dest) {
|
||||||
|
if (dest)
|
||||||
|
ip_vs_dest_put(dest);
|
||||||
|
ip_vs_dest_hold(new_dest);
|
||||||
|
RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (++n == IP_VS_MH_TAB_SIZE)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
if (++dt_count >= ds->turns) {
|
||||||
|
dt_count = 0;
|
||||||
|
p = p->next;
|
||||||
|
ds++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
kfree(table);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get ip_vs_dest associated with supplied parameters. */
|
||||||
|
static inline struct ip_vs_dest *
|
||||||
|
ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
|
||||||
|
const union nf_inet_addr *addr, __be16 port)
|
||||||
|
{
|
||||||
|
unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
|
||||||
|
% IP_VS_MH_TAB_SIZE;
|
||||||
|
struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
|
||||||
|
|
||||||
|
return (!dest || is_unavailable(dest)) ? NULL : dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
|
||||||
|
static inline struct ip_vs_dest *
|
||||||
|
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
|
||||||
|
const union nf_inet_addr *addr, __be16 port)
|
||||||
|
{
|
||||||
|
unsigned int offset, roffset;
|
||||||
|
unsigned int hash, ihash;
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
|
||||||
|
/* First try the dest it's supposed to go to */
|
||||||
|
ihash = ip_vs_mh_hashkey(svc->af, addr, port,
|
||||||
|
&s->hash1, 0) % IP_VS_MH_TAB_SIZE;
|
||||||
|
dest = rcu_dereference(s->lookup[ihash].dest);
|
||||||
|
if (!dest)
|
||||||
|
return NULL;
|
||||||
|
if (!is_unavailable(dest))
|
||||||
|
return dest;
|
||||||
|
|
||||||
|
IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
|
||||||
|
IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
|
||||||
|
|
||||||
|
/* If the original dest is unavailable, loop around the table
|
||||||
|
* starting from ihash to find a new dest
|
||||||
|
*/
|
||||||
|
for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
|
||||||
|
roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
|
||||||
|
hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
|
||||||
|
roffset) % IP_VS_MH_TAB_SIZE;
|
||||||
|
dest = rcu_dereference(s->lookup[hash].dest);
|
||||||
|
if (!dest)
|
||||||
|
break;
|
||||||
|
if (!is_unavailable(dest))
|
||||||
|
return dest;
|
||||||
|
IP_VS_DBG_BUF(6,
|
||||||
|
"MH: selected unavailable server %s:%u (offset %u), reselecting",
|
||||||
|
IP_VS_DBG_ADDR(dest->af, &dest->addr),
|
||||||
|
ntohs(dest->port), roffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Assign all the hash buckets of the specified table with the service. */
|
||||||
|
static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
|
||||||
|
struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
if (svc->num_dests > IP_VS_MH_TAB_SIZE)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (svc->num_dests >= 1) {
|
||||||
|
s->dest_setup = kcalloc(svc->num_dests,
|
||||||
|
sizeof(struct ip_vs_mh_dest_setup),
|
||||||
|
GFP_KERNEL);
|
||||||
|
if (!s->dest_setup)
|
||||||
|
return -ENOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
ip_vs_mh_permutate(s, svc);
|
||||||
|
|
||||||
|
ret = ip_vs_mh_populate(s, svc);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
|
||||||
|
IP_VS_DBG_ADDR(svc->af, &svc->addr),
|
||||||
|
ntohs(svc->port));
|
||||||
|
|
||||||
|
out:
|
||||||
|
if (svc->num_dests >= 1) {
|
||||||
|
kfree(s->dest_setup);
|
||||||
|
s->dest_setup = NULL;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
int weight;
|
||||||
|
int g = 0;
|
||||||
|
|
||||||
|
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||||
|
weight = atomic_read(&dest->last_weight);
|
||||||
|
if (weight > 0) {
|
||||||
|
if (g > 0)
|
||||||
|
g = gcd(weight, g);
|
||||||
|
else
|
||||||
|
g = weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* To avoid assigning huge weight for the MH table,
|
||||||
|
* calculate shift value with gcd.
|
||||||
|
*/
|
||||||
|
static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
|
||||||
|
{
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
int new_weight, weight = 0;
|
||||||
|
int mw, shift;
|
||||||
|
|
||||||
|
/* If gcd is smaller then 1, number of dests or
|
||||||
|
* all last_weight of dests are zero. So, return
|
||||||
|
* shift value as zero.
|
||||||
|
*/
|
||||||
|
if (gcd < 1)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||||
|
new_weight = atomic_read(&dest->last_weight);
|
||||||
|
if (new_weight > weight)
|
||||||
|
weight = new_weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Because gcd is greater than zero,
|
||||||
|
* the maximum weight and gcd are always greater than zero
|
||||||
|
*/
|
||||||
|
mw = weight / gcd;
|
||||||
|
|
||||||
|
/* shift = occupied bits of weight/gcd - MH highest bits */
|
||||||
|
shift = fls(mw) - IP_VS_MH_TAB_BITS;
|
||||||
|
return (shift >= 0) ? shift : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ip_vs_mh_state_free(struct rcu_head *head)
|
||||||
|
{
|
||||||
|
struct ip_vs_mh_state *s;
|
||||||
|
|
||||||
|
s = container_of(head, struct ip_vs_mh_state, rcu_head);
|
||||||
|
kfree(s->lookup);
|
||||||
|
kfree(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
struct ip_vs_mh_state *s;
|
||||||
|
|
||||||
|
/* Allocate the MH table for this service */
|
||||||
|
s = kzalloc(sizeof(*s), GFP_KERNEL);
|
||||||
|
if (!s)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
|
||||||
|
GFP_KERNEL);
|
||||||
|
if (!s->lookup) {
|
||||||
|
kfree(s);
|
||||||
|
return -ENOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
generate_hash_secret(&s->hash1, &s->hash2);
|
||||||
|
s->gcd = ip_vs_mh_gcd_weight(svc);
|
||||||
|
s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
|
||||||
|
|
||||||
|
IP_VS_DBG(6,
|
||||||
|
"MH lookup table (memory=%zdbytes) allocated for current service\n",
|
||||||
|
sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
|
||||||
|
|
||||||
|
/* Assign the lookup table with current dests */
|
||||||
|
ret = ip_vs_mh_reassign(s, svc);
|
||||||
|
if (ret < 0) {
|
||||||
|
ip_vs_mh_reset(s);
|
||||||
|
ip_vs_mh_state_free(&s->rcu_head);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No more failures, attach state */
|
||||||
|
svc->sched_data = s;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
|
||||||
|
{
|
||||||
|
struct ip_vs_mh_state *s = svc->sched_data;
|
||||||
|
|
||||||
|
/* Got to clean up lookup entry here */
|
||||||
|
ip_vs_mh_reset(s);
|
||||||
|
|
||||||
|
call_rcu(&s->rcu_head, ip_vs_mh_state_free);
|
||||||
|
IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
|
||||||
|
sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
|
||||||
|
struct ip_vs_dest *dest)
|
||||||
|
{
|
||||||
|
struct ip_vs_mh_state *s = svc->sched_data;
|
||||||
|
|
||||||
|
s->gcd = ip_vs_mh_gcd_weight(svc);
|
||||||
|
s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
|
||||||
|
|
||||||
|
/* Assign the lookup table with the updated service */
|
||||||
|
return ip_vs_mh_reassign(s, svc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Helper function to get port number */
|
||||||
|
static inline __be16
|
||||||
|
ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
|
||||||
|
{
|
||||||
|
__be16 _ports[2], *ports;
|
||||||
|
|
||||||
|
/* At this point we know that we have a valid packet of some kind.
|
||||||
|
* Because ICMP packets are only guaranteed to have the first 8
|
||||||
|
* bytes, let's just grab the ports. Fortunately they're in the
|
||||||
|
* same position for all three of the protocols we care about.
|
||||||
|
*/
|
||||||
|
switch (iph->protocol) {
|
||||||
|
case IPPROTO_TCP:
|
||||||
|
case IPPROTO_UDP:
|
||||||
|
case IPPROTO_SCTP:
|
||||||
|
ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
|
||||||
|
&_ports);
|
||||||
|
if (unlikely(!ports))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (likely(!ip_vs_iph_inverse(iph)))
|
||||||
|
return ports[0];
|
||||||
|
else
|
||||||
|
return ports[1];
|
||||||
|
default:
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Maglev Hashing scheduling */
|
||||||
|
static struct ip_vs_dest *
|
||||||
|
ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
|
||||||
|
struct ip_vs_iphdr *iph)
|
||||||
|
{
|
||||||
|
struct ip_vs_dest *dest;
|
||||||
|
struct ip_vs_mh_state *s;
|
||||||
|
__be16 port = 0;
|
||||||
|
const union nf_inet_addr *hash_addr;
|
||||||
|
|
||||||
|
hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
|
||||||
|
|
||||||
|
IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
|
||||||
|
|
||||||
|
if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
|
||||||
|
port = ip_vs_mh_get_port(skb, iph);
|
||||||
|
|
||||||
|
s = (struct ip_vs_mh_state *)svc->sched_data;
|
||||||
|
|
||||||
|
if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
|
||||||
|
dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
|
||||||
|
else
|
||||||
|
dest = ip_vs_mh_get(svc, s, hash_addr, port);
|
||||||
|
|
||||||
|
if (!dest) {
|
||||||
|
ip_vs_scheduler_err(svc, "no destination available");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
|
||||||
|
IP_VS_DBG_ADDR(svc->af, hash_addr),
|
||||||
|
ntohs(port),
|
||||||
|
IP_VS_DBG_ADDR(dest->af, &dest->addr),
|
||||||
|
ntohs(dest->port));
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* IPVS MH Scheduler structure */
|
||||||
|
static struct ip_vs_scheduler ip_vs_mh_scheduler = {
|
||||||
|
.name = "mh",
|
||||||
|
.refcnt = ATOMIC_INIT(0),
|
||||||
|
.module = THIS_MODULE,
|
||||||
|
.n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
|
||||||
|
.init_service = ip_vs_mh_init_svc,
|
||||||
|
.done_service = ip_vs_mh_done_svc,
|
||||||
|
.add_dest = ip_vs_mh_dest_changed,
|
||||||
|
.del_dest = ip_vs_mh_dest_changed,
|
||||||
|
.upd_dest = ip_vs_mh_dest_changed,
|
||||||
|
.schedule = ip_vs_mh_schedule,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Module entry point: register the MH scheduler and return the result
 * of the registration.
 */
static int __init ip_vs_mh_init(void)
{
	int err = register_ip_vs_scheduler(&ip_vs_mh_scheduler);

	return err;
}
|
||||||
|
|
||||||
|
/* Module exit point: unregister the MH scheduler, then call
 * rcu_barrier(). ip_vs_mh_done_svc() queues ip_vs_mh_state_free() via
 * call_rcu(); the barrier presumably lets those callbacks finish before
 * the module is unloaded.
 */
static void __exit ip_vs_mh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);

	rcu_barrier();
}
|
||||||
|
|
||||||
|
module_init(ip_vs_mh_init);
|
||||||
|
module_exit(ip_vs_mh_cleanup);
|
||||||
|
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
|
|
@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
|
||||||
return tcp_state_active_table[state];
|
return tcp_state_active_table[state];
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct tcp_states_t tcp_states [] = {
|
static struct tcp_states_t tcp_states[] = {
|
||||||
/* INPUT */
|
/* INPUT */
|
||||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||||
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
|
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
|
||||||
|
@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
|
||||||
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
|
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
|
||||||
};
|
};
|
||||||
|
|
||||||
static struct tcp_states_t tcp_states_dos [] = {
|
static struct tcp_states_t tcp_states_dos[] = {
|
||||||
/* INPUT */
|
/* INPUT */
|
||||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||||
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
|
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
|
||||||
|
|
|
@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
|
||||||
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
||||||
addr->ip6[2]^addr->ip6[3];
|
addr->ip6[2]^addr->ip6[3];
|
||||||
#endif
|
#endif
|
||||||
return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
|
return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
|
||||||
|
IP_VS_SH_TAB_BITS)) &
|
||||||
IP_VS_SH_TAB_MASK;
|
IP_VS_SH_TAB_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue