selftests/bpf: Add cls_redirect classifier
cls_redirect is a TC clsact based replacement for the glb-redirect iptables module available at [1]. It enables what GitHub calls "second chance" flows [2], similarly proposed by the Beamer paper [3]. In contrast to glb-redirect, it also supports migrating UDP flows as long as connected sockets are used. cls_redirect is in production at Cloudflare, as part of our own L4 load balancer. We have modified the encapsulation format slightly from glb-redirect: glbgue_chained_routing.private_data_type has been repurposed to form a version field and several flags. Both have been arranged in a way that a private_data_type value of zero matches the current glb-redirect behaviour. This means that cls_redirect will understand packets in glb-redirect format, but not vice versa. The test suite only covers basic features. For example, cls_redirect will correctly forward path MTU discovery packets, but this is not exercised. It is also possible to switch the encapsulation format to GRE on the last hop, which is also not tested. There are two major distinctions from glb-redirect: first, cls_redirect relies on receiving encapsulated packets directly from a router. This is because we don't have access to the neighbour tables from BPF, yet. See forward_to_next_hop for details. Second, cls_redirect performs decapsulation instead of using separate ipip and sit tunnel devices. This avoids issues with the sit tunnel [4] and makes deploying the classifier easier: decapsulated packets appear on the same interface, so existing firewall rules continue to work as expected. The code base started its life on v4.19, so there are most likely still holdovers from old workarounds. In no particular order: - The function buf_off is required to defeat a clang optimization that leads to the verifier rejecting the program due to pointer arithmetic in the wrong order. - The function pkt_parse_ipv6 is force inlined, because it would otherwise be rejected due to returning a pointer to stack memory.
- The functions fill_tuple and classify_tcp contain kludges, because we've run out of function arguments. - The logic in general is rather nested, due to verifier restrictions. I think this is either because the verifier loses track of constants on the stack, or because it can't track enum like variables. 1: https://github.com/github/glb-director/tree/master/src/glb-redirect 2: https://github.com/github/glb-director/blob/master/docs/development/second-chance-design.md 3: https://www.usenix.org/conference/nsdi18/presentation/olteanu 4: https://github.com/github/glb-director/issues/64 Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20200424185556.7358-2-lmb@cloudflare.com
This commit is contained in:
parent
6f8a57ccf8
commit
234589012b
|
@ -0,0 +1,456 @@
|
|||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
// Copyright (c) 2020 Cloudflare
|
||||
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <linux/pkt_cls.h>
|
||||
|
||||
#include <test_progs.h>
|
||||
|
||||
#include "progs/test_cls_redirect.h"
|
||||
#include "test_cls_redirect.skel.h"
|
||||
|
||||
#define ENCAP_IP INADDR_LOOPBACK
|
||||
#define ENCAP_PORT (1234)
|
||||
|
||||
/* One endpoint of a flow: a port plus either an IPv4 or an IPv6 address.
 *
 * The two address forms share storage; the structure itself does not
 * record which one is in use.
 */
struct addr_port {
	/* Copied verbatim from sin_port / sin6_port by fill_addr_port(). */
	in_port_t port;

	union {
		struct in_addr in_addr;		/* IPv4 address */
		struct in6_addr in6_addr;	/* IPv6 address */
	};
};
|
||||
|
||||
struct tuple {
|
||||
int family;
|
||||
struct addr_port src;
|
||||
struct addr_port dst;
|
||||
};
|
||||
|
||||
/* Create a socket of the given type bound to @addr; stream sockets are
 * additionally put into the listening state (backlog 128).
 *
 * Returns the socket fd, or -1 after recording a test failure. The
 * socket is closed again on every failure path.
 */
static int start_server(const struct sockaddr *addr, socklen_t len, int type)
{
	int sock;

	sock = socket(addr->sa_family, type, 0);
	if (CHECK_FAIL(sock == -1))
		return -1;

	if (CHECK_FAIL(bind(sock, addr, len) == -1)) {
		close(sock);
		return -1;
	}

	/* listen() only applies to stream sockets. */
	if (type == SOCK_STREAM && CHECK_FAIL(listen(sock, 128) == -1)) {
		close(sock);
		return -1;
	}

	return sock;
}
|
||||
|
||||
/* Create a socket of the given type and connect() it to @addr.
 *
 * Returns the connected fd, or -1 after recording a test failure; the
 * socket is closed if connect() fails.
 */
static int connect_to_server(const struct sockaddr *addr, socklen_t len,
			     int type)
{
	int sock;

	sock = socket(addr->sa_family, type, 0);
	if (CHECK_FAIL(sock == -1))
		return -1;

	if (CHECK_FAIL(connect(sock, addr, len))) {
		close(sock);
		return -1;
	}

	return sock;
}
|
||||
|
||||
static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
|
||||
{
|
||||
const struct sockaddr_in6 *in6;
|
||||
const struct sockaddr_in *in;
|
||||
|
||||
switch (sa->sa_family) {
|
||||
case AF_INET:
|
||||
in = (const struct sockaddr_in *)sa;
|
||||
ap->in_addr = in->sin_addr;
|
||||
ap->port = in->sin_port;
|
||||
return true;
|
||||
|
||||
case AF_INET6:
|
||||
in6 = (const struct sockaddr_in6 *)sa;
|
||||
ap->in6_addr = in6->sin6_addr;
|
||||
ap->port = in6->sin6_port;
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
|
||||
int *server, int *conn, struct tuple *tuple)
|
||||
{
|
||||
struct sockaddr_storage ss;
|
||||
socklen_t slen = sizeof(ss);
|
||||
struct sockaddr *sa = (struct sockaddr *)&ss;
|
||||
|
||||
*server = start_server(addr, len, type);
|
||||
if (*server < 0)
|
||||
return false;
|
||||
|
||||
if (CHECK_FAIL(getsockname(*server, sa, &slen)))
|
||||
goto close_server;
|
||||
|
||||
*conn = connect_to_server(sa, slen, type);
|
||||
if (*conn < 0)
|
||||
goto close_server;
|
||||
|
||||
/* We want to simulate packets arriving at conn, so we have to
|
||||
* swap src and dst.
|
||||
*/
|
||||
slen = sizeof(ss);
|
||||
if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
|
||||
goto close_conn;
|
||||
|
||||
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
|
||||
goto close_conn;
|
||||
|
||||
slen = sizeof(ss);
|
||||
if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
|
||||
goto close_conn;
|
||||
|
||||
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
|
||||
goto close_conn;
|
||||
|
||||
tuple->family = ss.ss_family;
|
||||
return true;
|
||||
|
||||
close_conn:
|
||||
close(*conn);
|
||||
*conn = -1;
|
||||
close_server:
|
||||
close(*server);
|
||||
*server = -1;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Fill @addr with the loopback address of the requested family.
 *
 * Only the leading sockaddr_in / sockaddr_in6 portion of @addr is zeroed
 * and written; the port is left as 0.
 *
 * Returns the length of the address written, or 0 (after printing a
 * diagnostic to stderr) if @family is neither AF_INET nor AF_INET6.
 */
static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
{
	struct sockaddr_in *addr4;
	struct sockaddr_in6 *addr6;

	switch (family) {
	case AF_INET:
		addr4 = (struct sockaddr_in *)addr;
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = family;
		addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		return sizeof(*addr4);
	case AF_INET6:
		addr6 = (struct sockaddr_in6 *)addr;
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = family;
		addr6->sin6_addr = in6addr_loopback;
		return sizeof(*addr6);
	default:
		/* Terminate the line so later output is not appended to it. */
		fprintf(stderr, "Invalid family %d\n", family);
		return 0;
	}
}
|
||||
|
||||
static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
|
||||
{
|
||||
return tattr->data_size_out < tattr->data_size_in;
|
||||
}
|
||||
|
||||
/* Inner transport protocol of a test case. The values double as the
 * first index of the servers/conns/tuples arrays in test_cls_redirect().
 */
enum type {
	UDP = 0,
	TCP = 1,
	__NR_KIND = 2,	/* number of protocols above; used as an array bound */
};
|
||||
|
||||
/* Whether the encapsulation header of a test packet carries a next hop. */
enum hops {
	NO_HOPS = 0,
	ONE_HOP = 1,
};
|
||||
|
||||
/* TCP flag to set on the inner packet of a test case (build_input()
 * only looks at this for TCP tests).
 */
enum flags {
	NONE = 0,
	SYN = 1,
	ACK = 2,
};
|
||||
|
||||
/* Whether the inner packet uses the source port of the connection set up
 * by the test (KNOWN_CONN) or a modified one (UNKNOWN_CONN).
 */
enum conn {
	KNOWN_CONN = 0,
	UNKNOWN_CONN = 1,
};
|
||||
|
||||
/* Expected outcome of a test case: ACCEPT expects the output packet to be
 * decapsulated, FORWARD expects it not to be.
 */
enum result {
	ACCEPT = 0,
	FORWARD = 1,
};
|
||||
|
||||
struct test_cfg {
|
||||
enum type type;
|
||||
enum result result;
|
||||
enum conn conn;
|
||||
enum hops hops;
|
||||
enum flags flags;
|
||||
};
|
||||
|
||||
static int test_str(void *buf, size_t len, const struct test_cfg *test,
|
||||
int family)
|
||||
{
|
||||
const char *family_str, *type, *conn, *hops, *result, *flags;
|
||||
|
||||
family_str = "IPv4";
|
||||
if (family == AF_INET6)
|
||||
family_str = "IPv6";
|
||||
|
||||
type = "TCP";
|
||||
if (test->type == UDP)
|
||||
type = "UDP";
|
||||
|
||||
conn = "known";
|
||||
if (test->conn == UNKNOWN_CONN)
|
||||
conn = "unknown";
|
||||
|
||||
hops = "no hops";
|
||||
if (test->hops == ONE_HOP)
|
||||
hops = "one hop";
|
||||
|
||||
result = "accept";
|
||||
if (test->result == FORWARD)
|
||||
result = "forward";
|
||||
|
||||
flags = "none";
|
||||
if (test->flags == SYN)
|
||||
flags = "SYN";
|
||||
else if (test->flags == ACK)
|
||||
flags = "ACK";
|
||||
|
||||
return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str,
|
||||
type, result, conn, hops, flags);
|
||||
}
|
||||
|
||||
static struct test_cfg tests[] = {
|
||||
{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN },
|
||||
{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK },
|
||||
{ TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK },
|
||||
{ TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK },
|
||||
{ UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE },
|
||||
{ UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE },
|
||||
{ UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE },
|
||||
};
|
||||
|
||||
/* Initialise the outer (encapsulation) headers of a test packet.
 *
 * All fields not mentioned below are zero. The outer packet is IPv4/UDP
 * addressed to ENCAP_IP:ENCAP_PORT; @proto is stored as the GUE payload
 * protocol and @hop_count as the number of next hops.
 */
static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
{
	/* Size of struct guehdr in 32-bit words, plus one word per hop. */
	const uint8_t words =
		(sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count;

	memset(encap, 0, sizeof(*encap));

	encap->eth.h_proto = htons(ETH_P_IP);

	encap->ip.ihl = 5;
	encap->ip.version = 4;
	encap->ip.ttl = IPDEFTTL;
	encap->ip.protocol = IPPROTO_UDP;
	encap->ip.daddr = htonl(ENCAP_IP);

	encap->udp.dest = htons(ENCAP_PORT);

	encap->gue.hlen = words;
	encap->gue.proto_ctype = proto;

	encap->unigue.hop_count = hop_count;
}
|
||||
|
||||
static size_t build_input(const struct test_cfg *test, void *const buf,
|
||||
const struct tuple *tuple)
|
||||
{
|
||||
in_port_t sport = tuple->src.port;
|
||||
encap_headers_t encap;
|
||||
struct iphdr ip;
|
||||
struct ipv6hdr ipv6;
|
||||
struct tcphdr tcp;
|
||||
struct udphdr udp;
|
||||
struct in_addr next_hop;
|
||||
uint8_t *p = buf;
|
||||
int proto;
|
||||
|
||||
proto = IPPROTO_IPIP;
|
||||
if (tuple->family == AF_INET6)
|
||||
proto = IPPROTO_IPV6;
|
||||
|
||||
encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
|
||||
p = mempcpy(p, &encap, sizeof(encap));
|
||||
|
||||
if (test->hops == ONE_HOP) {
|
||||
next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) };
|
||||
p = mempcpy(p, &next_hop, sizeof(next_hop));
|
||||
}
|
||||
|
||||
proto = IPPROTO_TCP;
|
||||
if (test->type == UDP)
|
||||
proto = IPPROTO_UDP;
|
||||
|
||||
switch (tuple->family) {
|
||||
case AF_INET:
|
||||
ip = (struct iphdr){
|
||||
.ihl = 5,
|
||||
.version = 4,
|
||||
.ttl = IPDEFTTL,
|
||||
.protocol = proto,
|
||||
.saddr = tuple->src.in_addr.s_addr,
|
||||
.daddr = tuple->dst.in_addr.s_addr,
|
||||
};
|
||||
p = mempcpy(p, &ip, sizeof(ip));
|
||||
break;
|
||||
case AF_INET6:
|
||||
ipv6 = (struct ipv6hdr){
|
||||
.version = 6,
|
||||
.hop_limit = IPDEFTTL,
|
||||
.nexthdr = proto,
|
||||
.saddr = tuple->src.in6_addr,
|
||||
.daddr = tuple->dst.in6_addr,
|
||||
};
|
||||
p = mempcpy(p, &ipv6, sizeof(ipv6));
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (test->conn == UNKNOWN_CONN)
|
||||
sport--;
|
||||
|
||||
switch (test->type) {
|
||||
case TCP:
|
||||
tcp = (struct tcphdr){
|
||||
.source = sport,
|
||||
.dest = tuple->dst.port,
|
||||
};
|
||||
if (test->flags == SYN)
|
||||
tcp.syn = true;
|
||||
if (test->flags == ACK)
|
||||
tcp.ack = true;
|
||||
p = mempcpy(p, &tcp, sizeof(tcp));
|
||||
break;
|
||||
case UDP:
|
||||
udp = (struct udphdr){
|
||||
.source = sport,
|
||||
.dest = tuple->dst.port,
|
||||
};
|
||||
p = mempcpy(p, &udp, sizeof(udp));
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (void *)p - buf;
|
||||
}
|
||||
|
||||
/* Close every strictly positive descriptor among the first @n entries
 * of @fds.
 *
 * Entries that are 0 or negative are skipped: test_cls_redirect()
 * zero-initialises its fd arrays and set_up_conn() stores -1 on failure.
 */
static void close_fds(int *fds, int n)
{
	int idx = 0;

	while (idx < n) {
		int fd = fds[idx++];

		if (fd <= 0)
			continue;

		close(fd);
	}
}
|
||||
|
||||
void test_cls_redirect(void)
|
||||
{
|
||||
struct test_cls_redirect *skel = NULL;
|
||||
struct bpf_prog_test_run_attr tattr = {};
|
||||
int families[] = { AF_INET, AF_INET6 };
|
||||
struct sockaddr_storage ss;
|
||||
struct sockaddr *addr;
|
||||
socklen_t slen;
|
||||
int i, j, err;
|
||||
|
||||
int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
|
||||
int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
|
||||
struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
|
||||
|
||||
skel = test_cls_redirect__open();
|
||||
if (CHECK_FAIL(!skel))
|
||||
return;
|
||||
|
||||
skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
|
||||
skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
|
||||
|
||||
if (CHECK_FAIL(test_cls_redirect__load(skel)))
|
||||
goto cleanup;
|
||||
|
||||
addr = (struct sockaddr *)&ss;
|
||||
for (i = 0; i < ARRAY_SIZE(families); i++) {
|
||||
slen = prepare_addr(&ss, families[i]);
|
||||
if (CHECK_FAIL(!slen))
|
||||
goto cleanup;
|
||||
|
||||
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
|
||||
&servers[UDP][i], &conns[UDP][i],
|
||||
&tuples[UDP][i])))
|
||||
goto cleanup;
|
||||
|
||||
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
|
||||
&servers[TCP][i], &conns[TCP][i],
|
||||
&tuples[TCP][i])))
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
|
||||
for (i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
struct test_cfg *test = &tests[i];
|
||||
|
||||
for (j = 0; j < ARRAY_SIZE(families); j++) {
|
||||
struct tuple *tuple = &tuples[test->type][j];
|
||||
char input[256];
|
||||
char tmp[256];
|
||||
|
||||
test_str(tmp, sizeof(tmp), test, tuple->family);
|
||||
if (!test__start_subtest(tmp))
|
||||
continue;
|
||||
|
||||
tattr.data_out = tmp;
|
||||
tattr.data_size_out = sizeof(tmp);
|
||||
|
||||
tattr.data_in = input;
|
||||
tattr.data_size_in = build_input(test, input, tuple);
|
||||
if (CHECK_FAIL(!tattr.data_size_in))
|
||||
continue;
|
||||
|
||||
err = bpf_prog_test_run_xattr(&tattr);
|
||||
if (CHECK_FAIL(err))
|
||||
continue;
|
||||
|
||||
if (tattr.retval != TC_ACT_REDIRECT) {
|
||||
PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n",
|
||||
tattr.retval);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (test->result) {
|
||||
case ACCEPT:
|
||||
if (CHECK_FAIL(!was_decapsulated(&tattr)))
|
||||
continue;
|
||||
break;
|
||||
case FORWARD:
|
||||
if (CHECK_FAIL(was_decapsulated(&tattr)))
|
||||
continue;
|
||||
break;
|
||||
default:
|
||||
PRINT_FAIL("unknown result %d\n", test->result);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
test_cls_redirect__destroy(skel);
|
||||
close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
|
||||
close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,54 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
/* Copyright 2019, 2020 Cloudflare */
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/udp.h>
|
||||
|
||||
/* GRE base header: a 16-bit flags word followed by a 16-bit protocol
 * field, packed.
 */
struct gre_base_hdr {
	uint16_t flags;
	uint16_t protocol;
} __attribute__((packed));
|
||||
|
||||
/* GUE header. The first byte packs three bit-fields (hlen: 5 bits,
 * control: 1 bit, variant: 2 bits) whose declaration order depends on
 * the target's byte order.
 */
struct guehdr {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t hlen : 5;
	uint8_t control : 1;
	uint8_t variant : 2;
#else
	uint8_t variant : 2;
	uint8_t control : 1;
	uint8_t hlen : 5;
#endif
	uint8_t proto_ctype;
	uint16_t flags;
};
|
||||
|
||||
/* Fixed part of the private data that follows the GUE header. The first
 * byte packs a 4-bit version, the forward_syn and last_hop_gre flags
 * and 2 reserved bits, declared in byte-order dependent order.
 */
struct unigue {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t _r : 2;
	uint8_t last_hop_gre : 1;
	uint8_t forward_syn : 1;
	uint8_t version : 4;
#else
	uint8_t version : 4;
	uint8_t forward_syn : 1;
	uint8_t last_hop_gre : 1;
	uint8_t _r : 2;
#endif
	uint8_t reserved;
	uint8_t next_hop;
	uint8_t hop_count;
	/* Next hops go here */
} __attribute__((packed));
|
||||
|
||||
typedef struct {
|
||||
struct ethhdr eth;
|
||||
struct iphdr ip;
|
||||
struct gre_base_hdr gre;
|
||||
} __attribute__((packed)) encap_gre_t;
|
||||
|
||||
typedef struct {
|
||||
struct ethhdr eth;
|
||||
struct iphdr ip;
|
||||
struct udphdr udp;
|
||||
struct guehdr gue;
|
||||
struct unigue unigue;
|
||||
} __attribute__((packed)) encap_headers_t;
|
|
@ -105,6 +105,13 @@ struct ipv6_packet {
|
|||
} __packed;
|
||||
extern struct ipv6_packet pkt_v6;
|
||||
|
||||
/* Call test__fail(), then print a printf-style message to stdout
 * prefixed with the calling function name and line number. Evaluates
 * to the return value of the final fprintf().
 */
#define PRINT_FAIL(format...)                                                  \
	({                                                                     \
		test__fail();                                                  \
		fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__);            \
		fprintf(stdout, ##format);                                     \
	})
|
||||
|
||||
#define _CHECK(condition, tag, duration, format...) ({ \
|
||||
int __ret = !!(condition); \
|
||||
int __save_errno = errno; \
|
||||
|
|
Loading…
Reference in New Issue