selftests/bpf: Add cls_redirect classifier

cls_redirect is a TC clsact based replacement for the glb-redirect iptables
module available at [1]. It enables what GitHub calls "second chance"
flows [2], similarly proposed by the Beamer paper [3]. In contrast to
glb-redirect, it also supports migrating UDP flows as long as connected
sockets are used. cls_redirect is in production at Cloudflare, as part of
our own L4 load balancer.

We have modified the encapsulation format slightly from glb-redirect:
glbgue_chained_routing.private_data_type has been repurposed to form a
version field and several flags. Both have been arranged in a way that
a private_data_type value of zero matches the current glb-redirect
behaviour. This means that cls_redirect will understand packets in
glb-redirect format, but not vice versa.

The test suite only covers basic features. For example, cls_redirect will
correctly forward path MTU discovery packets, but this is not exercised.
It is also possible to switch the encapsulation format to GRE on the last
hop, which is also not tested.

There are two major distinctions from glb-redirect: first, cls_redirect
relies on receiving encapsulated packets directly from a router. This is
because we don't have access to the neighbour tables from BPF, yet. See
forward_to_next_hop for details. Second, cls_redirect performs decapsulation
instead of using separate ipip and sit tunnel devices. This
avoids issues with the sit tunnel [4] and makes deploying the classifier
easier: decapsulated packets appear on the same interface, so existing
firewall rules continue to work as expected.

The code base started it's life on v4.19, so there are most likely still
hold overs from old workarounds. In no particular order:

- The function buf_off is required to defeat a clang optimization
  that leads to the verifier rejecting the program due to pointer
  arithmetic in the wrong order.

- The function pkt_parse_ipv6 is force inlined, because it would
  otherwise be rejected due to returning a pointer to stack memory.

- The functions fill_tuple and classify_tcp contain kludges, because
  we've run out of function arguments.

- The logic in general is rather nested, due to verifier restrictions.
  I think this is either because the verifier loses track of constants
  on the stack, or because it can't track enum like variables.

1: https://github.com/github/glb-director/tree/master/src/glb-redirect
2: https://github.com/github/glb-director/blob/master/docs/development/second-chance-design.md
3: https://www.usenix.org/conference/nsdi18/presentation/olteanu
4: https://github.com/github/glb-director/issues/64

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200424185556.7358-2-lmb@cloudflare.com
This commit is contained in:
Lorenz Bauer 2020-04-24 19:55:55 +01:00 committed by Alexei Starovoitov
parent 6f8a57ccf8
commit 234589012b
4 changed files with 1575 additions and 0 deletions

View File

@ -0,0 +1,456 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2020 Cloudflare
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <test_progs.h>
#include "progs/test_cls_redirect.h"
#include "test_cls_redirect.skel.h"
#define ENCAP_IP INADDR_LOOPBACK
#define ENCAP_PORT (1234)
struct addr_port {
in_port_t port;
union {
struct in_addr in_addr;
struct in6_addr in6_addr;
};
};
struct tuple {
int family;
struct addr_port src;
struct addr_port dst;
};
static int start_server(const struct sockaddr *addr, socklen_t len, int type)
{
int fd = socket(addr->sa_family, type, 0);
if (CHECK_FAIL(fd == -1))
return -1;
if (CHECK_FAIL(bind(fd, addr, len) == -1))
goto err;
if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
goto err;
return fd;
err:
close(fd);
return -1;
}
static int connect_to_server(const struct sockaddr *addr, socklen_t len,
int type)
{
int fd = socket(addr->sa_family, type, 0);
if (CHECK_FAIL(fd == -1))
return -1;
if (CHECK_FAIL(connect(fd, addr, len)))
goto err;
return fd;
err:
close(fd);
return -1;
}
static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
{
const struct sockaddr_in6 *in6;
const struct sockaddr_in *in;
switch (sa->sa_family) {
case AF_INET:
in = (const struct sockaddr_in *)sa;
ap->in_addr = in->sin_addr;
ap->port = in->sin_port;
return true;
case AF_INET6:
in6 = (const struct sockaddr_in6 *)sa;
ap->in6_addr = in6->sin6_addr;
ap->port = in6->sin6_port;
return true;
default:
return false;
}
}
static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
int *server, int *conn, struct tuple *tuple)
{
struct sockaddr_storage ss;
socklen_t slen = sizeof(ss);
struct sockaddr *sa = (struct sockaddr *)&ss;
*server = start_server(addr, len, type);
if (*server < 0)
return false;
if (CHECK_FAIL(getsockname(*server, sa, &slen)))
goto close_server;
*conn = connect_to_server(sa, slen, type);
if (*conn < 0)
goto close_server;
/* We want to simulate packets arriving at conn, so we have to
* swap src and dst.
*/
slen = sizeof(ss);
if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
goto close_conn;
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
goto close_conn;
slen = sizeof(ss);
if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
goto close_conn;
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
goto close_conn;
tuple->family = ss.ss_family;
return true;
close_conn:
close(*conn);
*conn = -1;
close_server:
close(*server);
*server = -1;
return false;
}
static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
{
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
switch (family) {
case AF_INET:
addr4 = (struct sockaddr_in *)addr;
memset(addr4, 0, sizeof(*addr4));
addr4->sin_family = family;
addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
return sizeof(*addr4);
case AF_INET6:
addr6 = (struct sockaddr_in6 *)addr;
memset(addr6, 0, sizeof(*addr6));
addr6->sin6_family = family;
addr6->sin6_addr = in6addr_loopback;
return sizeof(*addr6);
default:
fprintf(stderr, "Invalid family %d", family);
return 0;
}
}
static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
{
return tattr->data_size_out < tattr->data_size_in;
}
enum type {
UDP,
TCP,
__NR_KIND,
};
enum hops {
NO_HOPS,
ONE_HOP,
};
enum flags {
NONE,
SYN,
ACK,
};
enum conn {
KNOWN_CONN,
UNKNOWN_CONN,
};
enum result {
ACCEPT,
FORWARD,
};
struct test_cfg {
enum type type;
enum result result;
enum conn conn;
enum hops hops;
enum flags flags;
};
static int test_str(void *buf, size_t len, const struct test_cfg *test,
int family)
{
const char *family_str, *type, *conn, *hops, *result, *flags;
family_str = "IPv4";
if (family == AF_INET6)
family_str = "IPv6";
type = "TCP";
if (test->type == UDP)
type = "UDP";
conn = "known";
if (test->conn == UNKNOWN_CONN)
conn = "unknown";
hops = "no hops";
if (test->hops == ONE_HOP)
hops = "one hop";
result = "accept";
if (test->result == FORWARD)
result = "forward";
flags = "none";
if (test->flags == SYN)
flags = "SYN";
else if (test->flags == ACK)
flags = "ACK";
return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str,
type, result, conn, hops, flags);
}
static struct test_cfg tests[] = {
{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN },
{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK },
{ TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK },
{ TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK },
{ UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE },
{ UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE },
{ UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE },
};
static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
{
const uint8_t hlen =
(sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count;
*encap = (encap_headers_t){
.eth = { .h_proto = htons(ETH_P_IP) },
.ip = {
.ihl = 5,
.version = 4,
.ttl = IPDEFTTL,
.protocol = IPPROTO_UDP,
.daddr = htonl(ENCAP_IP)
},
.udp = {
.dest = htons(ENCAP_PORT),
},
.gue = {
.hlen = hlen,
.proto_ctype = proto
},
.unigue = {
.hop_count = hop_count
},
};
}
static size_t build_input(const struct test_cfg *test, void *const buf,
const struct tuple *tuple)
{
in_port_t sport = tuple->src.port;
encap_headers_t encap;
struct iphdr ip;
struct ipv6hdr ipv6;
struct tcphdr tcp;
struct udphdr udp;
struct in_addr next_hop;
uint8_t *p = buf;
int proto;
proto = IPPROTO_IPIP;
if (tuple->family == AF_INET6)
proto = IPPROTO_IPV6;
encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
p = mempcpy(p, &encap, sizeof(encap));
if (test->hops == ONE_HOP) {
next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) };
p = mempcpy(p, &next_hop, sizeof(next_hop));
}
proto = IPPROTO_TCP;
if (test->type == UDP)
proto = IPPROTO_UDP;
switch (tuple->family) {
case AF_INET:
ip = (struct iphdr){
.ihl = 5,
.version = 4,
.ttl = IPDEFTTL,
.protocol = proto,
.saddr = tuple->src.in_addr.s_addr,
.daddr = tuple->dst.in_addr.s_addr,
};
p = mempcpy(p, &ip, sizeof(ip));
break;
case AF_INET6:
ipv6 = (struct ipv6hdr){
.version = 6,
.hop_limit = IPDEFTTL,
.nexthdr = proto,
.saddr = tuple->src.in6_addr,
.daddr = tuple->dst.in6_addr,
};
p = mempcpy(p, &ipv6, sizeof(ipv6));
break;
default:
return 0;
}
if (test->conn == UNKNOWN_CONN)
sport--;
switch (test->type) {
case TCP:
tcp = (struct tcphdr){
.source = sport,
.dest = tuple->dst.port,
};
if (test->flags == SYN)
tcp.syn = true;
if (test->flags == ACK)
tcp.ack = true;
p = mempcpy(p, &tcp, sizeof(tcp));
break;
case UDP:
udp = (struct udphdr){
.source = sport,
.dest = tuple->dst.port,
};
p = mempcpy(p, &udp, sizeof(udp));
break;
default:
return 0;
}
return (void *)p - buf;
}
static void close_fds(int *fds, int n)
{
int i;
for (i = 0; i < n; i++)
if (fds[i] > 0)
close(fds[i]);
}
void test_cls_redirect(void)
{
struct test_cls_redirect *skel = NULL;
struct bpf_prog_test_run_attr tattr = {};
int families[] = { AF_INET, AF_INET6 };
struct sockaddr_storage ss;
struct sockaddr *addr;
socklen_t slen;
int i, j, err;
int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
skel = test_cls_redirect__open();
if (CHECK_FAIL(!skel))
return;
skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
if (CHECK_FAIL(test_cls_redirect__load(skel)))
goto cleanup;
addr = (struct sockaddr *)&ss;
for (i = 0; i < ARRAY_SIZE(families); i++) {
slen = prepare_addr(&ss, families[i]);
if (CHECK_FAIL(!slen))
goto cleanup;
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
&servers[UDP][i], &conns[UDP][i],
&tuples[UDP][i])))
goto cleanup;
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
&servers[TCP][i], &conns[TCP][i],
&tuples[TCP][i])))
goto cleanup;
}
tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct test_cfg *test = &tests[i];
for (j = 0; j < ARRAY_SIZE(families); j++) {
struct tuple *tuple = &tuples[test->type][j];
char input[256];
char tmp[256];
test_str(tmp, sizeof(tmp), test, tuple->family);
if (!test__start_subtest(tmp))
continue;
tattr.data_out = tmp;
tattr.data_size_out = sizeof(tmp);
tattr.data_in = input;
tattr.data_size_in = build_input(test, input, tuple);
if (CHECK_FAIL(!tattr.data_size_in))
continue;
err = bpf_prog_test_run_xattr(&tattr);
if (CHECK_FAIL(err))
continue;
if (tattr.retval != TC_ACT_REDIRECT) {
PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n",
tattr.retval);
continue;
}
switch (test->result) {
case ACCEPT:
if (CHECK_FAIL(!was_decapsulated(&tattr)))
continue;
break;
case FORWARD:
if (CHECK_FAIL(was_decapsulated(&tattr)))
continue;
break;
default:
PRINT_FAIL("unknown result %d\n", test->result);
continue;
}
}
}
cleanup:
test_cls_redirect__destroy(skel);
close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,54 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/* Copyright 2019, 2020 Cloudflare */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
struct gre_base_hdr {
uint16_t flags;
uint16_t protocol;
} __attribute__((packed));
struct guehdr {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
uint8_t hlen : 5, control : 1, variant : 2;
#else
uint8_t variant : 2, control : 1, hlen : 5;
#endif
uint8_t proto_ctype;
uint16_t flags;
};
struct unigue {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4;
#else
uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2;
#endif
uint8_t reserved;
uint8_t next_hop;
uint8_t hop_count;
// Next hops go here
} __attribute__((packed));
typedef struct {
struct ethhdr eth;
struct iphdr ip;
struct gre_base_hdr gre;
} __attribute__((packed)) encap_gre_t;
typedef struct {
struct ethhdr eth;
struct iphdr ip;
struct udphdr udp;
struct guehdr gue;
struct unigue unigue;
} __attribute__((packed)) encap_headers_t;

View File

@ -105,6 +105,13 @@ struct ipv6_packet {
} __packed;
extern struct ipv6_packet pkt_v6;
#define PRINT_FAIL(format...) \
({ \
test__fail(); \
fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \
fprintf(stdout, ##format); \
})
#define _CHECK(condition, tag, duration, format...) ({ \
int __ret = !!(condition); \
int __save_errno = errno; \