Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Daniel Borkmann says: ==================== pull-request: bpf 2021-05-11 The following pull-request contains BPF updates for your *net* tree. We've added 13 non-merge commits during the last 8 day(s) which contain a total of 21 files changed, 817 insertions(+), 382 deletions(-). The main changes are: 1) Fix multiple ringbuf bugs in particular to prevent writable mmap of read-only pages, from Andrii Nakryiko & Thadeu Lima de Souza Cascardo. 2) Fix verifier alu32 known-const subregister bound tracking for bitwise operations and/or/xor, from Daniel Borkmann. 3) Reject trampoline attachment for functions with variable arguments, and also add a deny list of other forbidden functions, from Jiri Olsa. 4) Fix nested bpf_bprintf_prepare() calls used by various helpers by switching to per-CPU buffers, from Florent Revest. 5) Fix kernel compilation with BTF debug info on ppc64 due to pahole missing TCP-CC functions like cubictcp_init, from Martin KaFai Lau. 6) Add a kconfig entry to provide an option to disallow unprivileged BPF by default, from Daniel Borkmann. 7) Fix libbpf compilation for older libelf when GELF_ST_VISIBILITY() macro is not available, from Arnaldo Carvalho de Melo. 8) Migrate test_tc_redirect to test_progs framework as prep work for upcoming skb_change_head() fix & selftest, from Jussi Maki. 9) Fix a libbpf segfault in add_dummy_ksym_var() if BTF is not present, from Ian Rogers. 10) Fix tx_only micro-benchmark in xdpsock BPF sample with proper frame size, from Magnus Karlsson. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
df6f823703
|
@ -1457,11 +1457,22 @@ unprivileged_bpf_disabled
|
|||
=========================
|
||||
|
||||
Writing 1 to this entry will disable unprivileged calls to ``bpf()``;
|
||||
once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return
|
||||
``-EPERM``.
|
||||
once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` or ``CAP_BPF``
|
||||
will return ``-EPERM``. Once set to 1, this can't be cleared from the
|
||||
running kernel anymore.
|
||||
|
||||
Once set, this can't be cleared.
|
||||
Writing 2 to this entry will also disable unprivileged calls to ``bpf()``,
|
||||
however, an admin can still change this setting later on, if needed, by
|
||||
writing 0 or 1 to this entry.
|
||||
|
||||
If ``BPF_UNPRIV_DEFAULT_OFF`` is enabled in the kernel config, then this
|
||||
entry will default to 2 instead of 0.
|
||||
|
||||
= =============================================================
|
||||
0 Unprivileged calls to ``bpf()`` are enabled
|
||||
1 Unprivileged calls to ``bpf()`` are disabled without recovery
|
||||
2 Unprivileged calls to ``bpf()`` are disabled
|
||||
= =============================================================
|
||||
|
||||
watchdog
|
||||
========
|
||||
|
|
41
init/Kconfig
41
init/Kconfig
|
@ -442,6 +442,7 @@ config AUDITSYSCALL
|
|||
|
||||
source "kernel/irq/Kconfig"
|
||||
source "kernel/time/Kconfig"
|
||||
source "kernel/bpf/Kconfig"
|
||||
source "kernel/Kconfig.preempt"
|
||||
|
||||
menu "CPU/Task time and stats accounting"
|
||||
|
@ -1713,46 +1714,6 @@ config KALLSYMS_BASE_RELATIVE
|
|||
|
||||
# syscall, maps, verifier
|
||||
|
||||
config BPF_LSM
|
||||
bool "LSM Instrumentation with BPF"
|
||||
depends on BPF_EVENTS
|
||||
depends on BPF_SYSCALL
|
||||
depends on SECURITY
|
||||
depends on BPF_JIT
|
||||
help
|
||||
Enables instrumentation of the security hooks with eBPF programs for
|
||||
implementing dynamic MAC and Audit Policies.
|
||||
|
||||
If you are unsure how to answer this question, answer N.
|
||||
|
||||
config BPF_SYSCALL
|
||||
bool "Enable bpf() system call"
|
||||
select BPF
|
||||
select IRQ_WORK
|
||||
select TASKS_TRACE_RCU
|
||||
select BINARY_PRINTF
|
||||
select NET_SOCK_MSG if INET
|
||||
default n
|
||||
help
|
||||
Enable the bpf() system call that allows to manipulate eBPF
|
||||
programs and maps via file descriptors.
|
||||
|
||||
config ARCH_WANT_DEFAULT_BPF_JIT
|
||||
bool
|
||||
|
||||
config BPF_JIT_ALWAYS_ON
|
||||
bool "Permanently enable BPF JIT and remove BPF interpreter"
|
||||
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
|
||||
help
|
||||
Enables BPF JIT and removes BPF interpreter to avoid
|
||||
speculative execution of BPF instructions by the interpreter
|
||||
|
||||
config BPF_JIT_DEFAULT_ON
|
||||
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
|
||||
depends on HAVE_EBPF_JIT && BPF_JIT
|
||||
|
||||
source "kernel/bpf/preload/Kconfig"
|
||||
|
||||
config USERFAULTFD
|
||||
bool "Enable userfaultfd() system call"
|
||||
depends on MMU
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
|
||||
# BPF interpreter that, for example, classic socket filters depend on.
|
||||
config BPF
|
||||
bool
|
||||
|
||||
# Used by archs to tell that they support BPF JIT compiler plus which
|
||||
# flavour. Only one of the two can be selected for a specific arch since
|
||||
# eBPF JIT supersedes the cBPF JIT.
|
||||
|
||||
# Classic BPF JIT (cBPF)
|
||||
config HAVE_CBPF_JIT
|
||||
bool
|
||||
|
||||
# Extended BPF JIT (eBPF)
|
||||
config HAVE_EBPF_JIT
|
||||
bool
|
||||
|
||||
# Used by archs to tell that they want the BPF JIT compiler enabled by
|
||||
# default for kernels that were compiled with BPF JIT support.
|
||||
config ARCH_WANT_DEFAULT_BPF_JIT
|
||||
bool
|
||||
|
||||
menu "BPF subsystem"
|
||||
|
||||
config BPF_SYSCALL
|
||||
bool "Enable bpf() system call"
|
||||
select BPF
|
||||
select IRQ_WORK
|
||||
select TASKS_TRACE_RCU
|
||||
select BINARY_PRINTF
|
||||
select NET_SOCK_MSG if INET
|
||||
default n
|
||||
help
|
||||
Enable the bpf() system call that allows to manipulate BPF programs
|
||||
and maps via file descriptors.
|
||||
|
||||
config BPF_JIT
|
||||
bool "Enable BPF Just In Time compiler"
|
||||
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
|
||||
depends on MODULES
|
||||
help
|
||||
BPF programs are normally handled by a BPF interpreter. This option
|
||||
allows the kernel to generate native code when a program is loaded
|
||||
into the kernel. This will significantly speed-up processing of BPF
|
||||
programs.
|
||||
|
||||
Note, an admin should enable this feature by changing:
|
||||
/proc/sys/net/core/bpf_jit_enable
|
||||
/proc/sys/net/core/bpf_jit_harden (optional)
|
||||
/proc/sys/net/core/bpf_jit_kallsyms (optional)
|
||||
|
||||
config BPF_JIT_ALWAYS_ON
|
||||
bool "Permanently enable BPF JIT and remove BPF interpreter"
|
||||
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
|
||||
help
|
||||
Enables BPF JIT and removes BPF interpreter to avoid speculative
|
||||
execution of BPF instructions by the interpreter.
|
||||
|
||||
config BPF_JIT_DEFAULT_ON
|
||||
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
|
||||
depends on HAVE_EBPF_JIT && BPF_JIT
|
||||
|
||||
config BPF_UNPRIV_DEFAULT_OFF
|
||||
bool "Disable unprivileged BPF by default"
|
||||
depends on BPF_SYSCALL
|
||||
help
|
||||
Disables unprivileged BPF by default by setting the corresponding
|
||||
/proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
|
||||
still reenable it by setting it to 0 later on, or permanently
|
||||
disable it by setting it to 1 (from which no other transition to
|
||||
0 is possible anymore).
|
||||
|
||||
source "kernel/bpf/preload/Kconfig"
|
||||
|
||||
config BPF_LSM
|
||||
bool "Enable BPF LSM Instrumentation"
|
||||
depends on BPF_EVENTS
|
||||
depends on BPF_SYSCALL
|
||||
depends on SECURITY
|
||||
depends on BPF_JIT
|
||||
help
|
||||
Enables instrumentation of the security hooks with BPF programs for
|
||||
implementing dynamic MAC and Audit Policies.
|
||||
|
||||
If you are unsure how to answer this question, answer N.
|
||||
|
||||
endmenu # "BPF subsystem"
|
|
@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
|
|||
m->ret_size = ret;
|
||||
|
||||
for (i = 0; i < nargs; i++) {
|
||||
if (i == nargs - 1 && args[i].type == 0) {
|
||||
bpf_log(log,
|
||||
"The function %s with variable args is unsupported.\n",
|
||||
tname);
|
||||
return -EINVAL;
|
||||
}
|
||||
ret = __get_type_size(btf, args[i].type, &t);
|
||||
if (ret < 0) {
|
||||
bpf_log(log,
|
||||
|
@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
|
|||
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (ret == 0) {
|
||||
bpf_log(log,
|
||||
"The function %s has malformed void argument.\n",
|
||||
tname);
|
||||
return -EINVAL;
|
||||
}
|
||||
m->arg_size[i] = ret;
|
||||
}
|
||||
m->nr_args = nargs;
|
||||
|
|
|
@ -696,34 +696,35 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
|
|||
*/
|
||||
#define MAX_PRINTF_BUF_LEN 512
|
||||
|
||||
struct bpf_printf_buf {
|
||||
char tmp_buf[MAX_PRINTF_BUF_LEN];
|
||||
/* Support executing three nested bprintf helper calls on a given CPU */
|
||||
struct bpf_bprintf_buffers {
|
||||
char tmp_bufs[3][MAX_PRINTF_BUF_LEN];
|
||||
};
|
||||
static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
|
||||
static DEFINE_PER_CPU(int, bpf_printf_buf_used);
|
||||
static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
|
||||
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
|
||||
|
||||
static int try_get_fmt_tmp_buf(char **tmp_buf)
|
||||
{
|
||||
struct bpf_printf_buf *bufs;
|
||||
int used;
|
||||
struct bpf_bprintf_buffers *bufs;
|
||||
int nest_level;
|
||||
|
||||
preempt_disable();
|
||||
used = this_cpu_inc_return(bpf_printf_buf_used);
|
||||
if (WARN_ON_ONCE(used > 1)) {
|
||||
this_cpu_dec(bpf_printf_buf_used);
|
||||
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
|
||||
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) {
|
||||
this_cpu_dec(bpf_bprintf_nest_level);
|
||||
preempt_enable();
|
||||
return -EBUSY;
|
||||
}
|
||||
bufs = this_cpu_ptr(&bpf_printf_buf);
|
||||
*tmp_buf = bufs->tmp_buf;
|
||||
bufs = this_cpu_ptr(&bpf_bprintf_bufs);
|
||||
*tmp_buf = bufs->tmp_bufs[nest_level - 1];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_bprintf_cleanup(void)
|
||||
{
|
||||
if (this_cpu_read(bpf_printf_buf_used)) {
|
||||
this_cpu_dec(bpf_printf_buf_used);
|
||||
if (this_cpu_read(bpf_bprintf_nest_level)) {
|
||||
this_cpu_dec(bpf_bprintf_nest_level);
|
||||
preempt_enable();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
|
|||
return -ENOTSUPP;
|
||||
}
|
||||
|
||||
static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
|
||||
{
|
||||
size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
|
||||
|
||||
/* consumer page + producer page + 2 x data pages */
|
||||
return RINGBUF_POS_PAGES + 2 * data_pages;
|
||||
}
|
||||
|
||||
static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
|
||||
{
|
||||
struct bpf_ringbuf_map *rb_map;
|
||||
size_t mmap_sz;
|
||||
|
||||
rb_map = container_of(map, struct bpf_ringbuf_map, map);
|
||||
mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
|
||||
|
||||
if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
|
||||
return -EINVAL;
|
||||
|
||||
if (vma->vm_flags & VM_WRITE) {
|
||||
/* allow writable mapping for the consumer_pos only */
|
||||
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
|
||||
return -EPERM;
|
||||
} else {
|
||||
vma->vm_flags &= ~VM_MAYWRITE;
|
||||
}
|
||||
/* remap_vmalloc_range() checks size and offset constraints */
|
||||
return remap_vmalloc_range(vma, rb_map->rb,
|
||||
vma->vm_pgoff + RINGBUF_PGOFF);
|
||||
}
|
||||
|
@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
|
|||
return NULL;
|
||||
|
||||
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
|
||||
if (len > rb->mask + 1)
|
||||
return NULL;
|
||||
|
||||
cons_pos = smp_load_acquire(&rb->consumer_pos);
|
||||
|
||||
if (in_nmi()) {
|
||||
|
|
|
@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock);
|
|||
static DEFINE_IDR(link_idr);
|
||||
static DEFINE_SPINLOCK(link_idr_lock);
|
||||
|
||||
int sysctl_unprivileged_bpf_disabled __read_mostly;
|
||||
int sysctl_unprivileged_bpf_disabled __read_mostly =
|
||||
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
|
||||
|
||||
static const struct bpf_map_ops * const bpf_map_types[] = {
|
||||
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
|
||||
|
|
|
@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
|
|||
s32 smin_val = src_reg->s32_min_value;
|
||||
u32 umax_val = src_reg->u32_max_value;
|
||||
|
||||
/* Assuming scalar64_min_max_and will be called so its safe
|
||||
* to skip updating register for known 32-bit case.
|
||||
*/
|
||||
if (src_known && dst_known)
|
||||
if (src_known && dst_known) {
|
||||
__mark_reg32_known(dst_reg, var32_off.value);
|
||||
return;
|
||||
}
|
||||
|
||||
/* We get our minimum from the var_off, since that's inherently
|
||||
* bitwise. Our maximum is the minimum of the operands' maxima.
|
||||
|
@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
|
|||
dst_reg->s32_min_value = dst_reg->u32_min_value;
|
||||
dst_reg->s32_max_value = dst_reg->u32_max_value;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
|
||||
|
@ -7155,11 +7153,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
|
|||
s32 smin_val = src_reg->s32_min_value;
|
||||
u32 umin_val = src_reg->u32_min_value;
|
||||
|
||||
/* Assuming scalar64_min_max_or will be called so it is safe
|
||||
* to skip updating register for known case.
|
||||
*/
|
||||
if (src_known && dst_known)
|
||||
if (src_known && dst_known) {
|
||||
__mark_reg32_known(dst_reg, var32_off.value);
|
||||
return;
|
||||
}
|
||||
|
||||
/* We get our maximum from the var_off, and our minimum is the
|
||||
* maximum of the operands' minima
|
||||
|
@ -7224,11 +7221,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
|
|||
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
|
||||
s32 smin_val = src_reg->s32_min_value;
|
||||
|
||||
/* Assuming scalar64_min_max_xor will be called so it is safe
|
||||
* to skip updating register for known case.
|
||||
*/
|
||||
if (src_known && dst_known)
|
||||
if (src_known && dst_known) {
|
||||
__mark_reg32_known(dst_reg, var32_off.value);
|
||||
return;
|
||||
}
|
||||
|
||||
/* We get both minimum and maximum from the var32_off. */
|
||||
dst_reg->u32_min_value = var32_off.value;
|
||||
|
@ -13200,6 +13196,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Set of BTF function IDs that tracing programs are refused attachment to:
 * check_attach_btf_id() returns -EINVAL when a BPF_PROG_TYPE_TRACING program
 * targets a btf_id contained in this set. Entries are only emitted under the
 * preprocessor conditions shown next to them.
 */
BTF_SET_START(btf_id_deny)
|
||||
BTF_ID_UNUSED
|
||||
#ifdef CONFIG_SMP
|
||||
BTF_ID(func, migrate_disable)
|
||||
BTF_ID(func, migrate_enable)
|
||||
#endif
|
||||
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
|
||||
BTF_ID(func, rcu_read_unlock_strict)
|
||||
#endif
|
||||
BTF_SET_END(btf_id_deny)
|
||||
|
||||
static int check_attach_btf_id(struct bpf_verifier_env *env)
|
||||
{
|
||||
struct bpf_prog *prog = env->prog;
|
||||
|
@ -13259,6 +13266,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
|
|||
ret = bpf_lsm_verify_prog(&env->log, prog);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
|
||||
btf_id_set_contains(&btf_id_deny, btf_id)) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
|
||||
|
|
|
@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
|
|||
mutex_unlock(&bpf_stats_enabled_mutex);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * sysctl handler for the unprivileged_bpf_disabled knob.
 *
 * The value is parsed into a local copy through proc_dointvec_minmax() and
 * only stored back to table->data once the write has been accepted. Writers
 * need CAP_SYS_ADMIN. When the knob currently holds 1 it is treated as
 * locked: a write of any value other than 1 is rejected with -EPERM.
 *
 * Returns 0 on success or a negative errno.
 */
static int bpf_unpriv_handler(struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	int *knob = table->data;
	int val = *knob;
	const bool was_locked = (val == 1);
	struct ctl_table shadow = *table;
	int err;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Let the generic helper read/write the local copy, not the knob. */
	shadow.data = &val;
	err = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
	if (!write || err)
		return err;

	/* Once set to 1 the knob can no longer move to another value. */
	if (was_locked && val != 1)
		return -EPERM;

	*knob = val;
	return err;
}
|
||||
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
|
||||
|
||||
/*
|
||||
* /proc/sys support
|
||||
|
@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = {
|
|||
.data = &sysctl_unprivileged_bpf_disabled,
|
||||
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
|
||||
.mode = 0644,
|
||||
/* only handle a transition from default "0" to "1" */
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
.proc_handler = bpf_unpriv_handler,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &two,
|
||||
},
|
||||
{
|
||||
.procname = "bpf_stats_enabled",
|
||||
|
|
27
net/Kconfig
27
net/Kconfig
|
@ -302,21 +302,6 @@ config BQL
|
|||
select DQL
|
||||
default y
|
||||
|
||||
config BPF_JIT
|
||||
bool "enable BPF Just In Time compiler"
|
||||
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
|
||||
depends on MODULES
|
||||
help
|
||||
Berkeley Packet Filter filtering capabilities are normally handled
|
||||
by an interpreter. This option allows kernel to generate a native
|
||||
code when filter is loaded in memory. This should speedup
|
||||
packet sniffing (libpcap/tcpdump).
|
||||
|
||||
Note, admin should enable this feature changing:
|
||||
/proc/sys/net/core/bpf_jit_enable
|
||||
/proc/sys/net/core/bpf_jit_harden (optional)
|
||||
/proc/sys/net/core/bpf_jit_kallsyms (optional)
|
||||
|
||||
config BPF_STREAM_PARSER
|
||||
bool "enable BPF STREAM_PARSER"
|
||||
depends on INET
|
||||
|
@ -470,15 +455,3 @@ config ETHTOOL_NETLINK
|
|||
e.g. notification messages.
|
||||
|
||||
endif # if NET
|
||||
|
||||
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
|
||||
# Only one of the two can be selected for a specific arch since eBPF JIT supersedes
|
||||
# the cBPF JIT.
|
||||
|
||||
# Classic BPF JIT (cBPF)
|
||||
config HAVE_CBPF_JIT
|
||||
bool
|
||||
|
||||
# Extended BPF JIT (eBPF)
|
||||
config HAVE_EBPF_JIT
|
||||
bool
|
||||
|
|
|
@ -185,6 +185,7 @@ BTF_ID(func, tcp_reno_cong_avoid)
|
|||
BTF_ID(func, tcp_reno_undo_cwnd)
|
||||
BTF_ID(func, tcp_slow_start)
|
||||
BTF_ID(func, tcp_cong_avoid_ai)
|
||||
#ifdef CONFIG_X86
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE
|
||||
#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
|
||||
BTF_ID(func, cubictcp_init)
|
||||
|
@ -213,6 +214,7 @@ BTF_ID(func, bbr_min_tso_segs)
|
|||
BTF_ID(func, bbr_set_state)
|
||||
#endif
|
||||
#endif /* CONFIG_DYNAMIC_FTRACE */
|
||||
#endif /* CONFIG_X86 */
|
||||
BTF_SET_END(bpf_tcp_ca_kfunc_ids)
|
||||
|
||||
static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)
|
||||
|
|
|
@ -1255,7 +1255,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
|
|||
for (i = 0; i < batch_size; i++) {
|
||||
struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
|
||||
idx + i);
|
||||
tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
|
||||
tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size;
|
||||
tx_desc->len = PKT_SIZE;
|
||||
}
|
||||
|
||||
|
|
|
@ -3216,6 +3216,9 @@ static int add_dummy_ksym_var(struct btf *btf)
|
|||
const struct btf_var_secinfo *vs;
|
||||
const struct btf_type *sec;
|
||||
|
||||
if (!btf)
|
||||
return 0;
|
||||
|
||||
sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC,
|
||||
BTF_KIND_DATASEC);
|
||||
if (sec_btf_id < 0)
|
||||
|
|
|
@ -41,6 +41,11 @@
|
|||
#define ELF_C_READ_MMAP ELF_C_READ
|
||||
#endif
|
||||
|
||||
/* Older libelf all end up in this expression, for both 32 and 64 bit */
|
||||
#ifndef GELF_ST_VISIBILITY
|
||||
#define GELF_ST_VISIBILITY(o) ((o) & 0x03)
|
||||
#endif
|
||||
|
||||
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
|
||||
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
|
||||
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)
|
||||
|
|
|
@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = {
|
|||
.tcp.doff = 5,
|
||||
};
|
||||
|
||||
static int settimeo(int fd, int timeout_ms)
|
||||
int settimeo(int fd, int timeout_ms)
|
||||
{
|
||||
struct timeval timeout = { .tv_sec = 3 };
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ struct ipv6_packet {
|
|||
} __packed;
|
||||
extern struct ipv6_packet pkt_v6;
|
||||
|
||||
int settimeo(int fd, int timeout_ms);
|
||||
int start_server(int family, int type, const char *addr, __u16 port,
|
||||
int timeout_ms);
|
||||
int connect_to_fd(int server_fd, int timeout_ms);
|
||||
|
|
|
@ -0,0 +1,589 @@
|
|||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/*
|
||||
* This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
|
||||
* between src and dst. The netns fwd has veth links to each src and dst. The
|
||||
* client is in src and server in dst. The test installs a TC BPF program to each
|
||||
* host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
|
||||
* neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
|
||||
* switch from ingress side; it also installs a checker prog on the egress side
|
||||
* to drop unexpected traffic.
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <fcntl.h>
|
||||
#include <linux/limits.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <sched.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "test_progs.h"
|
||||
#include "network_helpers.h"
|
||||
#include "test_tc_neigh_fib.skel.h"
|
||||
#include "test_tc_neigh.skel.h"
|
||||
#include "test_tc_peer.skel.h"
|
||||
|
||||
#define NS_SRC "ns_src"
|
||||
#define NS_FWD "ns_fwd"
|
||||
#define NS_DST "ns_dst"
|
||||
|
||||
#define IP4_SRC "172.16.1.100"
|
||||
#define IP4_DST "172.16.2.100"
|
||||
#define IP4_PORT 9004
|
||||
|
||||
#define IP6_SRC "::1:dead:beef:cafe"
|
||||
#define IP6_DST "::2:dead:beef:cafe"
|
||||
#define IP6_PORT 9006
|
||||
|
||||
#define IP4_SLL "169.254.0.1"
|
||||
#define IP4_DLL "169.254.0.2"
|
||||
#define IP4_NET "169.254.0.0"
|
||||
|
||||
#define IFADDR_STR_LEN 18
|
||||
#define PING_ARGS "-c 3 -w 10 -q"
|
||||
|
||||
#define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src"
|
||||
#define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst"
|
||||
#define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk"
|
||||
|
||||
#define TIMEOUT_MILLIS 10000
|
||||
|
||||
#define MAX_PROC_MODS 128
|
||||
#define MAX_PROC_VALUE_LEN 16
|
||||
|
||||
#define log_err(MSG, ...) \
|
||||
fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
|
||||
__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)
|
||||
|
||||
struct proc_mod {
|
||||
char path[PATH_MAX];
|
||||
char oldval[MAX_PROC_VALUE_LEN];
|
||||
int oldlen;
|
||||
};
|
||||
|
||||
static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
|
||||
static int root_netns_fd = -1;
|
||||
static int num_proc_mods;
|
||||
static struct proc_mod proc_mods[MAX_PROC_MODS];
|
||||
|
||||
/**
|
||||
* modify_proc() - Modify entry in /proc
|
||||
*
|
||||
* Modifies an entry in /proc and saves the original value for later
|
||||
* restoration with restore_proc().
|
||||
*/
|
||||
static int modify_proc(const char *path, const char *newval)
|
||||
{
|
||||
struct proc_mod *mod;
|
||||
FILE *f;
|
||||
|
||||
if (num_proc_mods + 1 > MAX_PROC_MODS)
|
||||
return -1;
|
||||
|
||||
f = fopen(path, "r+");
|
||||
if (!f)
|
||||
return -1;
|
||||
|
||||
mod = &proc_mods[num_proc_mods];
|
||||
num_proc_mods++;
|
||||
|
||||
strncpy(mod->path, path, PATH_MAX);
|
||||
|
||||
if (!fread(mod->oldval, 1, MAX_PROC_VALUE_LEN, f)) {
|
||||
log_err("reading from %s failed", path);
|
||||
goto fail;
|
||||
}
|
||||
rewind(f);
|
||||
if (fwrite(newval, strlen(newval), 1, f) != 1) {
|
||||
log_err("writing to %s failed", path);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
fclose(f);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
fclose(f);
|
||||
num_proc_mods--;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* restore_proc() - Restore all /proc modifications
|
||||
*/
|
||||
static void restore_proc(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < num_proc_mods; i++) {
|
||||
struct proc_mod *mod = &proc_mods[i];
|
||||
FILE *f;
|
||||
|
||||
f = fopen(mod->path, "w");
|
||||
if (!f) {
|
||||
log_err("fopen of %s failed", mod->path);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fwrite(mod->oldval, mod->oldlen, 1, f) != 1)
|
||||
log_err("fwrite to %s failed", mod->path);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
num_proc_mods = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* setns_by_name() - Set networks namespace by name
|
||||
*/
|
||||
static int setns_by_name(const char *name)
|
||||
{
|
||||
int nsfd;
|
||||
char nspath[PATH_MAX];
|
||||
int err;
|
||||
|
||||
snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name);
|
||||
nsfd = open(nspath, O_RDONLY | O_CLOEXEC);
|
||||
if (nsfd < 0)
|
||||
return nsfd;
|
||||
|
||||
err = setns(nsfd, CLONE_NEWNET);
|
||||
close(nsfd);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* setns_root() - Set network namespace to original (root) namespace
|
||||
*
|
||||
* Not expected to ever fail, so error not returned, but failure logged
|
||||
* and test marked as failed.
|
||||
*/
|
||||
static void setns_root(void)
|
||||
{
|
||||
ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "setns root");
|
||||
}
|
||||
|
||||
static int netns_setup_namespaces(const char *verb)
|
||||
{
|
||||
const char * const *ns = namespaces;
|
||||
char cmd[128];
|
||||
|
||||
while (*ns) {
|
||||
snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns);
|
||||
if (!ASSERT_OK(system(cmd), cmd))
|
||||
return -1;
|
||||
ns++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Output of netns_setup_links_and_routes(): the values get_ifindex() returned
 * for the "veth_src_fwd" and "veth_dst_fwd" devices.
 */
struct netns_setup_result {
|
||||
int ifindex_veth_src_fwd;
|
||||
int ifindex_veth_dst_fwd;
|
||||
};
|
||||
|
||||
static int get_ifaddr(const char *name, char *ifaddr)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
FILE *f;
|
||||
int ret;
|
||||
|
||||
snprintf(path, PATH_MAX, "/sys/class/net/%s/address", name);
|
||||
f = fopen(path, "r");
|
||||
if (!ASSERT_OK_PTR(f, path))
|
||||
return -1;
|
||||
|
||||
ret = fread(ifaddr, 1, IFADDR_STR_LEN, f);
|
||||
if (!ASSERT_EQ(ret, IFADDR_STR_LEN, "fread ifaddr")) {
|
||||
fclose(f);
|
||||
return -1;
|
||||
}
|
||||
fclose(f);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int get_ifindex(const char *name)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
char buf[32];
|
||||
FILE *f;
|
||||
int ret;
|
||||
|
||||
snprintf(path, PATH_MAX, "/sys/class/net/%s/ifindex", name);
|
||||
f = fopen(path, "r");
|
||||
if (!ASSERT_OK_PTR(f, path))
|
||||
return -1;
|
||||
|
||||
ret = fread(buf, 1, sizeof(buf), f);
|
||||
if (!ASSERT_GT(ret, 0, "fread ifindex")) {
|
||||
fclose(f);
|
||||
return -1;
|
||||
}
|
||||
fclose(f);
|
||||
return atoi(buf);
|
||||
}
|
||||
|
||||
/*
 * SYS() - format a shell command, run it with system() and check the result
 * with ASSERT_OK(), using the command text itself as the assertion name.
 * On failure it jumps to a label named "fail", so every function using this
 * macro has to provide that label. The command is formatted into a local
 * 1024-byte buffer called cmd.
 */
#define SYS(fmt, ...) \
|
||||
({ \
|
||||
char cmd[1024]; \
|
||||
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
|
||||
if (!ASSERT_OK(system(cmd), cmd)) \
|
||||
goto fail; \
|
||||
})
|
||||
|
||||
static int netns_setup_links_and_routes(struct netns_setup_result *result)
|
||||
{
|
||||
char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {};
|
||||
char veth_dst_fwd_addr[IFADDR_STR_LEN+1] = {};
|
||||
|
||||
SYS("ip link add veth_src type veth peer name veth_src_fwd");
|
||||
SYS("ip link add veth_dst type veth peer name veth_dst_fwd");
|
||||
if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr))
|
||||
goto fail;
|
||||
if (get_ifaddr("veth_dst_fwd", veth_dst_fwd_addr))
|
||||
goto fail;
|
||||
|
||||
result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd");
|
||||
if (result->ifindex_veth_src_fwd < 0)
|
||||
goto fail;
|
||||
result->ifindex_veth_dst_fwd = get_ifindex("veth_dst_fwd");
|
||||
if (result->ifindex_veth_dst_fwd < 0)
|
||||
goto fail;
|
||||
|
||||
SYS("ip link set veth_src netns " NS_SRC);
|
||||
SYS("ip link set veth_src_fwd netns " NS_FWD);
|
||||
SYS("ip link set veth_dst_fwd netns " NS_FWD);
|
||||
SYS("ip link set veth_dst netns " NS_DST);
|
||||
|
||||
/** setup in 'src' namespace */
|
||||
if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
|
||||
goto fail;
|
||||
|
||||
SYS("ip addr add " IP4_SRC "/32 dev veth_src");
|
||||
SYS("ip addr add " IP6_SRC "/128 dev veth_src nodad");
|
||||
SYS("ip link set dev veth_src up");
|
||||
|
||||
SYS("ip route add " IP4_DST "/32 dev veth_src scope global");
|
||||
SYS("ip route add " IP4_NET "/16 dev veth_src scope global");
|
||||
SYS("ip route add " IP6_DST "/128 dev veth_src scope global");
|
||||
|
||||
SYS("ip neigh add " IP4_DST " dev veth_src lladdr %s",
|
||||
veth_src_fwd_addr);
|
||||
SYS("ip neigh add " IP6_DST " dev veth_src lladdr %s",
|
||||
veth_src_fwd_addr);
|
||||
|
||||
/** setup in 'fwd' namespace */
|
||||
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
|
||||
goto fail;
|
||||
|
||||
/* The fwd netns automatically gets a v6 LL address / routes, but also
|
||||
* needs v4 one in order to start ARP probing. IP4_NET route is added
|
||||
* to the endpoints so that the ARP processing will reply.
|
||||
*/
|
||||
SYS("ip addr add " IP4_SLL "/32 dev veth_src_fwd");
|
||||
SYS("ip addr add " IP4_DLL "/32 dev veth_dst_fwd");
|
||||
SYS("ip link set dev veth_src_fwd up");
|
||||
SYS("ip link set dev veth_dst_fwd up");
|
||||
|
||||
SYS("ip route add " IP4_SRC "/32 dev veth_src_fwd scope global");
|
||||
SYS("ip route add " IP6_SRC "/128 dev veth_src_fwd scope global");
|
||||
SYS("ip route add " IP4_DST "/32 dev veth_dst_fwd scope global");
|
||||
SYS("ip route add " IP6_DST "/128 dev veth_dst_fwd scope global");
|
||||
|
||||
/** setup in 'dst' namespace */
|
||||
if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
|
||||
goto fail;
|
||||
|
||||
SYS("ip addr add " IP4_DST "/32 dev veth_dst");
|
||||
SYS("ip addr add " IP6_DST "/128 dev veth_dst nodad");
|
||||
SYS("ip link set dev veth_dst up");
|
||||
|
||||
SYS("ip route add " IP4_SRC "/32 dev veth_dst scope global");
|
||||
SYS("ip route add " IP4_NET "/16 dev veth_dst scope global");
|
||||
SYS("ip route add " IP6_SRC "/128 dev veth_dst scope global");
|
||||
|
||||
SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr %s",
|
||||
veth_dst_fwd_addr);
|
||||
SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr %s",
|
||||
veth_dst_fwd_addr);
|
||||
|
||||
setns_root();
|
||||
return 0;
|
||||
fail:
|
||||
setns_root();
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int netns_load_bpf(void)
|
||||
{
|
||||
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
|
||||
return -1;
|
||||
|
||||
SYS("tc qdisc add dev veth_src_fwd clsact");
|
||||
SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned "
|
||||
SRC_PROG_PIN_FILE);
|
||||
SYS("tc filter add dev veth_src_fwd egress bpf da object-pinned "
|
||||
CHK_PROG_PIN_FILE);
|
||||
|
||||
SYS("tc qdisc add dev veth_dst_fwd clsact");
|
||||
SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned "
|
||||
DST_PROG_PIN_FILE);
|
||||
SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
|
||||
CHK_PROG_PIN_FILE);
|
||||
|
||||
setns_root();
|
||||
return -1;
|
||||
fail:
|
||||
setns_root();
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int netns_unload_bpf(void)
|
||||
{
|
||||
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
|
||||
goto fail;
|
||||
SYS("tc qdisc delete dev veth_src_fwd clsact");
|
||||
SYS("tc qdisc delete dev veth_dst_fwd clsact");
|
||||
|
||||
setns_root();
|
||||
return 0;
|
||||
fail:
|
||||
setns_root();
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/*
 * TCP round trip between the namespaces: start a listener on @addr:@port in
 * the dst namespace, connect to it from the src namespace, send one message
 * from the client side and read it back on the accepted socket.
 *
 * Results are reported through the ASSERT_*() macros only. Once the dst
 * namespace has been entered, the root namespace is restored and all opened
 * sockets are closed before returning.
 */
static void test_tcp(int family, const char *addr, __u16 port)
{
	char msg[] = "testing testing";
	int srv_fd = -1;
	int conn_fd = -1;
	int cli_fd = -1;
	int nbytes;

	if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
		return;

	srv_fd = start_server(family, SOCK_STREAM, addr, port, 0);
	if (!ASSERT_GE(srv_fd, 0, "listen"))
		goto done;

	/* The client side lives in the src namespace. */
	if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
		goto done;

	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
		goto done;

	conn_fd = accept(srv_fd, NULL, NULL);
	if (!ASSERT_GE(conn_fd, 0, "accept"))
		goto done;

	if (!ASSERT_OK(settimeo(conn_fd, TIMEOUT_MILLIS), "settimeo"))
		goto done;

	nbytes = write(cli_fd, msg, sizeof(msg));
	if (!ASSERT_EQ(nbytes, sizeof(msg), "send to server"))
		goto done;

	nbytes = read(conn_fd, msg, sizeof(msg));
	ASSERT_EQ(nbytes, sizeof(msg), "recv from server");

done:
	setns_root();
	if (srv_fd >= 0)
		close(srv_fd);
	if (conn_fd >= 0)
		close(conn_fd);
	if (cli_fd >= 0)
		close(cli_fd);
}
|
||||
|
||||
static int test_ping(int family, const char *addr)
|
||||
{
|
||||
const char *ping = family == AF_INET6 ? "ping6" : "ping";
|
||||
|
||||
SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s", ping, addr);
|
||||
return 0;
|
||||
fail:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void test_connectivity(void)
|
||||
{
|
||||
test_tcp(AF_INET, IP4_DST, IP4_PORT);
|
||||
test_ping(AF_INET, IP4_DST);
|
||||
test_tcp(AF_INET6, IP6_DST, IP6_PORT);
|
||||
test_ping(AF_INET6, IP6_DST);
|
||||
}
|
||||
|
||||
static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
|
||||
{
|
||||
struct test_tc_neigh_fib *skel;
|
||||
int err;
|
||||
|
||||
skel = test_tc_neigh_fib__open();
|
||||
if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
|
||||
return;
|
||||
|
||||
if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) {
|
||||
test_tc_neigh_fib__destroy(skel);
|
||||
return;
|
||||
}
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
if (netns_load_bpf())
|
||||
goto done;
|
||||
|
||||
/* bpf_fib_lookup() checks if forwarding is enabled */
|
||||
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
|
||||
goto done;
|
||||
|
||||
err = modify_proc("/proc/sys/net/ipv4/ip_forward", "1");
|
||||
if (!ASSERT_OK(err, "set ipv4.ip_forward"))
|
||||
goto done;
|
||||
|
||||
err = modify_proc("/proc/sys/net/ipv6/conf/all/forwarding", "1");
|
||||
if (!ASSERT_OK(err, "set ipv6.forwarding"))
|
||||
goto done;
|
||||
setns_root();
|
||||
|
||||
test_connectivity();
|
||||
done:
|
||||
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
test_tc_neigh_fib__destroy(skel);
|
||||
netns_unload_bpf();
|
||||
setns_root();
|
||||
restore_proc();
|
||||
}
|
||||
|
||||
static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
|
||||
{
|
||||
struct test_tc_neigh *skel;
|
||||
int err;
|
||||
|
||||
skel = test_tc_neigh__open();
|
||||
if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
|
||||
return;
|
||||
|
||||
skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
|
||||
skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
|
||||
|
||||
err = test_tc_neigh__load(skel);
|
||||
if (!ASSERT_OK(err, "test_tc_neigh__load")) {
|
||||
test_tc_neigh__destroy(skel);
|
||||
return;
|
||||
}
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
if (netns_load_bpf())
|
||||
goto done;
|
||||
|
||||
test_connectivity();
|
||||
|
||||
done:
|
||||
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
test_tc_neigh__destroy(skel);
|
||||
netns_unload_bpf();
|
||||
setns_root();
|
||||
}
|
||||
|
||||
static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
|
||||
{
|
||||
struct test_tc_peer *skel;
|
||||
int err;
|
||||
|
||||
skel = test_tc_peer__open();
|
||||
if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
|
||||
return;
|
||||
|
||||
skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
|
||||
skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
|
||||
|
||||
err = test_tc_peer__load(skel);
|
||||
if (!ASSERT_OK(err, "test_tc_peer__load")) {
|
||||
test_tc_peer__destroy(skel);
|
||||
return;
|
||||
}
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
|
||||
goto done;
|
||||
|
||||
if (netns_load_bpf())
|
||||
goto done;
|
||||
|
||||
test_connectivity();
|
||||
|
||||
done:
|
||||
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
|
||||
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
|
||||
test_tc_peer__destroy(skel);
|
||||
netns_unload_bpf();
|
||||
setns_root();
|
||||
}
|
||||
|
||||
void test_tc_redirect(void)
|
||||
{
|
||||
struct netns_setup_result setup_result;
|
||||
|
||||
root_netns_fd = open("/proc/self/ns/net", O_RDONLY);
|
||||
if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net"))
|
||||
return;
|
||||
|
||||
if (netns_setup_namespaces("add"))
|
||||
goto done;
|
||||
|
||||
if (netns_setup_links_and_routes(&setup_result))
|
||||
goto done;
|
||||
|
||||
if (test__start_subtest("tc_redirect_peer"))
|
||||
test_tc_redirect_peer(&setup_result);
|
||||
|
||||
if (test__start_subtest("tc_redirect_neigh"))
|
||||
test_tc_redirect_neigh(&setup_result);
|
||||
|
||||
if (test__start_subtest("tc_redirect_neigh_fib"))
|
||||
test_tc_redirect_neigh_fib(&setup_result);
|
||||
|
||||
done:
|
||||
close(root_netns_fd);
|
||||
netns_setup_namespaces("delete");
|
||||
}
|
|
@ -33,17 +33,8 @@
|
|||
a.s6_addr32[3] == b.s6_addr32[3])
|
||||
#endif
|
||||
|
||||
enum {
|
||||
dev_src,
|
||||
dev_dst,
|
||||
};
|
||||
|
||||
struct bpf_map_def SEC("maps") ifindex_map = {
|
||||
.type = BPF_MAP_TYPE_ARRAY,
|
||||
.key_size = sizeof(int),
|
||||
.value_size = sizeof(int),
|
||||
.max_entries = 2,
|
||||
};
|
||||
static volatile const __u32 IFINDEX_SRC;
|
||||
static volatile const __u32 IFINDEX_DST;
|
||||
|
||||
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
|
||||
__be32 addr)
|
||||
|
@ -79,14 +70,8 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
|
|||
return v6_equal(ip6h->daddr, addr);
|
||||
}
|
||||
|
||||
static __always_inline int get_dev_ifindex(int which)
|
||||
{
|
||||
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
|
||||
|
||||
return ifindex ? *ifindex : 0;
|
||||
}
|
||||
|
||||
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
|
||||
SEC("classifier/chk_egress")
|
||||
int tc_chk(struct __sk_buff *skb)
|
||||
{
|
||||
void *data_end = ctx_ptr(skb->data_end);
|
||||
void *data = ctx_ptr(skb->data);
|
||||
|
@ -98,7 +83,8 @@ SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
|
|||
return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK;
|
||||
}
|
||||
|
||||
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
|
||||
SEC("classifier/dst_ingress")
|
||||
int tc_dst(struct __sk_buff *skb)
|
||||
{
|
||||
__u8 zero[ETH_ALEN * 2];
|
||||
bool redirect = false;
|
||||
|
@ -119,10 +105,11 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
|
|||
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
|
||||
return TC_ACT_SHOT;
|
||||
|
||||
return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0);
|
||||
return bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0);
|
||||
}
|
||||
|
||||
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
|
||||
SEC("classifier/src_ingress")
|
||||
int tc_src(struct __sk_buff *skb)
|
||||
{
|
||||
__u8 zero[ETH_ALEN * 2];
|
||||
bool redirect = false;
|
||||
|
@ -143,7 +130,7 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
|
|||
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
|
||||
return TC_ACT_SHOT;
|
||||
|
||||
return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0);
|
||||
return bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0);
|
||||
}
|
||||
|
||||
char __license[] SEC("license") = "GPL";
|
||||
|
|
|
@ -75,7 +75,8 @@ static __always_inline int fill_fib_params_v6(struct __sk_buff *skb,
|
|||
return 0;
|
||||
}
|
||||
|
||||
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
|
||||
SEC("classifier/chk_egress")
|
||||
int tc_chk(struct __sk_buff *skb)
|
||||
{
|
||||
void *data_end = ctx_ptr(skb->data_end);
|
||||
void *data = ctx_ptr(skb->data);
|
||||
|
@ -142,12 +143,14 @@ static __always_inline int tc_redir(struct __sk_buff *skb)
|
|||
/* these are identical, but keep them separate for compatibility with the
|
||||
* section names expected by test_tc_redirect.sh
|
||||
*/
|
||||
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
|
||||
SEC("classifier/dst_ingress")
|
||||
int tc_dst(struct __sk_buff *skb)
|
||||
{
|
||||
return tc_redir(skb);
|
||||
}
|
||||
|
||||
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
|
||||
SEC("classifier/src_ingress")
|
||||
int tc_src(struct __sk_buff *skb)
|
||||
{
|
||||
return tc_redir(skb);
|
||||
}
|
||||
|
|
|
@ -8,38 +8,25 @@
|
|||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
enum {
|
||||
dev_src,
|
||||
dev_dst,
|
||||
};
|
||||
static volatile const __u32 IFINDEX_SRC;
|
||||
static volatile const __u32 IFINDEX_DST;
|
||||
|
||||
struct bpf_map_def SEC("maps") ifindex_map = {
|
||||
.type = BPF_MAP_TYPE_ARRAY,
|
||||
.key_size = sizeof(int),
|
||||
.value_size = sizeof(int),
|
||||
.max_entries = 2,
|
||||
};
|
||||
|
||||
static __always_inline int get_dev_ifindex(int which)
|
||||
{
|
||||
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
|
||||
|
||||
return ifindex ? *ifindex : 0;
|
||||
}
|
||||
|
||||
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
|
||||
SEC("classifier/chk_egress")
|
||||
int tc_chk(struct __sk_buff *skb)
|
||||
{
|
||||
return TC_ACT_SHOT;
|
||||
}
|
||||
|
||||
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
|
||||
SEC("classifier/dst_ingress")
|
||||
int tc_dst(struct __sk_buff *skb)
|
||||
{
|
||||
return bpf_redirect_peer(get_dev_ifindex(dev_src), 0);
|
||||
return bpf_redirect_peer(IFINDEX_SRC, 0);
|
||||
}
|
||||
|
||||
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
|
||||
SEC("classifier/src_ingress")
|
||||
int tc_src(struct __sk_buff *skb)
|
||||
{
|
||||
return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0);
|
||||
return bpf_redirect_peer(IFINDEX_DST, 0);
|
||||
}
|
||||
|
||||
char __license[] SEC("license") = "GPL";
|
||||
|
|
|
@ -1,216 +0,0 @@
|
|||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
|
||||
# between src and dst. The netns fwd has veth links to each src and dst. The
|
||||
# client is in src and server in dst. The test installs a TC BPF program to each
|
||||
# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
|
||||
# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
|
||||
# switch from ingress side; it also installs a checker prog on the egress side
|
||||
# to drop unexpected traffic.
|
||||
|
||||
# Refuse to run without root privileges.
if (( EUID != 0 )); then
	echo "This script must be run as root"
	echo "FAIL"
	exit 1
fi

# check that needed tools are present
for tool in nc dd timeout ping perl jq bpftool; do
	if ! command -v "${tool}" >/dev/null 2>&1; then
		echo >&2 "${tool} is not available"
		exit 1
	fi
done
unset tool

# Use ping6 when it is installed, otherwise fall back to plain ping.
PING6=ping
if command -v ping6 >/dev/null 2>&1; then
	PING6=ping6
fi

# Escape sequences used to colour the PASS/FAIL output.
declare -r GREEN='\033[0;92m'
declare -r RED='\033[0;31m'
declare -r NC='\033[0m' # No Color

# Argument string handed to ping; expanded unquoted so it splits into words.
declare -r PING_ARG="-c 3 -w 10 -q"

# Duration handed to timeout(1) for the TCP transfers.
declare -r TIMEOUT=10

# Namespace names carry a random suffix generated by mktemp -u.
declare -r NS_SRC="ns-src-$(mktemp -u XXXXXX)"
declare -r NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
declare -r NS_DST="ns-dst-$(mktemp -u XXXXXX)"

# Endpoint addresses.
declare -r IP4_SRC="172.16.1.100"
declare -r IP4_DST="172.16.2.100"

declare -r IP6_SRC="::1:dead:beef:cafe"
declare -r IP6_DST="::2:dead:beef:cafe"

# IPv4 addresses given to the two veth devices in the fwd namespace, and the
# network they belong to.
declare -r IP4_SLL="169.254.0.1"
declare -r IP4_DLL="169.254.0.2"
declare -r IP4_NET="169.254.0.0"
|
||||
|
||||
# Delete the three network namespaces used by the test.
# Globals: NS_SRC, NS_FWD, NS_DST (read)
# Returns: exit status of the last "ip netns del"
netns_cleanup()
{
	ip netns del "${NS_SRC}"
	ip netns del "${NS_FWD}"
	ip netns del "${NS_DST}"
}
|
||||
|
||||
# Build the src <-> fwd <-> dst topology: three namespaces, two veth pairs,
# addresses, routes and static neighbour entries.
# Globals: NS_SRC, NS_FWD, NS_DST, IP4_SRC, IP4_DST, IP6_SRC, IP6_DST,
#          IP4_SLL, IP4_DLL, IP4_NET (read)
# Returns: exit status of the last command; under "set -e" the first failing
#          command aborts the script.
netns_setup()
{
	# MAC addresses of the fwd-side veth devices; kept local so they do
	# not leak into the global scope.
	local fmac_src fmac_dst

	ip netns add "${NS_SRC}"
	ip netns add "${NS_FWD}"
	ip netns add "${NS_DST}"

	ip link add veth_src type veth peer name veth_src_fwd
	ip link add veth_dst type veth peer name veth_dst_fwd

	ip link set veth_src netns "${NS_SRC}"
	ip link set veth_src_fwd netns "${NS_FWD}"

	ip link set veth_dst netns "${NS_DST}"
	ip link set veth_dst_fwd netns "${NS_FWD}"

	ip -netns "${NS_SRC}" addr add "${IP4_SRC}/32" dev veth_src
	ip -netns "${NS_DST}" addr add "${IP4_DST}/32" dev veth_dst

	# The fwd netns automatically get a v6 LL address / routes, but also
	# needs v4 one in order to start ARP probing. IP4_NET route is added
	# to the endpoints so that the ARP processing will reply.

	ip -netns "${NS_FWD}" addr add "${IP4_SLL}/32" dev veth_src_fwd
	ip -netns "${NS_FWD}" addr add "${IP4_DLL}/32" dev veth_dst_fwd

	ip -netns "${NS_SRC}" addr add "${IP6_SRC}/128" dev veth_src nodad
	ip -netns "${NS_DST}" addr add "${IP6_DST}/128" dev veth_dst nodad

	ip -netns "${NS_SRC}" link set dev veth_src up
	ip -netns "${NS_FWD}" link set dev veth_src_fwd up

	ip -netns "${NS_DST}" link set dev veth_dst up
	ip -netns "${NS_FWD}" link set dev veth_dst_fwd up

	ip -netns "${NS_SRC}" route add "${IP4_DST}/32" dev veth_src scope global
	ip -netns "${NS_SRC}" route add "${IP4_NET}/16" dev veth_src scope global
	ip -netns "${NS_FWD}" route add "${IP4_SRC}/32" dev veth_src_fwd scope global

	ip -netns "${NS_SRC}" route add "${IP6_DST}/128" dev veth_src scope global
	ip -netns "${NS_FWD}" route add "${IP6_SRC}/128" dev veth_src_fwd scope global

	ip -netns "${NS_DST}" route add "${IP4_SRC}/32" dev veth_dst scope global
	ip -netns "${NS_DST}" route add "${IP4_NET}/16" dev veth_dst scope global
	ip -netns "${NS_FWD}" route add "${IP4_DST}/32" dev veth_dst_fwd scope global

	ip -netns "${NS_DST}" route add "${IP6_SRC}/128" dev veth_dst scope global
	ip -netns "${NS_FWD}" route add "${IP6_DST}/128" dev veth_dst_fwd scope global

	# Declaration and assignment are separate so a failing command
	# substitution is not masked by "local".
	fmac_src=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_src_fwd/address)
	fmac_dst=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_dst_fwd/address)

	# Point each endpoint's neighbour entry for the remote address at the
	# MAC of the fwd-side veth device it is connected to.
	ip -netns "${NS_SRC}" neigh add "${IP4_DST}" dev veth_src lladdr "${fmac_src}"
	ip -netns "${NS_DST}" neigh add "${IP4_SRC}" dev veth_dst lladdr "${fmac_dst}"

	ip -netns "${NS_SRC}" neigh add "${IP6_DST}" dev veth_src lladdr "${fmac_src}"
	ip -netns "${NS_DST}" neigh add "${IP6_SRC}" dev veth_dst lladdr "${fmac_dst}"
}
|
||||
|
||||
# Run one connectivity probe and report the outcome.
# Arguments: $1  - label printed in the PASS/FAIL line
#            $2+ - command (with arguments) to execute
# Globals:   TEST (written), RED, GREEN, NC (read)
# Outputs:   "<label>: PASS" or "<label>: FAIL" on stdout
# Exits the script with status 1 when the command fails.
_netns_report_check()
{
	TEST=$1
	shift

	if ! "$@"; then
		echo -e "${TEST}: ${RED}FAIL${NC}"
		exit 1
	fi
	echo -e "${TEST}: ${GREEN}PASS${NC}"
}

# Verify TCP and ICMP connectivity from NS_SRC to NS_DST over IPv4 and IPv6.
# Globals: NS_SRC, NS_DST, IP4_DST, IP6_DST, TIMEOUT, PING_ARG, PING6 (read),
#          TEST (written)
# Outputs: one PASS/FAIL line per probe on stdout
# "set -e" is switched off for the duration of the probes and switched back
# on before returning; the first failing probe exits the script with status 1.
netns_test_connectivity()
{
	set +e

	# Background listeners for the two TCP probes.
	ip netns exec "${NS_DST}" bash -c "nc -4 -l -p 9004 &"
	ip netns exec "${NS_DST}" bash -c "nc -6 -l -p 9006 &"

	_netns_report_check "TCPv4 connectivity test" \
		ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"

	_netns_report_check "TCPv6 connectivity test" \
		ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"

	# PING_ARG holds several options and must be word-split.
	# shellcheck disable=SC2086
	_netns_report_check "ICMPv4 connectivity test" \
		ip netns exec "${NS_SRC}" ping ${PING_ARG} "${IP4_DST}"

	# shellcheck disable=SC2086
	_netns_report_check "ICMPv6 connectivity test" \
		ip netns exec "${NS_SRC}" "${PING6}" ${PING_ARG} "${IP6_DST}"

	set -e
}
|
||||
|
||||
# Print the in-memory bytes of an integer as space-separated two-digit hex
# values.
# Arguments: $1 - integer to encode
# Outputs:   the hex bytes on stdout, without a trailing newline
# The value is packed with Perl's "L" template (unsigned 32-bit, native byte
# order), so the byte order follows the host. Callers expand the output
# unquoted so that each byte becomes a separate word.
hex_mem_str()
{
	perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' "$1"
}
|
||||
|
||||
# Attach the tc BPF programs from an object file to both veth devices in the
# fwd namespace, then either enable forwarding or fill in the ifindex map.
# Arguments: $1 - BPF object file
#            $2 - optional; 1 enables IPv4/IPv6 forwarding in the fwd
#                 namespace and skips the map update (default 0)
# Globals:   NS_FWD (read)
# Returns:   0 after enabling forwarding; otherwise the status of the last
#            command. Under "set -e" the first failing command aborts the
#            script.
netns_setup_bpf()
{
	local obj=$1
	local use_forwarding=${2:-0}
	local veth_src veth_dst progs prog map

	ip netns exec "${NS_FWD}" tc qdisc add dev veth_src_fwd clsact
	ip netns exec "${NS_FWD}" tc filter add dev veth_src_fwd ingress bpf da obj "${obj}" sec src_ingress
	ip netns exec "${NS_FWD}" tc filter add dev veth_src_fwd egress bpf da obj "${obj}" sec chk_egress

	ip netns exec "${NS_FWD}" tc qdisc add dev veth_dst_fwd clsact
	ip netns exec "${NS_FWD}" tc filter add dev veth_dst_fwd ingress bpf da obj "${obj}" sec dst_ingress
	ip netns exec "${NS_FWD}" tc filter add dev veth_dst_fwd egress bpf da obj "${obj}" sec chk_egress

	if [[ "${use_forwarding}" -eq 1 ]]; then
		# bpf_fib_lookup() checks if forwarding is enabled
		ip netns exec "${NS_FWD}" sysctl -w net.ipv4.ip_forward=1
		ip netns exec "${NS_FWD}" sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1
		ip netns exec "${NS_FWD}" sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1
		return 0
	fi

	veth_src=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_src_fwd/ifindex)
	veth_dst=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_dst_fwd/ifindex)

	progs=$(ip netns exec "${NS_FWD}" bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
	# progs is a whitespace-separated list of ids and is split on purpose.
	# shellcheck disable=SC2086
	for prog in ${progs}; do
		map=$(bpftool prog show id "${prog}" --json | jq -r '.map_ids | .? | .[]')
		if [[ -n "${map}" ]]; then
			# Key 0 holds the src ifindex, key 1 the dst ifindex.
			# hex_mem_str output is left unquoted so each hex
			# byte is passed to bpftool as its own argument.
			# shellcheck disable=SC2046
			bpftool map update id "${map}" key hex $(hex_mem_str 0) value hex $(hex_mem_str "${veth_src}")
			# shellcheck disable=SC2046
			bpftool map update id "${map}" key hex $(hex_mem_str 1) value hex $(hex_mem_str "${veth_dst}")
		fi
	done
}
|
||||
|
||||
# Always try to remove the namespaces when the script exits.
trap netns_cleanup EXIT
set -e

# Build the topology, attach the given BPF object and check connectivity.
# Arguments are forwarded unchanged to netns_setup_bpf.
run_redirect_case()
{
	netns_setup
	netns_setup_bpf "$@"
	netns_test_connectivity
}

run_redirect_case test_tc_neigh.o
netns_cleanup
run_redirect_case test_tc_neigh_fib.o 1
netns_cleanup
# No explicit cleanup after the last case: the EXIT trap takes care of it.
run_redirect_case test_tc_peer.o
|
Loading…
Reference in New Issue