2014-12-02 07:06:37 +08:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <libelf.h>
|
|
|
|
#include <gelf.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdbool.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <stdlib.h>
|
2014-12-02 07:06:37 +08:00
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/filter.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <linux/perf_event.h>
|
2016-12-08 07:53:14 +08:00
|
|
|
#include <linux/netlink.h>
|
|
|
|
#include <linux/rtnetlink.h>
|
2017-05-01 17:26:15 +08:00
|
|
|
#include <linux/types.h>
|
2016-12-08 07:53:14 +08:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/socket.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <poll.h>
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
#include <ctype.h>
|
2017-04-15 01:30:28 +08:00
|
|
|
#include <assert.h>
|
2014-12-02 07:06:37 +08:00
|
|
|
#include "libbpf.h"
|
|
|
|
#include "bpf_load.h"
|
2016-12-09 10:46:19 +08:00
|
|
|
#include "perf-sys.h"
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
#define DEBUGFS "/sys/kernel/debug/tracing/"
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
static char license[128];
|
2015-03-26 03:49:23 +08:00
|
|
|
static int kern_version;
|
2014-12-02 07:06:37 +08:00
|
|
|
static bool processed_sec[128];
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allow the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
char bpf_log_buf[BPF_LOG_BUF_SIZE];
|
2014-12-02 07:06:37 +08:00
|
|
|
int map_fd[MAX_MAPS];
|
|
|
|
int prog_fd[MAX_PROGS];
|
2015-03-26 03:49:23 +08:00
|
|
|
int event_fd[MAX_PROGS];
|
2014-12-02 07:06:37 +08:00
|
|
|
int prog_cnt;
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
int prog_array_fd = -1;
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
struct bpf_map_data map_data[MAX_MAPS];
|
|
|
|
int map_data_count = 0;
|
|
|
|
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
static int populate_prog_array(const char *event, int prog_fd)
|
|
|
|
{
|
|
|
|
int ind = atoi(event), err;
|
|
|
|
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allow the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (err < 0) {
|
|
|
|
printf("failed to store prog_fd in prog_array\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
|
|
|
|
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
|
|
|
|
{
|
|
|
|
bool is_socket = strncmp(event, "socket", 6) == 0;
|
2015-03-26 03:49:23 +08:00
|
|
|
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
|
|
|
|
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
|
2016-04-07 09:43:29 +08:00
|
|
|
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
bool is_xdp = strncmp(event, "xdp", 3) == 0;
|
2016-09-02 09:37:25 +08:00
|
|
|
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
|
2016-12-02 00:48:07 +08:00
|
|
|
bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
|
|
|
|
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
|
bpf: BPF support for sock_ops
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.
Alghough there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it oes not require
application changes and it can be updated easily at any time.
Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called
only once during the connections's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code. For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.
The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.
This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.
This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.
Two new corresponding structs (one for the kernel one for the user/BPF
program):
/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
};
/* user version
* Some fields are in network byte order reflecting the sock struct
* Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
* convert them to host byte order.
*/
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4; /* In network byte order */
__u32 local_ip4; /* In network byte order */
__u32 remote_ip6[4]; /* In network byte order */
__u32 local_ip6[4]; /* In network byte order */
__u32 remote_port; /* In network byte order */
__u32 local_port; /* In host byte horder */
};
Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.
The reply fields of the bpf_sockt_ops struct are there in case a bpf
program needs to return a value larger than an integer.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-01 11:02:40 +08:00
|
|
|
bool is_sockops = strncmp(event, "sockops", 7) == 0;
|
2017-08-16 13:33:32 +08:00
|
|
|
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
|
samples/bpf: Switch over to libbpf
Now that libbpf under tools/lib/bpf/* is synced with the version from
samples/bpf, we can get rid most of the libbpf library here.
Committer notes:
Built it in a docker fedora rawhide container and ran it in the f25 host, seems
to work just like it did before this patch, i.e. the switch to tools/lib/bpf/
doesn't seem to have introduced problems and Joe said he tested it with
all the entries in samples/bpf/ and other code he found:
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux headers_install
<SNIP>
[root@f5065a7d6272 linux]# rm -rf /tmp/build/linux/samples/bpf/
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux samples/bpf/
make[1]: Entering directory '/tmp/build/linux'
CHK include/config/kernel.release
HOSTCC scripts/basic/fixdep
GEN ./Makefile
CHK include/generated/uapi/linux/version.h
Using /git/linux as source for kernel
CHK include/generated/utsrelease.h
HOSTCC scripts/basic/bin2c
HOSTCC arch/x86/tools/relocs_32.o
HOSTCC arch/x86/tools/relocs_64.o
LD samples/bpf/built-in.o
<SNIP>
HOSTCC samples/bpf/fds_example.o
HOSTCC samples/bpf/sockex1_user.o
/git/linux/samples/bpf/fds_example.c: In function 'bpf_prog_create':
/git/linux/samples/bpf/fds_example.c:63:6: warning: passing argument 2 of 'bpf_load_program' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
insns, insns_cnt, "GPL", 0,
^~~~~
In file included from /git/linux/samples/bpf/libbpf.h:5:0,
from /git/linux/samples/bpf/bpf_load.h:4,
from /git/linux/samples/bpf/fds_example.c:15:
/git/linux/tools/lib/bpf/bpf.h:31:5: note: expected 'struct bpf_insn *' but argument is of type 'const struct bpf_insn *'
int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
^~~~~~~~~~~~~~~~
HOSTCC samples/bpf/sockex2_user.o
<SNIP>
HOSTCC samples/bpf/xdp_tx_iptunnel_user.o
clang -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/6.2.1/include -I/git/linux/arch/x86/include -I./arch/x86/include/generated/uapi -I./arch/x86/include/generated -I/git/linux/include -I./include -I/git/linux/arch/x86/include/uapi -I/git/linux/include/uapi -I./include/generated/uapi -include /git/linux/include/linux/kconfig.h \
-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
-O2 -emit-llvm -c /git/linux/samples/bpf/sockex1_kern.c -o -| llc -march=bpf -filetype=obj -o samples/bpf/sockex1_kern.o
HOSTLD samples/bpf/tc_l2_redirect
<SNIP>
HOSTLD samples/bpf/lwt_len_hist
HOSTLD samples/bpf/xdp_tx_iptunnel
make[1]: Leaving directory '/tmp/build/linux'
[root@f5065a7d6272 linux]#
And then, in the host:
[root@jouet bpf]# mount | grep "docker.*devicemapper\/"
/dev/mapper/docker-253:0-1705076-9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 on /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 type xfs (rw,relatime,context="system_u:object_r:container_file_t:s0:c73,c276",nouuid,attr2,inode64,sunit=1024,swidth=1024,noquota)
[root@jouet bpf]# cd /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9/rootfs/tmp/build/linux/samples/bpf/
[root@jouet bpf]# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=f423d171e0487b2f802b6a792657f0f3c8f6d155, not stripped
[root@jouet bpf]# readelf -SW offwaketime
offwaketime offwaketime_kern.o offwaketime_user.o
[root@jouet bpf]# readelf -SW offwaketime_kern.o
There are 11 section headers, starting at offset 0x700:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .strtab STRTAB 0000000000000000 000658 0000a8 00 0 0 1
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 4] .relkprobe/try_to_wake_up REL 0000000000000000 0005a8 000020 10 10 3 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 6] .reltracepoint/sched/sched_switch REL 0000000000000000 0005c8 000090 10 10 5 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
[10] .symtab SYMTAB 0000000000000000 000488 000120 18 1 4 8
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
[root@jouet bpf]# ./offwaketime | head -3
qemu-system-x86;entry_SYSCALL_64_fastpath;sys_ppoll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel;start_cpu;;swapper/0 4
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 1
swapper/2;start_cpu;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 61
[root@jouet bpf]#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/joestringer/linux/commit/5c40f54a52b1f437123c81e21873f4b4b1f9bd55.patch
Link: http://lkml.kernel.org/n/tip-xr8twtx7sjh5821g8qw47yxk@git.kernel.org
[ Use -I$(srctree)/tools/lib/ to support out of source code tree builds, as noticed by Wang Nan ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:39 +08:00
|
|
|
size_t insns_cnt = size / sizeof(struct bpf_insn);
|
2015-03-26 03:49:23 +08:00
|
|
|
enum bpf_prog_type prog_type;
|
|
|
|
char buf[256];
|
|
|
|
int fd, efd, err, id;
|
|
|
|
struct perf_event_attr attr = {};
|
|
|
|
|
|
|
|
attr.type = PERF_TYPE_TRACEPOINT;
|
|
|
|
attr.sample_type = PERF_SAMPLE_RAW;
|
|
|
|
attr.sample_period = 1;
|
|
|
|
attr.wakeup_events = 1;
|
|
|
|
|
|
|
|
if (is_socket) {
|
|
|
|
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
|
|
|
|
} else if (is_kprobe || is_kretprobe) {
|
|
|
|
prog_type = BPF_PROG_TYPE_KPROBE;
|
2016-04-07 09:43:29 +08:00
|
|
|
} else if (is_tracepoint) {
|
|
|
|
prog_type = BPF_PROG_TYPE_TRACEPOINT;
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
} else if (is_xdp) {
|
|
|
|
prog_type = BPF_PROG_TYPE_XDP;
|
2016-09-02 09:37:25 +08:00
|
|
|
} else if (is_perf_event) {
|
|
|
|
prog_type = BPF_PROG_TYPE_PERF_EVENT;
|
2016-12-02 00:48:07 +08:00
|
|
|
} else if (is_cgroup_skb) {
|
|
|
|
prog_type = BPF_PROG_TYPE_CGROUP_SKB;
|
|
|
|
} else if (is_cgroup_sk) {
|
|
|
|
prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
|
bpf: BPF support for sock_ops
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.
Alghough there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it oes not require
application changes and it can be updated easily at any time.
Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called
only once during the connections's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code. For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.
The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.
This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.
This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.
Two new corresponding structs (one for the kernel one for the user/BPF
program):
/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
};
/* user version
* Some fields are in network byte order reflecting the sock struct
* Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
* convert them to host byte order.
*/
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4; /* In network byte order */
__u32 local_ip4; /* In network byte order */
__u32 remote_ip6[4]; /* In network byte order */
__u32 local_ip6[4]; /* In network byte order */
__u32 remote_port; /* In network byte order */
__u32 local_port; /* In host byte horder */
};
Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.
The reply fields of the bpf_sockt_ops struct are there in case a bpf
program needs to return a value larger than an integer.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-01 11:02:40 +08:00
|
|
|
} else if (is_sockops) {
|
|
|
|
prog_type = BPF_PROG_TYPE_SOCK_OPS;
|
2017-08-16 13:33:32 +08:00
|
|
|
} else if (is_sk_skb) {
|
|
|
|
prog_type = BPF_PROG_TYPE_SK_SKB;
|
2015-03-26 03:49:23 +08:00
|
|
|
} else {
|
|
|
|
printf("Unknown event '%s'\n", event);
|
2014-12-02 07:06:37 +08:00
|
|
|
return -1;
|
2015-03-26 03:49:23 +08:00
|
|
|
}
|
|
|
|
|
samples/bpf: Switch over to libbpf
Now that libbpf under tools/lib/bpf/* is synced with the version from
samples/bpf, we can get rid most of the libbpf library here.
Committer notes:
Built it in a docker fedora rawhide container and ran it in the f25 host, seems
to work just like it did before this patch, i.e. the switch to tools/lib/bpf/
doesn't seem to have introduced problems and Joe said he tested it with
all the entries in samples/bpf/ and other code he found:
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux headers_install
<SNIP>
[root@f5065a7d6272 linux]# rm -rf /tmp/build/linux/samples/bpf/
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux samples/bpf/
make[1]: Entering directory '/tmp/build/linux'
CHK include/config/kernel.release
HOSTCC scripts/basic/fixdep
GEN ./Makefile
CHK include/generated/uapi/linux/version.h
Using /git/linux as source for kernel
CHK include/generated/utsrelease.h
HOSTCC scripts/basic/bin2c
HOSTCC arch/x86/tools/relocs_32.o
HOSTCC arch/x86/tools/relocs_64.o
LD samples/bpf/built-in.o
<SNIP>
HOSTCC samples/bpf/fds_example.o
HOSTCC samples/bpf/sockex1_user.o
/git/linux/samples/bpf/fds_example.c: In function 'bpf_prog_create':
/git/linux/samples/bpf/fds_example.c:63:6: warning: passing argument 2 of 'bpf_load_program' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
insns, insns_cnt, "GPL", 0,
^~~~~
In file included from /git/linux/samples/bpf/libbpf.h:5:0,
from /git/linux/samples/bpf/bpf_load.h:4,
from /git/linux/samples/bpf/fds_example.c:15:
/git/linux/tools/lib/bpf/bpf.h:31:5: note: expected 'struct bpf_insn *' but argument is of type 'const struct bpf_insn *'
int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
^~~~~~~~~~~~~~~~
HOSTCC samples/bpf/sockex2_user.o
<SNIP>
HOSTCC samples/bpf/xdp_tx_iptunnel_user.o
clang -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/6.2.1/include -I/git/linux/arch/x86/include -I./arch/x86/include/generated/uapi -I./arch/x86/include/generated -I/git/linux/include -I./include -I/git/linux/arch/x86/include/uapi -I/git/linux/include/uapi -I./include/generated/uapi -include /git/linux/include/linux/kconfig.h \
-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
-O2 -emit-llvm -c /git/linux/samples/bpf/sockex1_kern.c -o -| llc -march=bpf -filetype=obj -o samples/bpf/sockex1_kern.o
HOSTLD samples/bpf/tc_l2_redirect
<SNIP>
HOSTLD samples/bpf/lwt_len_hist
HOSTLD samples/bpf/xdp_tx_iptunnel
make[1]: Leaving directory '/tmp/build/linux'
[root@f5065a7d6272 linux]#
And then, in the host:
[root@jouet bpf]# mount | grep "docker.*devicemapper\/"
/dev/mapper/docker-253:0-1705076-9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 on /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 type xfs (rw,relatime,context="system_u:object_r:container_file_t:s0:c73,c276",nouuid,attr2,inode64,sunit=1024,swidth=1024,noquota)
[root@jouet bpf]# cd /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9/rootfs/tmp/build/linux/samples/bpf/
[root@jouet bpf]# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=f423d171e0487b2f802b6a792657f0f3c8f6d155, not stripped
[root@jouet bpf]# readelf -SW offwaketime
offwaketime offwaketime_kern.o offwaketime_user.o
[root@jouet bpf]# readelf -SW offwaketime_kern.o
There are 11 section headers, starting at offset 0x700:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .strtab STRTAB 0000000000000000 000658 0000a8 00 0 0 1
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 4] .relkprobe/try_to_wake_up REL 0000000000000000 0005a8 000020 10 10 3 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 6] .reltracepoint/sched/sched_switch REL 0000000000000000 0005c8 000090 10 10 5 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
[10] .symtab SYMTAB 0000000000000000 000488 000120 18 1 4 8
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
[root@jouet bpf]# ./offwaketime | head -3
qemu-system-x86;entry_SYSCALL_64_fastpath;sys_ppoll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel;start_cpu;;swapper/0 4
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 1
swapper/2;start_cpu;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 61
[root@jouet bpf]#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/joestringer/linux/commit/5c40f54a52b1f437123c81e21873f4b4b1f9bd55.patch
Link: http://lkml.kernel.org/n/tip-xr8twtx7sjh5821g8qw47yxk@git.kernel.org
[ Use -I$(srctree)/tools/lib/ to support out of source code tree builds, as noticed by Wang Nan ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:39 +08:00
|
|
|
fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allow the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
bpf_log_buf, BPF_LOG_BUF_SIZE);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (fd < 0) {
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allow the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
prog_fd[prog_cnt++] = fd;
|
|
|
|
|
2016-12-02 00:48:07 +08:00
|
|
|
if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
return 0;
|
|
|
|
|
2017-08-16 13:33:32 +08:00
|
|
|
if (is_socket || is_sockops || is_sk_skb) {
|
bpf: BPF support for sock_ops
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.
Alghough there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it oes not require
application changes and it can be updated easily at any time.
Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called
only once during the connections's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code. For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.
The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.
This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.
This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.
Two new corresponding structs (one for the kernel one for the user/BPF
program):
/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
};
/* user version
* Some fields are in network byte order reflecting the sock struct
* Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
* convert them to host byte order.
*/
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4; /* In network byte order */
__u32 local_ip4; /* In network byte order */
__u32 remote_ip6[4]; /* In network byte order */
__u32 local_ip6[4]; /* In network byte order */
__u32 remote_port; /* In network byte order */
__u32 local_port; /* In host byte horder */
};
Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.
The reply fields of the bpf_sockt_ops struct are there in case a bpf
program needs to return a value larger than an integer.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-01 11:02:40 +08:00
|
|
|
if (is_socket)
|
|
|
|
event += 6;
|
|
|
|
else
|
|
|
|
event += 7;
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (*event != '/')
|
|
|
|
return 0;
|
|
|
|
event++;
|
|
|
|
if (!isdigit(*event)) {
|
|
|
|
printf("invalid prog number\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return populate_prog_array(event, fd);
|
|
|
|
}
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
if (is_kprobe || is_kretprobe) {
|
|
|
|
if (is_kprobe)
|
|
|
|
event += 7;
|
|
|
|
else
|
|
|
|
event += 10;
|
|
|
|
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (*event == 0) {
|
|
|
|
printf("event name cannot be empty\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (isdigit(*event))
|
|
|
|
return populate_prog_array(event, fd);
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
snprintf(buf, sizeof(buf),
|
|
|
|
"echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
|
|
|
|
is_kprobe ? 'p' : 'r', event, event);
|
|
|
|
err = system(buf);
|
|
|
|
if (err < 0) {
|
|
|
|
printf("failed to create kprobe '%s' error '%s'\n",
|
|
|
|
event, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2016-04-07 09:43:29 +08:00
|
|
|
strcpy(buf, DEBUGFS);
|
|
|
|
strcat(buf, "events/kprobes/");
|
|
|
|
strcat(buf, event);
|
|
|
|
strcat(buf, "/id");
|
|
|
|
} else if (is_tracepoint) {
|
|
|
|
event += 11;
|
|
|
|
|
|
|
|
if (*event == 0) {
|
|
|
|
printf("event name cannot be empty\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
strcpy(buf, DEBUGFS);
|
|
|
|
strcat(buf, "events/");
|
|
|
|
strcat(buf, event);
|
|
|
|
strcat(buf, "/id");
|
|
|
|
}
|
2015-03-26 03:49:23 +08:00
|
|
|
|
|
|
|
efd = open(buf, O_RDONLY, 0);
|
|
|
|
if (efd < 0) {
|
|
|
|
printf("failed to open event %s\n", event);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = read(efd, buf, sizeof(buf));
|
|
|
|
if (err < 0 || err >= sizeof(buf)) {
|
|
|
|
printf("read from '%s' failed '%s'\n", event, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
close(efd);
|
|
|
|
|
|
|
|
buf[err] = 0;
|
|
|
|
id = atoi(buf);
|
|
|
|
attr.config = id;
|
|
|
|
|
2016-12-09 10:46:19 +08:00
|
|
|
efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
|
2015-03-26 03:49:23 +08:00
|
|
|
if (efd < 0) {
|
|
|
|
printf("event %d fd %d err %s\n", id, efd, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
event_fd[prog_cnt - 1] = efd;
|
|
|
|
ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
|
|
|
|
ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
static int load_maps(struct bpf_map_data *maps, int nr_maps,
|
|
|
|
fixup_map_cb fixup_map)
|
2014-12-02 07:06:37 +08:00
|
|
|
{
|
2017-08-19 02:28:01 +08:00
|
|
|
int i, numa_node;
|
2017-05-02 20:31:56 +08:00
|
|
|
|
2017-04-28 22:25:04 +08:00
|
|
|
for (i = 0; i < nr_maps; i++) {
|
2017-05-02 20:32:01 +08:00
|
|
|
if (fixup_map) {
|
|
|
|
fixup_map(&maps[i], i);
|
|
|
|
/* Allow userspace to assign map FD prior to creation */
|
|
|
|
if (maps[i].fd != -1) {
|
|
|
|
map_fd[i] = maps[i].fd;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2017-08-19 02:28:01 +08:00
|
|
|
numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
|
|
|
|
maps[i].def.numa_node : -1;
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
|
|
|
|
maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
|
|
|
|
int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
|
2017-03-23 01:00:35 +08:00
|
|
|
|
2017-08-19 02:28:01 +08:00
|
|
|
map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
|
2017-05-02 20:31:56 +08:00
|
|
|
maps[i].def.key_size,
|
|
|
|
inner_map_fd,
|
|
|
|
maps[i].def.max_entries,
|
2017-08-19 02:28:01 +08:00
|
|
|
maps[i].def.map_flags,
|
|
|
|
numa_node);
|
2017-03-23 01:00:35 +08:00
|
|
|
} else {
|
2017-08-19 02:28:01 +08:00
|
|
|
map_fd[i] = bpf_create_map_node(maps[i].def.type,
|
|
|
|
maps[i].def.key_size,
|
|
|
|
maps[i].def.value_size,
|
|
|
|
maps[i].def.max_entries,
|
|
|
|
maps[i].def.map_flags,
|
|
|
|
numa_node);
|
2017-03-23 01:00:35 +08:00
|
|
|
}
|
2016-03-08 13:57:18 +08:00
|
|
|
if (map_fd[i] < 0) {
|
|
|
|
printf("failed to create a map: %d %s\n",
|
|
|
|
errno, strerror(errno));
|
2014-12-02 07:06:37 +08:00
|
|
|
return 1;
|
2016-03-08 13:57:18 +08:00
|
|
|
}
|
2017-05-02 20:31:56 +08:00
|
|
|
maps[i].fd = map_fd[i];
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
prog_array_fd = map_fd[i];
|
2014-12-02 07:06:37 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
|
|
|
|
GElf_Shdr *shdr, Elf_Data **data)
|
|
|
|
{
|
|
|
|
Elf_Scn *scn;
|
|
|
|
|
|
|
|
scn = elf_getscn(elf, i);
|
|
|
|
if (!scn)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (gelf_getshdr(scn, shdr) != shdr)
|
|
|
|
return 2;
|
|
|
|
|
|
|
|
*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
|
|
|
|
if (!*shname || !shdr->sh_size)
|
|
|
|
return 3;
|
|
|
|
|
|
|
|
*data = elf_getdata(scn, 0);
|
|
|
|
if (!*data || elf_getdata(scn, *data) != NULL)
|
|
|
|
return 4;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
|
2017-05-02 20:31:56 +08:00
|
|
|
GElf_Shdr *shdr, struct bpf_insn *insn,
|
|
|
|
struct bpf_map_data *maps, int nr_maps)
|
2014-12-02 07:06:37 +08:00
|
|
|
{
|
|
|
|
int i, nrels;
|
|
|
|
|
|
|
|
nrels = shdr->sh_size / shdr->sh_entsize;
|
|
|
|
|
|
|
|
for (i = 0; i < nrels; i++) {
|
|
|
|
GElf_Sym sym;
|
|
|
|
GElf_Rel rel;
|
|
|
|
unsigned int insn_idx;
|
2017-05-02 20:31:56 +08:00
|
|
|
bool match = false;
|
|
|
|
int j, map_idx;
|
2014-12-02 07:06:37 +08:00
|
|
|
|
|
|
|
gelf_getrel(data, i, &rel);
|
|
|
|
|
|
|
|
insn_idx = rel.r_offset / sizeof(struct bpf_insn);
|
|
|
|
|
|
|
|
gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
|
|
|
|
|
|
|
|
if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
|
|
|
|
printf("invalid relo for insn[%d].code 0x%x\n",
|
|
|
|
insn_idx, insn[insn_idx].code);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
|
2017-05-02 20:31:56 +08:00
|
|
|
|
|
|
|
/* Match FD relocation against recorded map_data[] offset */
|
|
|
|
for (map_idx = 0; map_idx < nr_maps; map_idx++) {
|
|
|
|
if (maps[map_idx].elf_offset == sym.st_value) {
|
|
|
|
match = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (match) {
|
|
|
|
insn[insn_idx].imm = maps[map_idx].fd;
|
|
|
|
} else {
|
|
|
|
printf("invalid relo for insn[%d] no map_data match\n",
|
|
|
|
insn_idx);
|
|
|
|
return 1;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-04-15 01:30:28 +08:00
|
|
|
static int cmp_symbols(const void *l, const void *r)
|
|
|
|
{
|
|
|
|
const GElf_Sym *lsym = (const GElf_Sym *)l;
|
|
|
|
const GElf_Sym *rsym = (const GElf_Sym *)r;
|
|
|
|
|
|
|
|
if (lsym->st_value < rsym->st_value)
|
|
|
|
return -1;
|
|
|
|
else if (lsym->st_value > rsym->st_value)
|
|
|
|
return 1;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
|
|
|
|
Elf *elf, Elf_Data *symbols, int strtabidx)
|
2017-04-15 01:30:28 +08:00
|
|
|
{
|
2017-05-02 20:31:56 +08:00
|
|
|
int map_sz_elf, map_sz_copy;
|
|
|
|
bool validate_zero = false;
|
|
|
|
Elf_Data *data_maps;
|
|
|
|
int i, nr_maps;
|
|
|
|
GElf_Sym *sym;
|
|
|
|
Elf_Scn *scn;
|
|
|
|
int copy_sz;
|
|
|
|
|
|
|
|
if (maps_shndx < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
if (!symbols)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Get data for maps section via elf index */
|
|
|
|
scn = elf_getscn(elf, maps_shndx);
|
|
|
|
if (scn)
|
|
|
|
data_maps = elf_getdata(scn, NULL);
|
|
|
|
if (!scn || !data_maps) {
|
|
|
|
printf("Failed to get Elf_Data from maps section %d\n",
|
|
|
|
maps_shndx);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2017-04-15 01:30:28 +08:00
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
/* For each map get corrosponding symbol table entry */
|
|
|
|
sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
|
|
|
|
for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
|
|
|
|
assert(nr_maps < MAX_MAPS+1);
|
|
|
|
if (!gelf_getsym(symbols, i, &sym[nr_maps]))
|
2017-04-15 01:30:28 +08:00
|
|
|
continue;
|
2017-05-02 20:31:56 +08:00
|
|
|
if (sym[nr_maps].st_shndx != maps_shndx)
|
2017-04-15 01:30:28 +08:00
|
|
|
continue;
|
2017-05-02 20:31:56 +08:00
|
|
|
/* Only increment iif maps section */
|
2017-04-15 01:30:28 +08:00
|
|
|
nr_maps++;
|
|
|
|
}
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
/* Align to map_fd[] order, via sort on offset in sym.st_value */
|
|
|
|
qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
|
|
|
|
|
|
|
|
/* Keeping compatible with ELF maps section changes
|
|
|
|
* ------------------------------------------------
|
|
|
|
* The program size of struct bpf_map_def is known by loader
|
|
|
|
* code, but struct stored in ELF file can be different.
|
|
|
|
*
|
|
|
|
* Unfortunately sym[i].st_size is zero. To calculate the
|
|
|
|
* struct size stored in the ELF file, assume all struct have
|
|
|
|
* the same size, and simply divide with number of map
|
|
|
|
* symbols.
|
|
|
|
*/
|
|
|
|
map_sz_elf = data_maps->d_size / nr_maps;
|
|
|
|
map_sz_copy = sizeof(struct bpf_map_def);
|
|
|
|
if (map_sz_elf < map_sz_copy) {
|
|
|
|
/*
|
|
|
|
* Backward compat, loading older ELF file with
|
|
|
|
* smaller struct, keeping remaining bytes zero.
|
|
|
|
*/
|
|
|
|
map_sz_copy = map_sz_elf;
|
|
|
|
} else if (map_sz_elf > map_sz_copy) {
|
|
|
|
/*
|
|
|
|
* Forward compat, loading newer ELF file with larger
|
|
|
|
* struct with unknown features. Assume zero means
|
|
|
|
* feature not used. Thus, validate rest of struct
|
|
|
|
* data is zero.
|
|
|
|
*/
|
|
|
|
validate_zero = true;
|
|
|
|
}
|
2017-04-15 01:30:28 +08:00
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
/* Memcpy relevant part of ELF maps data to loader maps */
|
2017-04-15 01:30:28 +08:00
|
|
|
for (i = 0; i < nr_maps; i++) {
|
2017-05-02 20:31:56 +08:00
|
|
|
unsigned char *addr, *end;
|
|
|
|
struct bpf_map_def *def;
|
|
|
|
const char *map_name;
|
|
|
|
size_t offset;
|
|
|
|
|
|
|
|
map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
|
|
|
|
maps[i].name = strdup(map_name);
|
|
|
|
if (!maps[i].name) {
|
2017-04-15 01:30:28 +08:00
|
|
|
printf("strdup(%s): %s(%d)\n", map_name,
|
|
|
|
strerror(errno), errno);
|
2017-05-02 20:31:56 +08:00
|
|
|
free(sym);
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Symbol value is offset into ELF maps section data area */
|
|
|
|
offset = sym[i].st_value;
|
|
|
|
def = (struct bpf_map_def *)(data_maps->d_buf + offset);
|
|
|
|
maps[i].elf_offset = offset;
|
|
|
|
memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
|
|
|
|
memcpy(&maps[i].def, def, map_sz_copy);
|
|
|
|
|
|
|
|
/* Verify no newer features were requested */
|
|
|
|
if (validate_zero) {
|
|
|
|
addr = (unsigned char*) def + map_sz_copy;
|
|
|
|
end = (unsigned char*) def + map_sz_elf;
|
|
|
|
for (; addr < end; addr++) {
|
|
|
|
if (*addr != 0) {
|
|
|
|
free(sym);
|
|
|
|
return -EFBIG;
|
|
|
|
}
|
|
|
|
}
|
2017-04-15 01:30:28 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
free(sym);
|
2017-04-28 22:25:04 +08:00
|
|
|
return nr_maps;
|
2017-04-15 01:30:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
|
2014-12-02 07:06:37 +08:00
|
|
|
{
|
2017-04-15 01:30:28 +08:00
|
|
|
int fd, i, ret, maps_shndx = -1, strtabidx = -1;
|
2014-12-02 07:06:37 +08:00
|
|
|
Elf *elf;
|
|
|
|
GElf_Ehdr ehdr;
|
|
|
|
GElf_Shdr shdr, shdr_prog;
|
2017-04-15 01:30:28 +08:00
|
|
|
Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
|
2017-05-02 20:31:56 +08:00
|
|
|
char *shname, *shname_prog;
|
|
|
|
int nr_maps = 0;
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2017-02-09 04:27:43 +08:00
|
|
|
/* reset global variables */
|
|
|
|
kern_version = 0;
|
|
|
|
memset(license, 0, sizeof(license));
|
|
|
|
memset(processed_sec, 0, sizeof(processed_sec));
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
if (elf_version(EV_CURRENT) == EV_NONE)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
fd = open(path, O_RDONLY, 0);
|
|
|
|
if (fd < 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
elf = elf_begin(fd, ELF_C_READ, NULL);
|
|
|
|
|
|
|
|
if (!elf)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (gelf_getehdr(elf, &ehdr) != &ehdr)
|
|
|
|
return 1;
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
/* clear all kprobes */
|
|
|
|
i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
/* scan over all elf sections to get license and map info */
|
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (0) /* helpful for llvm debugging */
|
|
|
|
printf("section %d:%s data %p size %zd link %d flags %d\n",
|
|
|
|
i, shname, data->d_buf, data->d_size,
|
|
|
|
shdr.sh_link, (int) shdr.sh_flags);
|
|
|
|
|
|
|
|
if (strcmp(shname, "license") == 0) {
|
|
|
|
processed_sec[i] = true;
|
|
|
|
memcpy(license, data->d_buf, data->d_size);
|
2015-03-26 03:49:23 +08:00
|
|
|
} else if (strcmp(shname, "version") == 0) {
|
|
|
|
processed_sec[i] = true;
|
|
|
|
if (data->d_size != sizeof(int)) {
|
|
|
|
printf("invalid size of version section %zd\n",
|
|
|
|
data->d_size);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
memcpy(&kern_version, data->d_buf, sizeof(int));
|
2014-12-02 07:06:37 +08:00
|
|
|
} else if (strcmp(shname, "maps") == 0) {
|
2017-05-02 20:31:56 +08:00
|
|
|
int j;
|
|
|
|
|
2017-04-15 01:30:28 +08:00
|
|
|
maps_shndx = i;
|
|
|
|
data_maps = data;
|
2017-05-02 20:31:56 +08:00
|
|
|
for (j = 0; j < MAX_MAPS; j++)
|
|
|
|
map_data[j].fd = -1;
|
2014-12-02 07:06:37 +08:00
|
|
|
} else if (shdr.sh_type == SHT_SYMTAB) {
|
2017-04-15 01:30:28 +08:00
|
|
|
strtabidx = shdr.sh_link;
|
2014-12-02 07:06:37 +08:00
|
|
|
symbols = data;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-15 01:30:28 +08:00
|
|
|
ret = 1;
|
|
|
|
|
|
|
|
if (!symbols) {
|
|
|
|
printf("missing SHT_SYMTAB section\n");
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data_maps) {
|
2017-05-02 20:31:56 +08:00
|
|
|
nr_maps = load_elf_maps_section(map_data, maps_shndx,
|
|
|
|
elf, symbols, strtabidx);
|
|
|
|
if (nr_maps < 0) {
|
|
|
|
printf("Error: Failed loading ELF maps (errno:%d):%s\n",
|
|
|
|
nr_maps, strerror(-nr_maps));
|
2017-04-28 22:25:04 +08:00
|
|
|
ret = 1;
|
|
|
|
goto done;
|
|
|
|
}
|
2017-05-02 20:31:56 +08:00
|
|
|
if (load_maps(map_data, nr_maps, fixup_map))
|
2017-04-15 01:30:28 +08:00
|
|
|
goto done;
|
2017-05-02 20:31:56 +08:00
|
|
|
map_data_count = nr_maps;
|
2017-04-15 01:30:28 +08:00
|
|
|
|
|
|
|
processed_sec[maps_shndx] = true;
|
|
|
|
}
|
|
|
|
|
2017-05-30 20:37:51 +08:00
|
|
|
/* process all relo sections, and rewrite bpf insns for maps */
|
2014-12-02 07:06:37 +08:00
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
2017-02-09 04:27:42 +08:00
|
|
|
if (processed_sec[i])
|
|
|
|
continue;
|
2014-12-02 07:06:37 +08:00
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
2017-05-30 20:37:51 +08:00
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
if (shdr.sh_type == SHT_REL) {
|
|
|
|
struct bpf_insn *insns;
|
|
|
|
|
2017-05-30 20:37:51 +08:00
|
|
|
/* locate prog sec that need map fixup (relocations) */
|
2014-12-02 07:06:37 +08:00
|
|
|
if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
|
|
|
|
&shdr_prog, &data_prog))
|
|
|
|
continue;
|
|
|
|
|
2016-11-23 08:52:09 +08:00
|
|
|
if (shdr_prog.sh_type != SHT_PROGBITS ||
|
|
|
|
!(shdr_prog.sh_flags & SHF_EXECINSTR))
|
|
|
|
continue;
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
insns = (struct bpf_insn *) data_prog->d_buf;
|
2017-05-30 20:37:51 +08:00
|
|
|
processed_sec[i] = true; /* relo section */
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2017-05-02 20:31:56 +08:00
|
|
|
if (parse_relo_and_apply(data, symbols, &shdr, insns,
|
|
|
|
map_data, nr_maps))
|
2014-12-02 07:06:37 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-30 20:37:51 +08:00
|
|
|
/* load programs */
|
2014-12-02 07:06:37 +08:00
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
|
|
|
|
|
|
|
if (processed_sec[i])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
if (memcmp(shname, "kprobe/", 7) == 0 ||
|
|
|
|
memcmp(shname, "kretprobe/", 10) == 0 ||
|
2016-04-07 09:43:29 +08:00
|
|
|
memcmp(shname, "tracepoint/", 11) == 0 ||
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
memcmp(shname, "xdp", 3) == 0 ||
|
2016-09-02 09:37:25 +08:00
|
|
|
memcmp(shname, "perf_event", 10) == 0 ||
|
2016-12-02 00:48:07 +08:00
|
|
|
memcmp(shname, "socket", 6) == 0 ||
|
bpf: BPF support for sock_ops
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.
Alghough there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it oes not require
application changes and it can be updated easily at any time.
Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called
only once during the connections's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code. For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.
The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.
This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.
This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.
Two new corresponding structs (one for the kernel one for the user/BPF
program):
/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
};
/* user version
* Some fields are in network byte order reflecting the sock struct
* Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
* convert them to host byte order.
*/
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4; /* In network byte order */
__u32 local_ip4; /* In network byte order */
__u32 remote_ip6[4]; /* In network byte order */
__u32 local_ip6[4]; /* In network byte order */
__u32 remote_port; /* In network byte order */
__u32 local_port; /* In host byte horder */
};
Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.
The reply fields of the bpf_sockt_ops struct are there in case a bpf
program needs to return a value larger than an integer.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-01 11:02:40 +08:00
|
|
|
memcmp(shname, "cgroup/", 7) == 0 ||
|
2017-08-16 13:33:32 +08:00
|
|
|
memcmp(shname, "sockops", 7) == 0 ||
|
|
|
|
memcmp(shname, "sk_skb", 6) == 0) {
|
2017-07-05 06:57:50 +08:00
|
|
|
ret = load_and_attach(shname, data->d_buf,
|
|
|
|
data->d_size);
|
|
|
|
if (ret != 0)
|
|
|
|
goto done;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
}
|
|
|
|
|
2017-04-15 01:30:28 +08:00
|
|
|
ret = 0;
|
|
|
|
done:
|
2014-12-02 07:06:37 +08:00
|
|
|
close(fd);
|
2017-04-15 01:30:28 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int load_bpf_file(char *path)
|
|
|
|
{
|
|
|
|
return do_load_bpf_file(path, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
|
|
|
|
{
|
|
|
|
return do_load_bpf_file(path, fixup_map);
|
2014-12-02 07:06:37 +08:00
|
|
|
}
|
2015-03-26 03:49:23 +08:00
|
|
|
|
|
|
|
void read_trace_pipe(void)
|
|
|
|
{
|
|
|
|
int trace_fd;
|
|
|
|
|
|
|
|
trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
|
|
|
|
if (trace_fd < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
static char buf[4096];
|
|
|
|
ssize_t sz;
|
|
|
|
|
|
|
|
sz = read(trace_fd, buf, sizeof(buf));
|
|
|
|
if (sz > 0) {
|
|
|
|
buf[sz] = 0;
|
|
|
|
puts(buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-03-08 13:57:19 +08:00
|
|
|
|
|
|
|
#define MAX_SYMS 300000
|
|
|
|
static struct ksym syms[MAX_SYMS];
|
|
|
|
static int sym_cnt;
|
|
|
|
|
|
|
|
static int ksym_cmp(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
int load_kallsyms(void)
|
|
|
|
{
|
|
|
|
FILE *f = fopen("/proc/kallsyms", "r");
|
|
|
|
char func[256], buf[256];
|
|
|
|
char symbol;
|
|
|
|
void *addr;
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
if (!f)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
while (!feof(f)) {
|
|
|
|
if (!fgets(buf, sizeof(buf), f))
|
|
|
|
break;
|
|
|
|
if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
|
|
|
|
break;
|
|
|
|
if (!addr)
|
|
|
|
continue;
|
|
|
|
syms[i].addr = (long) addr;
|
|
|
|
syms[i].name = strdup(func);
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
sym_cnt = i;
|
|
|
|
qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ksym *ksym_search(long key)
|
|
|
|
{
|
|
|
|
int start = 0, end = sym_cnt;
|
|
|
|
int result;
|
|
|
|
|
|
|
|
while (start < end) {
|
|
|
|
size_t mid = start + (end - start) / 2;
|
|
|
|
|
|
|
|
result = key - syms[mid].addr;
|
|
|
|
if (result < 0)
|
|
|
|
end = mid;
|
|
|
|
else if (result > 0)
|
|
|
|
start = mid + 1;
|
|
|
|
else
|
|
|
|
return &syms[mid];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start >= 1 && syms[start - 1].addr < key &&
|
|
|
|
key < syms[start].addr)
|
|
|
|
/* valid ksym */
|
|
|
|
return &syms[start - 1];
|
|
|
|
|
|
|
|
/* out of range. return _stext */
|
|
|
|
return &syms[0];
|
|
|
|
}
|
2016-12-08 07:53:14 +08:00
|
|
|
|
2017-05-01 17:26:15 +08:00
|
|
|
int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
|
2016-12-08 07:53:14 +08:00
|
|
|
{
|
|
|
|
struct sockaddr_nl sa;
|
|
|
|
int sock, seq = 0, len, ret = -1;
|
|
|
|
char buf[4096];
|
|
|
|
struct nlattr *nla, *nla_xdp;
|
|
|
|
struct {
|
|
|
|
struct nlmsghdr nh;
|
|
|
|
struct ifinfomsg ifinfo;
|
|
|
|
char attrbuf[64];
|
|
|
|
} req;
|
|
|
|
struct nlmsghdr *nh;
|
|
|
|
struct nlmsgerr *err;
|
|
|
|
|
|
|
|
memset(&sa, 0, sizeof(sa));
|
|
|
|
sa.nl_family = AF_NETLINK;
|
|
|
|
|
|
|
|
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
|
|
|
|
if (sock < 0) {
|
|
|
|
printf("open netlink socket: %s\n", strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
|
|
|
|
printf("bind to netlink: %s\n", strerror(errno));
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
memset(&req, 0, sizeof(req));
|
|
|
|
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
|
|
|
|
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
|
|
|
|
req.nh.nlmsg_type = RTM_SETLINK;
|
|
|
|
req.nh.nlmsg_pid = 0;
|
|
|
|
req.nh.nlmsg_seq = ++seq;
|
|
|
|
req.ifinfo.ifi_family = AF_UNSPEC;
|
|
|
|
req.ifinfo.ifi_index = ifindex;
|
2017-04-28 00:11:13 +08:00
|
|
|
|
|
|
|
/* started nested attribute for XDP */
|
2016-12-08 07:53:14 +08:00
|
|
|
nla = (struct nlattr *)(((char *)&req)
|
|
|
|
+ NLMSG_ALIGN(req.nh.nlmsg_len));
|
|
|
|
nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
|
2017-04-28 00:11:13 +08:00
|
|
|
nla->nla_len = NLA_HDRLEN;
|
2016-12-08 07:53:14 +08:00
|
|
|
|
2017-04-28 00:11:13 +08:00
|
|
|
/* add XDP fd */
|
|
|
|
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
|
2016-12-08 07:53:14 +08:00
|
|
|
nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
|
|
|
|
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
|
|
|
|
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
|
2017-04-28 00:11:13 +08:00
|
|
|
nla->nla_len += nla_xdp->nla_len;
|
|
|
|
|
|
|
|
/* if user passed in any flags, add those too */
|
|
|
|
if (flags) {
|
|
|
|
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
|
|
|
|
nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
|
|
|
|
nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
|
|
|
|
memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
|
|
|
|
nla->nla_len += nla_xdp->nla_len;
|
|
|
|
}
|
2016-12-08 07:53:14 +08:00
|
|
|
|
|
|
|
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
|
|
|
|
|
|
|
|
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
|
|
|
|
printf("send to netlink: %s\n", strerror(errno));
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
len = recv(sock, buf, sizeof(buf), 0);
|
|
|
|
if (len < 0) {
|
|
|
|
printf("recv from netlink: %s\n", strerror(errno));
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
|
|
|
|
nh = NLMSG_NEXT(nh, len)) {
|
|
|
|
if (nh->nlmsg_pid != getpid()) {
|
|
|
|
printf("Wrong pid %d, expected %d\n",
|
|
|
|
nh->nlmsg_pid, getpid());
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
if (nh->nlmsg_seq != seq) {
|
|
|
|
printf("Wrong seq %d, expected %d\n",
|
|
|
|
nh->nlmsg_seq, seq);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
switch (nh->nlmsg_type) {
|
|
|
|
case NLMSG_ERROR:
|
|
|
|
err = (struct nlmsgerr *)NLMSG_DATA(nh);
|
|
|
|
if (!err->error)
|
|
|
|
continue;
|
|
|
|
printf("nlmsg error %s\n", strerror(-err->error));
|
|
|
|
goto cleanup;
|
|
|
|
case NLMSG_DONE:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
close(sock);
|
|
|
|
return ret;
|
|
|
|
}
|