linux-sg2042/samples/bpf/map_perf_test_user.c

289 lines
6.2 KiB
C
Raw Normal View History

/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <asm/unistd.h>
#include <unistd.h>
#include <assert.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <signal.h>
#include <linux/bpf.h>
#include <string.h>
#include <time.h>
samples/bpf: Fix build breakage with map_perf_test_user.c Building BPF samples is failing with the below error: samples/bpf/map_perf_test_user.c: In function ‘main’: samples/bpf/map_perf_test_user.c:134:9: error: variable ‘r’ has initializer but incomplete type struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; ^ samples/bpf/map_perf_test_user.c:134:21: error: ‘RLIM_INFINITY’ undeclared (first use in this function) struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; ^ samples/bpf/map_perf_test_user.c:134:21: note: each undeclared identifier is reported only once for each function it appears in samples/bpf/map_perf_test_user.c:134:9: warning: excess elements in struct initializer [enabled by default] struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; ^ samples/bpf/map_perf_test_user.c:134:9: warning: (near initialization for ‘r’) [enabled by default] samples/bpf/map_perf_test_user.c:134:9: warning: excess elements in struct initializer [enabled by default] samples/bpf/map_perf_test_user.c:134:9: warning: (near initialization for ‘r’) [enabled by default] samples/bpf/map_perf_test_user.c:134:16: error: storage size of ‘r’ isn’t known struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; ^ samples/bpf/map_perf_test_user.c:139:2: warning: implicit declaration of function ‘setrlimit’ [-Wimplicit-function-declaration] setrlimit(RLIMIT_MEMLOCK, &r); ^ samples/bpf/map_perf_test_user.c:139:12: error: ‘RLIMIT_MEMLOCK’ undeclared (first use in this function) setrlimit(RLIMIT_MEMLOCK, &r); ^ samples/bpf/map_perf_test_user.c:134:16: warning: unused variable ‘r’ [-Wunused-variable] struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; ^ make[2]: *** [samples/bpf/map_perf_test_user.o] Error 1 Fix this by including the necessary header file. Cc: Alexei Starovoitov <ast@fb.com> Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: David S. Miller <davem@davemloft.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 01:01:32 +08:00
#include <sys/resource.h>
#include <arpa/inet.h>
#include <errno.h>
#include "libbpf.h"
#include "bpf_load.h"
#define MAX_CNT 1000000
static __u64 time_get_ns(void)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec * 1000000000ull + ts.tv_nsec;
}
#define HASH_PREALLOC (1 << 0)
#define PERCPU_HASH_PREALLOC (1 << 1)
#define HASH_KMALLOC (1 << 2)
#define PERCPU_HASH_KMALLOC (1 << 3)
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
#define LRU_HASH_PREALLOC (1 << 4)
#define NOCOMMON_LRU_HASH_PREALLOC (1 << 5)
#define LPM_KMALLOC (1 << 6)
samples/bpf: add map_lookup microbenchmark $ map_perf_test 128 speed of HASH bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 46M 58M after 42M 74M perf report before: 54.23% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 14.24% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 8.84% map_perf_test [kernel.kallsyms] [k] htab_map_lookup_elem 5.93% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 2.30% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.49% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 60.03% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 18.07% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 2.91% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.94% map_perf_test [kernel.kallsyms] [k] _einittext 1.90% map_perf_test [kernel.kallsyms] [k] __audit_syscall_exit 1.72% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler Notice that bpf_map_lookup_elem() and htab_map_lookup_elem() are trivial functions, yet they take sizeable amount of cpu time. htab_map_gen_lookup() removes bpf_map_lookup_elem() and converts htab_map_lookup_elem() into three BPF insns which causing cpu time for bpf_prog_da4fc6a3f41761a2() slightly increase. $ map_perf_test 256 speed of ARRAY bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 97M 174M after 64M 280M before: 37.33% map_perf_test [kernel.kallsyms] [k] array_map_lookup_elem 13.95% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 6.54% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 4.57% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 32.86% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 6.54% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler array_map_gen_lookup() removes calls to array_map_lookup_elem() and bpf_map_lookup_elem() and replaces them with 7 bpf insns. The performance without JIT is slower, since executing extra insns in the interpreter is slower than running native C code, but with JIT the performance gains are obvious, since native C->x86 code is replaced with fewer bpf->x86 instructions. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-16 09:26:44 +08:00
#define HASH_LOOKUP (1 << 7)
#define ARRAY_LOOKUP (1 << 8)
static int test_flags = ~0;
static void test_hash_prealloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getuid);
printf("%d:hash_map_perf pre-alloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void do_test_lru(int lru_test_flag, int cpu)
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
{
struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
const char *test_name;
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
__u64 start_time;
int i, ret;
in6.sin6_addr.s6_addr16[0] = 0xdead;
in6.sin6_addr.s6_addr16[1] = 0xbeef;
if (lru_test_flag & LRU_HASH_PREALLOC) {
test_name = "lru_hash_map_perf";
in6.sin6_addr.s6_addr16[7] = 0;
} else if (lru_test_flag & NOCOMMON_LRU_HASH_PREALLOC) {
test_name = "nocommon_lru_hash_map_perf";
in6.sin6_addr.s6_addr16[7] = 1;
} else {
assert(0);
}
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++) {
ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6));
assert(ret == -1 && errno == EBADF);
}
printf("%d:%s pre-alloc %lld events per sec\n",
cpu, test_name,
MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
}
static void test_lru_hash_prealloc(int cpu)
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
{
do_test_lru(LRU_HASH_PREALLOC, cpu);
}
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
static void test_nocommon_lru_hash_prealloc(int cpu)
{
do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu);
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
}
static void test_percpu_hash_prealloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_geteuid);
printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_hash_kmalloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getgid);
printf("%d:hash_map_perf kmalloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_percpu_hash_kmalloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getegid);
printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_lpm_kmalloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_gettid);
printf("%d:lpm_perf kmalloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
samples/bpf: add map_lookup microbenchmark $ map_perf_test 128 speed of HASH bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 46M 58M after 42M 74M perf report before: 54.23% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 14.24% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 8.84% map_perf_test [kernel.kallsyms] [k] htab_map_lookup_elem 5.93% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 2.30% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.49% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 60.03% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 18.07% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 2.91% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.94% map_perf_test [kernel.kallsyms] [k] _einittext 1.90% map_perf_test [kernel.kallsyms] [k] __audit_syscall_exit 1.72% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler Notice that bpf_map_lookup_elem() and htab_map_lookup_elem() are trivial functions, yet they take sizeable amount of cpu time. htab_map_gen_lookup() removes bpf_map_lookup_elem() and converts htab_map_lookup_elem() into three BPF insns which causing cpu time for bpf_prog_da4fc6a3f41761a2() slightly increase. $ map_perf_test 256 speed of ARRAY bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 97M 174M after 64M 280M before: 37.33% map_perf_test [kernel.kallsyms] [k] array_map_lookup_elem 13.95% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 6.54% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 4.57% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 32.86% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 6.54% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler array_map_gen_lookup() removes calls to array_map_lookup_elem() and bpf_map_lookup_elem() and replaces them with 7 bpf insns. The performance without JIT is slower, since executing extra insns in the interpreter is slower than running native C code, but with JIT the performance gains are obvious, since native C->x86 code is replaced with fewer bpf->x86 instructions. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-16 09:26:44 +08:00
static void test_hash_lookup(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getpgid, 0);
printf("%d:hash_lookup %lld lookups per sec\n",
cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
}
static void test_array_lookup(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getpgrp, 0);
printf("%d:array_lookup %lld lookups per sec\n",
cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
}
static void loop(int cpu)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpu, &cpuset);
sched_setaffinity(0, sizeof(cpuset), &cpuset);
if (test_flags & HASH_PREALLOC)
test_hash_prealloc(cpu);
if (test_flags & PERCPU_HASH_PREALLOC)
test_percpu_hash_prealloc(cpu);
if (test_flags & HASH_KMALLOC)
test_hash_kmalloc(cpu);
if (test_flags & PERCPU_HASH_KMALLOC)
test_percpu_hash_kmalloc(cpu);
bpf: Add tests for the LRU bpf_htab This patch has some unit tests and a test_lru_dist. The test_lru_dist reads in the numeric keys from a file. The files used here are generated by a modified fio-genzipf tool originated from the fio test suit. The sample data file can be found here: https://github.com/iamkafai/bpf-lru The zipf.* data files have 100k numeric keys and the key is also ranged from 1 to 100k. The test_lru_dist outputs the number of unique keys (nr_unique). F.e. The following means, 61239 of them is unique out of 100k keys. nr_misses means it cannot be found in the LRU map, so nr_misses must be >= nr_unique. test_lru_dist also simulates a perfect LRU map as a comparison: [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a1_01.out 4000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31603(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) .... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:23093(/100000) nr_misses:31710(/100000) task:0 Perfect LRU: nr_unique:23093(/100000 nr_misses:34328(/100000) [root@arch-fb-vm1 ~]# ~/devshare/fb-kernel/linux/samples/bpf/test_lru_dist \ /root/zipf.100k.a0_01.out 40000 1 ... test_parallel_lru_dist (map_type:9 map_flags:0x0): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67054(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) ... test_parallel_lru_dist (map_type:9 map_flags:0x2): task:0 BPF LRU: nr_unique:61239(/100000) nr_misses:67068(/100000) task:0 Perfect LRU: nr_unique:61239(/100000 nr_misses:66993(/100000) LRU map has also been added to map_perf_test: /* Global LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates /* Percpu LRU */ [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-12 02:55:11 +08:00
if (test_flags & LRU_HASH_PREALLOC)
test_lru_hash_prealloc(cpu);
if (test_flags & NOCOMMON_LRU_HASH_PREALLOC)
test_nocommon_lru_hash_prealloc(cpu);
if (test_flags & LPM_KMALLOC)
test_lpm_kmalloc(cpu);
samples/bpf: add map_lookup microbenchmark $ map_perf_test 128 speed of HASH bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 46M 58M after 42M 74M perf report before: 54.23% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 14.24% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 8.84% map_perf_test [kernel.kallsyms] [k] htab_map_lookup_elem 5.93% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 2.30% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.49% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 60.03% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 18.07% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 2.91% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.94% map_perf_test [kernel.kallsyms] [k] _einittext 1.90% map_perf_test [kernel.kallsyms] [k] __audit_syscall_exit 1.72% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler Notice that bpf_map_lookup_elem() and htab_map_lookup_elem() are trivial functions, yet they take sizeable amount of cpu time. htab_map_gen_lookup() removes bpf_map_lookup_elem() and converts htab_map_lookup_elem() into three BPF insns which causing cpu time for bpf_prog_da4fc6a3f41761a2() slightly increase. $ map_perf_test 256 speed of ARRAY bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 97M 174M after 64M 280M before: 37.33% map_perf_test [kernel.kallsyms] [k] array_map_lookup_elem 13.95% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 6.54% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 4.57% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 32.86% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 6.54% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler array_map_gen_lookup() removes calls to array_map_lookup_elem() and bpf_map_lookup_elem() and replaces them with 7 bpf insns. The performance without JIT is slower, since executing extra insns in the interpreter is slower than running native C code, but with JIT the performance gains are obvious, since native C->x86 code is replaced with fewer bpf->x86 instructions. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-16 09:26:44 +08:00
if (test_flags & HASH_LOOKUP)
test_hash_lookup(cpu);
if (test_flags & ARRAY_LOOKUP)
test_array_lookup(cpu);
}
static void run_perf_test(int tasks)
{
pid_t pid[tasks];
int i;
for (i = 0; i < tasks; i++) {
pid[i] = fork();
if (pid[i] == 0) {
loop(i);
exit(0);
} else if (pid[i] == -1) {
printf("couldn't spawn #%d process\n", i);
exit(1);
}
}
for (i = 0; i < tasks; i++) {
int status;
assert(waitpid(pid[i], &status, 0) == pid[i]);
assert(status == 0);
}
}
static void fill_lpm_trie(void)
{
struct bpf_lpm_trie_key *key;
unsigned long value = 0;
unsigned int i;
int r;
key = alloca(sizeof(*key) + 4);
key->prefixlen = 32;
for (i = 0; i < 512; ++i) {
key->prefixlen = rand() % 33;
key->data[0] = rand() & 0xff;
key->data[1] = rand() & 0xff;
key->data[2] = rand() & 0xff;
key->data[3] = rand() & 0xff;
r = bpf_map_update_elem(map_fd[6], key, &value, 0);
assert(!r);
}
key->prefixlen = 32;
key->data[0] = 192;
key->data[1] = 168;
key->data[2] = 0;
key->data[3] = 1;
value = 128;
r = bpf_map_update_elem(map_fd[6], key, &value, 0);
assert(!r);
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
int num_cpu = 8;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
if (argc > 1)
test_flags = atoi(argv[1]) ? : test_flags;
if (argc > 2)
num_cpu = atoi(argv[2]) ? : num_cpu;
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
fill_lpm_trie();
run_perf_test(num_cpu);
return 0;
}