From 95ff141e52f84f476fcde50560f42d4f118539c0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Mar 2017 18:26:44 -0700 Subject: [PATCH] samples/bpf: add map_lookup microbenchmark $ map_perf_test 128 speed of HASH bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 46M 58M after 42M 74M perf report before: 54.23% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 14.24% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 8.84% map_perf_test [kernel.kallsyms] [k] htab_map_lookup_elem 5.93% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 2.30% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.49% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 60.03% map_perf_test [kernel.kallsyms] [k] __htab_map_lookup_elem 18.07% map_perf_test [kernel.kallsyms] [k] lookup_elem_raw 2.91% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 1.94% map_perf_test [kernel.kallsyms] [k] _einittext 1.90% map_perf_test [kernel.kallsyms] [k] __audit_syscall_exit 1.72% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler Notice that bpf_map_lookup_elem() and htab_map_lookup_elem() are trivial functions, yet they take a sizeable amount of cpu time. htab_map_gen_lookup() removes bpf_map_lookup_elem() and converts htab_map_lookup_elem() into three BPF insns, which causes the cpu time for bpf_prog_da4fc6a3f41761a2() to increase slightly. 
$ map_perf_test 256 speed of ARRAY bpf_map_lookup_elem() in lookups per second w/o JIT w/JIT before 97M 174M after 64M 280M before: 37.33% map_perf_test [kernel.kallsyms] [k] array_map_lookup_elem 13.95% map_perf_test [kernel.kallsyms] [k] bpf_map_lookup_elem 6.54% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 4.57% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler after: 32.86% map_perf_test [kernel.kallsyms] [k] bpf_prog_da4fc6a3f41761a2 6.54% map_perf_test [kernel.kallsyms] [k] kprobe_ftrace_handler array_map_gen_lookup() removes calls to array_map_lookup_elem() and bpf_map_lookup_elem() and replaces them with 7 bpf insns. The performance without JIT is slower, since executing extra insns in the interpreter is slower than running native C code, but with JIT the performance gains are obvious, since native C->x86 code is replaced with fewer bpf->x86 instructions. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/map_perf_test_kern.c | 33 ++++++++++++++++++++++++++++++++ samples/bpf/map_perf_test_user.c | 32 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index a91872a97742..9da2a3441b0a 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -65,6 +65,13 @@ struct bpf_map_def SEC("maps") lpm_trie_map_alloc = { .map_flags = BPF_F_NO_PREALLOC, }; +struct bpf_map_def SEC("maps") array_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = MAX_ENTRIES, +}; + SEC("kprobe/sys_getuid") int stress_hmap(struct pt_regs *ctx) { @@ -165,5 +172,31 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx) return 0; } +SEC("kprobe/sys_getpgid") +int stress_hash_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = 
bpf_map_lookup_elem(&hash_map, &key); + + return 0; +} + +SEC("kprobe/sys_getpgrp") +int stress_array_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&array_map, &key); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 680260a91f50..e29ff318a793 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -38,6 +38,8 @@ static __u64 time_get_ns(void) #define LRU_HASH_PREALLOC (1 << 4) #define PERCPU_LRU_HASH_PREALLOC (1 << 5) #define LPM_KMALLOC (1 << 6) +#define HASH_LOOKUP (1 << 7) +#define ARRAY_LOOKUP (1 << 8) static int test_flags = ~0; @@ -125,6 +127,30 @@ static void test_lpm_kmalloc(int cpu) cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); } +static void test_hash_lookup(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + syscall(__NR_getpgid, 0); + printf("%d:hash_lookup %lld lookups per sec\n", + cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time)); +} + +static void test_array_lookup(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + syscall(__NR_getpgrp, 0); + printf("%d:array_lookup %lld lookups per sec\n", + cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time)); +} + static void loop(int cpu) { cpu_set_t cpuset; @@ -153,6 +179,12 @@ static void loop(int cpu) if (test_flags & LPM_KMALLOC) test_lpm_kmalloc(cpu); + + if (test_flags & HASH_LOOKUP) + test_hash_lookup(cpu); + + if (test_flags & ARRAY_LOOKUP) + test_array_lookup(cpu); } static void run_perf_test(int tasks)