From 6dd1ec6c7a2c304e9e2e2edd9e7ccb8e8791d36a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Jan 2018 15:06:07 -0800 Subject: [PATCH 1/2] bpf: fix kernel page fault in lpm map trie_get_next_key Commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") introduces a bug likes below: if (!rcu_dereference(trie->root)) return -ENOENT; if (!key || key->prefixlen > trie->max_prefixlen) { root = &trie->root; goto find_leftmost; } ...... find_leftmost: for (node = rcu_dereference(*root); node;) { In the code after label find_leftmost, it is assumed that *root should not be NULL, but it is not true as it is possbile trie->root is changed to NULL by an asynchronous delete operation. The issue is reported by syzbot and Eric Dumazet with the below error log: ...... kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 ...... This patch fixed the issue by use local rcu_dereferenced pointer instead of *(&trie->root) later on. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command or LPM_TRIE map") Reported-by: syzbot Reported-by: Eric Dumazet Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8f083ea9d4b9..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -593,11 +593,10 @@ unlock: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -614,14 +613,13 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); @@ -629,7 +627,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. @@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -673,7 +669,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. */ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); From af32efeede9e188fefe0af51d117c31cf281de65 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Jan 2018 15:06:08 -0800 Subject: [PATCH 2/2] tools/bpf: add a multithreaded stress test in bpf selftests test_lpm_map The new test will spawn four threads, doing map update, delete, lookup and get_next_key in parallel. It is able to reproduce the issue in the previous commit found by syzbot and Eric Dumazet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/test_lpm_map.c | 95 ++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 98688352208b..bf05bc5e36e5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -11,7 +11,7 @@ ifneq ($(wildcard $(GENHDR)),) endif CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include -LDLIBS += -lcap -lelf -lrt +LDLIBS += -lcap -lelf -lrt -lpthread TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 081510853c6d..2be87e9ee28d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -641,6 +642,98 @@ static void test_lpm_get_next_key(void) close(map_fd); } +#define MAX_TEST_KEYS 4 +struct lpm_mt_test_info { + int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */ + int iter; + int map_fd; + struct { + __u32 prefixlen; + __u32 data; + } key[MAX_TEST_KEYS]; +}; + +static void *lpm_test_command(void *arg) +{ + int i, j, ret, iter, key_size; + struct lpm_mt_test_info *info = arg; + struct bpf_lpm_trie_key *key_p; + + key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); + key_p = alloca(key_size); + for (iter = 0; iter < info->iter; iter++) + for (i = 0; i < MAX_TEST_KEYS; i++) { + /* first half of iterations in forward order, + * and second half in backward order. + */ + j = (iter < (info->iter / 2)) ? i : MAX_TEST_KEYS - i - 1; + key_p->prefixlen = info->key[j].prefixlen; + memcpy(key_p->data, &info->key[j].data, sizeof(__u32)); + if (info->cmd == 0) { + __u32 value = j; + /* update must succeed */ + assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0); + } else if (info->cmd == 1) { + ret = bpf_map_delete_elem(info->map_fd, key_p); + assert(ret == 0 || errno == ENOENT); + } else if (info->cmd == 2) { + __u32 value; + ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); + assert(ret == 0 || errno == ENOENT); + } else { + struct bpf_lpm_trie_key *next_key_p = alloca(key_size); + ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); + assert(ret == 0 || errno == ENOENT || errno == ENOMEM); + } + } + + // Pass successful exit info back to the main thread + pthread_exit((void *)info); +} + +static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd) +{ + info->iter = 2000; + info->map_fd = map_fd; + info->key[0].prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", &info->key[0].data); + info->key[1].prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", &info->key[1].data); + info->key[2].prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", &info->key[2].data); + info->key[3].prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", &info->key[3].data); +} + +static void test_lpm_multi_thread(void) +{ + struct lpm_mt_test_info info[4]; + size_t key_size, value_size; + pthread_t thread_id[4]; + int i, map_fd; + void *ret; + + /* create a trie */ + value_size = sizeof(__u32); + key_size = sizeof(struct bpf_lpm_trie_key) + value_size; + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size, + 100, BPF_F_NO_PREALLOC); + + /* create 4 threads to test update, delete, lookup and get_next_key */ + setup_lpm_mt_test_info(&info[0], map_fd); + for (i = 0; i < 4; i++) { + if (i != 0) + memcpy(&info[i], &info[0], sizeof(info[i])); + info[i].cmd = i; + assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0); + } + + for (i = 0; i < 4; i++) + assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -667,6 +760,8 @@ int main(void) test_lpm_get_next_key(); + test_lpm_multi_thread(); + printf("test_lpm: OK\n"); return 0; }