From 85782e037f8aba8922dadb24a1523ca0b82ab8bc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 28 Jun 2018 23:34:59 +0200
Subject: [PATCH] bpf: undo prog rejection on read-only lock failure

Partially undo commit 9facc336876f ("bpf: reject any prog that failed
read-only lock") since it caused a regression, that is, syzkaller was
able to manage to cause a panic via fault injection deep in set_memory_ro()
path by letting an allocation fail: In x86's __change_page_attr_set_clr()
it was able to change the attributes of the primary mapping but not in
the alias mapping via cpa_process_alias(), so the second, inner call
to the __change_page_attr() via __change_page_attr_set_clr() had to split
a larger page and failed in the alloc_pages() with the artifically triggered
allocation error which is then propagated down to the call site.

Thus, for set_memory_ro() this means that it returned with an error, but
from debugging a probe_kernel_write() revealed EFAULT on that memory since
the primary mapping succeeded to get changed. Therefore the subsequent
hdr->locked = 0 reset triggered the panic as it was performed on read-only
memory, so call-site assumptions were infact wrong to assume that it would
either succeed /or/ not succeed at all since there's no such rollback in
set_memory_*() calls from partial change of mappings, in other words, we're
left in a state that is "half done". A later undo via set_memory_rw() is
succeeding though due to matching permissions on that part (aka due to the
try_preserve_large_page() succeeding). While reproducing locally with
explicitly triggering this error, the initial splitting only happens on
rare occasions and in real world it would additionally need oom conditions,
but that said, it could partially fail. Therefore, it is definitely wrong
to bail out on set_memory_ro() error and reject the program with the
set_memory_*() semantics we have today. Shouldn't have gone the extra mile
since no other user in tree today infact checks for any set_memory_*()
errors, e.g. neither module_enable_ro() / module_disable_ro() for module
RO/NX handling which is mostly default these days nor kprobes core with
alloc_insn_page() / free_insn_page() as examples that could be invoked long
after bootup and original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit
against spraying attacks") did neither when it got first introduced to BPF
so "improving" with bailing out was clearly not right when set_memory_*()
cannot handle it today.

Kees suggested that if set_memory_*() can fail, we should annotate it with
__must_check, and all callers need to deal with it gracefully given those
set_memory_*() markings aren't "advisory", but they're expected to actually
do what they say. This might be an option worth to move forward in future
but would at the same time require that set_memory_*() calls from supporting
archs are guaranteed to be "atomic" in that they provide rollback if part
of the range fails, once that happened, the transition from RW -> RO could
be made more robust that way, while subsequent RO -> RW transition /must/
continue guaranteeing to always succeed the undo part.

Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock")
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 56 ++++++------------------------------------
 kernel/bpf/core.c      | 28 ---------------------
 2 files changed, 8 insertions(+), 76 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 20f2659dd829..300baad62c88 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -470,9 +470,7 @@ struct sock_fprog_kern {
 };
 
 struct bpf_binary_header {
-	u16 pages;
-	u16 locked:1;
-
+	u32 pages;
 	/* Some arches need word alignment for their instructions */
 	u8 image[] __aligned(4);
 };
@@ -481,7 +479,7 @@ struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				jit_requested:1,/* archs need to JIT the prog */
-				locked:1,	/* Program image locked? */
+				undo_set_mem:1,	/* Passed set_memory_ro() checkpoint */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1,	/* Do we need dst entry? */
@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	fp->locked = 1;
-	if (set_memory_ro((unsigned long)fp, fp->pages))
-		fp->locked = 0;
-#endif
+	fp->undo_set_mem = 1;
+	set_memory_ro((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (fp->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		fp->locked = 0;
-	}
-#endif
+	if (fp->undo_set_mem)
+		set_memory_rw((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	hdr->locked = 1;
-	if (set_memory_ro((unsigned long)hdr, hdr->pages))
-		hdr->locked = 0;
-#endif
+	set_memory_ro((unsigned long)hdr, hdr->pages);
 }
 
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (hdr->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		hdr->locked = 0;
-	}
-#endif
+	set_memory_rw((unsigned long)hdr, hdr->pages);
 }
 
 static inline struct bpf_binary_header *
@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
 	return (void *)addr;
 }
 
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
-{
-	if (!fp->locked)
-		return -ENOLCK;
-	if (fp->jited) {
-		const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
-
-		if (!hdr->locked)
-			return -ENOLCK;
-	}
-
-	return 0;
-}
-#endif
-
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	bpf_fill_ill_insns(hdr, size);
 
 	hdr->pages = size / PAGE_SIZE;
-	hdr->locked = 0;
-
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	int i, err;
-
-	for (i = 0; i < fp->aux->func_cnt; i++) {
-		err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
-		if (err)
-			return err;
-	}
-
-	return bpf_prog_check_pages_ro_single(fp);
-#endif
-	return 0;
-}
-
 static void bpf_prog_select_func(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
 	 * all eBPF JITs might immediately support all features.
 	 */
 	*err = bpf_check_tail_call(fp);
-	if (*err)
-		return fp;
 
-	/* Checkpoint: at this point onwards any cBPF -> eBPF or
-	 * native eBPF program is read-only. If we failed to change
-	 * the page attributes (e.g. allocation failure from
-	 * splitting large pages), then reject the whole program
-	 * in order to guarantee not ending up with any W+X pages
-	 * from BPF side in kernel.
-	 */
-	*err = bpf_prog_check_pages_ro_locked(fp);
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);