2019-05-27 14:55:05 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Rewritten by Rusty Russell, on the backs of many others...
|
|
|
|
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
|
|
|
|
|
|
|
|
*/
|
2009-03-20 18:05:04 +08:00
|
|
|
#include <linux/ftrace.h>
|
2009-03-23 01:11:10 +08:00
|
|
|
#include <linux/memory.h>
|
2016-07-24 02:01:45 +08:00
|
|
|
#include <linux/extable.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/module.h>
|
2009-03-20 18:05:04 +08:00
|
|
|
#include <linux/mutex.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/init.h>
|
2017-01-08 22:58:09 +08:00
|
|
|
#include <linux/kprobes.h>
|
bpf: make jited programs visible in traces
Long standing issue with JITed programs is that stack traces from
function tracing check whether a given address is kernel code
through {__,}kernel_text_address(), which checks for code in core
kernel, modules and dynamically allocated ftrace trampolines. But
what is still missing is BPF JITed programs (interpreted programs
are not an issue as __bpf_prog_run() will be attributed to them),
thus when a stack trace is triggered, the code walking the stack
won't see any of the JITed ones. The same for address correlation
done from user space via reading /proc/kallsyms. This is read by
tools like perf, but the latter is also useful for permanent live
tracing with eBPF itself in combination with stack maps when other
eBPF types are part of the callchain. See offwaketime example on
dumping stack from a map.
This work tries to tackle that issue by making the addresses and
symbols known to the kernel. The lookup from *kernel_text_address()
is implemented through a latched RB tree that can be read under
RCU in fast-path that is also shared for symbol/size/offset lookup
for a specific given address in kallsyms. The slow-path iteration
through all symbols in the seq file done via RCU list, which holds
a tiny fraction of all exported ksyms, usually below 0.1 percent.
Function symbols are exported as bpf_prog_<tag>, in order to aid
debugging and attribution. This facility is currently enabled for
root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening
is active in any mode. The rationale behind this is that still a lot
of systems ship with world read permissions on kallsyms thus addresses
should not get suddenly exposed for them. If that situation gets
much better in future, we always have the option to change the
default on this. Likewise, unprivileged programs are not allowed
to add entries there either, but that is less of a concern as most
such program types relevant in this context are for root-only anyway.
If enabled, call graphs and stack traces will then show a correct
attribution; one example is illustrated below, where the trace is
now visible in tooling such as perf script --kallsyms=/proc/kallsyms
and friends.
Before:
7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux)
f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so)
After:
7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux)
[...]
7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux)
7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux)
f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so)
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-17 05:24:50 +08:00
|
|
|
#include <linux/filter.h>
|
2009-03-20 18:05:04 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/sections.h>
|
2016-12-25 03:46:01 +08:00
|
|
|
#include <linux/uaccess.h>
|
2009-03-20 18:05:04 +08:00
|
|
|
|
|
|
|
/*
 * text_mutex serializes modification of the text section (dynamic code
 * patching). It is a mutex because some holders need to sleep, e.g. to
 * allocate memory, while they hold the lock.
 *
 * Note: on x86 it also protects SMP-alternatives modification.
 *
 * Deliberately NOT exported to modules: patching kernel text is a really
 * delicate matter.
 */
DEFINE_MUTEX(text_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Start and stop markers of the kernel's built-in exception table. */
extern struct exception_table_entry __start___ex_table[], __stop___ex_table[];
|
|
|
|
|
2012-04-20 05:59:56 +08:00
|
|
|
/* Cleared by build time tools if the table is already sorted. */
|
2014-02-08 15:52:04 +08:00
|
|
|
u32 __initdata __visible main_extable_sort_needed = 1;
|
2012-04-20 05:59:56 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Sort the kernel's built-in exception table */
|
|
|
|
void __init sort_main_extable(void)
|
|
|
|
{
|
2013-09-12 05:23:27 +08:00
|
|
|
if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
|
2013-04-15 18:51:49 +08:00
|
|
|
pr_notice("Sorting __ex_table...\n");
|
2012-04-20 05:59:56 +08:00
|
|
|
sort_extable(__start___ex_table, __stop___ex_table);
|
2013-04-15 18:51:49 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2019-08-20 16:13:49 +08:00
|
|
|
/* Given an address, look for it in the kernel exception table */
|
|
|
|
const
|
|
|
|
struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
|
|
|
|
{
|
|
|
|
return search_extable(__start___ex_table,
|
|
|
|
__stop___ex_table - __start___ex_table, addr);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Look up @addr in the exception tables.
 *
 * The tables are consulted in a fixed order - built-in kernel table,
 * then module tables, then BPF tables - and the first hit is returned.
 * The result of the final (BPF) lookup is returned as-is when the
 * earlier lookups find nothing.
 */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
	const struct exception_table_entry *entry;

	entry = search_kernel_exception_table(addr);
	if (entry)
		return entry;

	entry = search_module_extables(addr);
	if (entry)
		return entry;

	return search_bpf_extables(addr);
}
|
|
|
|
|
2018-02-21 01:37:53 +08:00
|
|
|
int init_kernel_text(unsigned long addr)
|
2009-03-19 20:21:44 +08:00
|
|
|
{
|
|
|
|
if (addr >= (unsigned long)_sinittext &&
|
2013-11-28 16:16:33 +08:00
|
|
|
addr < (unsigned long)_einittext)
|
2009-03-19 20:21:44 +08:00
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-07-07 06:35:31 +08:00
|
|
|
/*
 * Tell whether @addr points into core kernel text.
 *
 * Returns 1 when @addr lies in [_stext, _etext), or when system_state is
 * still below SYSTEM_RUNNING and init_kernel_text() accepts the address.
 * Returns 0 otherwise.
 */
int notrace core_kernel_text(unsigned long addr)
{
	const unsigned long text_start = (unsigned long)_stext;
	const unsigned long text_end = (unsigned long)_etext;

	if (text_start <= addr && addr < text_end)
		return 1;

	/* Init text only qualifies before the system reaches SYSTEM_RUNNING. */
	return system_state < SYSTEM_RUNNING && init_kernel_text(addr);
}
|
|
|
|
|
2011-05-20 09:34:58 +08:00
|
|
|
/**
|
|
|
|
* core_kernel_data - tell if addr points to kernel data
|
|
|
|
* @addr: address to test
|
|
|
|
*
|
|
|
|
* Returns true if @addr passed in is from the core kernel data
|
|
|
|
* section.
|
|
|
|
*
|
|
|
|
* Note: On some archs it may return true for core RODATA, and false
|
|
|
|
* for others. But will always be true for core RW data.
|
|
|
|
*/
|
2011-05-06 09:14:55 +08:00
|
|
|
int core_kernel_data(unsigned long addr)
|
|
|
|
{
|
2011-05-20 09:34:58 +08:00
|
|
|
if (addr >= (unsigned long)_sdata &&
|
2011-05-06 09:14:55 +08:00
|
|
|
addr < (unsigned long)_edata)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-02-08 07:04:02 +08:00
|
|
|
/*
 * Like kernel_text_address(), but additionally accepts init text.
 *
 * There might be init symbols in saved stacktraces; accepting them here
 * gives those symbols a chance to be printed in backtraces (such as
 * lockdep traces).
 *
 * The init-text test runs only after kernel_text_address() - and thus
 * the module-symbols check - has rejected the address, so there's no
 * danger of address overlap.
 *
 * Returns 1 if @addr is accepted, 0 otherwise.
 */
int __kernel_text_address(unsigned long addr)
{
	return kernel_text_address(addr) || init_kernel_text(addr);
}
|
|
|
|
|
|
|
|
/*
 * Tell whether @addr is kernel text of any kind checked here: core
 * kernel text, module text, an ftrace trampoline, a kprobe instruction
 * slot (optimized or regular) or BPF text.
 *
 * Returns 1 on a match, 0 otherwise.
 */
int kernel_text_address(unsigned long addr)
{
	bool rcu_was_idle;
	int found;

	/* Core text needs none of the RCU handling below. */
	if (core_kernel_text(addr))
		return 1;

	/*
	 * A stack dump may happen while RCU is not watching: tracing that
	 * triggers a stack trace, or a WARN() while coming back from idle,
	 * or during cpu on or offlining. In that case RCU has to be told
	 * to start watching again, because is_module_text_address(), the
	 * kprobe slot checks and is_bpf_text_address() all require RCU to
	 * be watching.
	 *
	 * Treat this like an NMI as it can happen anywhere.
	 */
	rcu_was_idle = !rcu_is_watching();
	if (rcu_was_idle)
		rcu_nmi_enter();

	/* Short-circuit evaluation keeps the lookups in their fixed order. */
	found = is_module_text_address(addr) ||
		is_ftrace_trampoline(addr) ||
		is_kprobe_optinsn_slot(addr) ||
		is_kprobe_insn_slot(addr) ||
		is_bpf_text_address(addr);

	if (rcu_was_idle)
		rcu_nmi_exit();

	return found;
}
|
2008-08-16 06:29:38 +08:00
|
|
|
|
|
|
|
/*
 * Tell whether a function pointer refers to core kernel or module text.
 *
 * On some architectures (PPC64, IA64) function pointers are actually
 * only tokens to some data that then holds the real function address.
 * The pointer is therefore passed through
 * dereference_function_descriptor() before the text checks are applied.
 *
 * Returns 1 for core kernel text, otherwise the result of
 * is_module_text_address().
 */
int func_ptr_is_kernel_text(void *ptr)
{
	unsigned long addr = (unsigned long)dereference_function_descriptor(ptr);

	return core_kernel_text(addr) ? 1 : is_module_text_address(addr);
}
|