2019-10-18 12:50:53 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Dynamic loading of modules into the kernel.
|
|
|
|
*
|
|
|
|
* Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
|
|
|
|
* Rewritten again by Rusty Russell, 2002
|
|
|
|
*/
|
2019-10-18 12:50:53 +08:00
|
|
|
|
|
|
|
#ifndef _LINUX_MODULE_H
|
|
|
|
#define _LINUX_MODULE_H
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/stat.h>
|
2021-07-08 09:09:20 +08:00
|
|
|
#include <linux/buildid.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/kmod.h>
|
2015-05-02 08:13:42 +08:00
|
|
|
#include <linux/init.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/elf.h>
|
|
|
|
#include <linux/stringify.h>
|
|
|
|
#include <linux/kobject.h>
|
|
|
|
#include <linux/moduleparam.h>
|
2014-02-27 02:12:06 +08:00
|
|
|
#include <linux/jump_label.h>
|
2011-05-24 02:11:39 +08:00
|
|
|
#include <linux/export.h>
|
2015-05-27 09:39:37 +08:00
|
|
|
#include <linux/rbtree_latch.h>
|
2018-01-13 01:55:33 +08:00
|
|
|
#include <linux/error-injection.h>
|
tracepoint: Fix tracepoint array element size mismatch
commit 46e0c9be206f ("kernel: tracepoints: add support for relative
references") changes the layout of the __tracepoint_ptrs section on
architectures supporting relative references. However, it does so
without turning struct tracepoint * const into const int elsewhere in
the tracepoint code, which has the following side-effect:
Setting mod->num_tracepoints is done in by module.c:
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
Basically, since sizeof(*mod->tracepoints_ptrs) is a pointer size
(rather than sizeof(int)), num_tracepoints is erroneously set to half the
size it should be on 64-bit arch. So a module with an odd number of
tracepoints misses the last tracepoint due to effect of integer
division.
So in the module going notifier:
for_each_tracepoint_range(mod->tracepoints_ptrs,
mod->tracepoints_ptrs + mod->num_tracepoints,
tp_module_going_check_quiescent, NULL);
the expression (mod->tracepoints_ptrs + mod->num_tracepoints) actually
evaluates to something within the bounds of the array, but miss the
last tracepoint if the number of tracepoints is odd on 64-bit arch.
Fix this by introducing a new typedef: tracepoint_ptr_t, which
is either "const int" on architectures that have PREL32 relocations,
or "struct tracepoint * const" on architectures that does not have
this feature.
Also provide a new tracepoint_ptr_defer() static inline to
encapsulate deferencing this type rather than duplicate code and
ugly idefs within the for_each_tracepoint_range() implementation.
This issue appears in 4.19-rc kernels, and should ideally be fixed
before the end of the rc cycle.
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Jessica Yu <jeyu@kernel.org>
Link: http://lkml.kernel.org/r/20181013191050.22389-1-mathieu.desnoyers@efficios.com
Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-10-14 03:10:50 +08:00
|
|
|
#include <linux/tracepoint-defs.h>
|
2019-04-06 07:15:00 +08:00
|
|
|
#include <linux/srcu.h>
|
2020-08-18 21:57:42 +08:00
|
|
|
#include <linux/static_call_types.h>
|
add support for Clang CFI
This change adds support for Clang’s forward-edge Control Flow
Integrity (CFI) checking. With CONFIG_CFI_CLANG, the compiler
injects a runtime check before each indirect function call to ensure
the target is a valid function with the correct static type. This
restricts possible call targets and makes it more difficult for
an attacker to exploit bugs that allow the modification of stored
function pointers. For more details, see:
https://clang.llvm.org/docs/ControlFlowIntegrity.html
Clang requires CONFIG_LTO_CLANG to be enabled with CFI to gain
visibility to possible call targets. Kernel modules are supported
with Clang’s cross-DSO CFI mode, which allows checking between
independently compiled components.
With CFI enabled, the compiler injects a __cfi_check() function into
the kernel and each module for validating local call targets. For
cross-module calls that cannot be validated locally, the compiler
calls the global __cfi_slowpath_diag() function, which determines
the target module and calls the correct __cfi_check() function. This
patch includes a slowpath implementation that uses __module_address()
to resolve call targets, and with CONFIG_CFI_CLANG_SHADOW enabled, a
shadow map that speeds up module look-ups by ~3x.
Clang implements indirect call checking using jump tables and
offers two methods of generating them. With canonical jump tables,
the compiler renames each address-taken function to <function>.cfi
and points the original symbol to a jump table entry, which passes
__cfi_check() validation. This isn’t compatible with stand-alone
assembly code, which the compiler doesn’t instrument, and would
result in indirect calls to assembly code to fail. Therefore, we
default to using non-canonical jump tables instead, where the compiler
generates a local jump table entry <function>.cfi_jt for each
address-taken function, and replaces all references to the function
with the address of the jump table entry.
Note that because non-canonical jump table addresses are local
to each component, they break cross-module function address
equality. Specifically, the address of a global function will be
different in each module, as it's replaced with the address of a local
jump table entry. If this address is passed to a different module,
it won’t match the address of the same function taken there. This
may break code that relies on comparing addresses passed from other
components.
CFI checking can be disabled in a function with the __nocfi attribute.
Additionally, CFI can be disabled for an entire compilation unit by
filtering out CC_FLAGS_CFI.
By default, CFI failures result in a kernel panic to stop a potential
exploit. CONFIG_CFI_PERMISSIVE enables a permissive mode, where the
kernel prints out a rate-limited warning instead, and allows execution
to continue. This option is helpful for locating type mismatches, but
should only be enabled during development.
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-2-samitolvanen@google.com
2021-04-09 02:28:26 +08:00
|
|
|
#include <linux/cfi.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-01-05 14:34:50 +08:00
|
|
|
#include <linux/percpu.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/module.h>
|
|
|
|
|
2008-10-22 23:00:22 +08:00
|
|
|
#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-01-16 07:48:49 +08:00
|
|
|
struct modversion_info {
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long crc;
|
|
|
|
char name[MODULE_NAME_LEN];
|
|
|
|
};
|
|
|
|
|
|
|
|
struct module;
|
2016-07-27 10:36:34 +08:00
|
|
|
struct exception_table_entry;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-07-24 20:36:04 +08:00
|
|
|
struct module_kobject {
|
|
|
|
struct kobject kobj;
|
|
|
|
struct module *mod;
|
|
|
|
struct kobject *drivers_dir;
|
|
|
|
struct module_param_attrs *mp;
|
2013-09-03 15:03:57 +08:00
|
|
|
struct completion *kobj_completion;
|
2016-10-28 16:22:25 +08:00
|
|
|
} __randomize_layout;
|
2011-07-24 20:36:04 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct module_attribute {
|
2011-07-24 20:36:04 +08:00
|
|
|
struct attribute attr;
|
|
|
|
ssize_t (*show)(struct module_attribute *, struct module_kobject *,
|
|
|
|
char *);
|
|
|
|
ssize_t (*store)(struct module_attribute *, struct module_kobject *,
|
2005-04-17 06:20:36 +08:00
|
|
|
const char *, size_t count);
|
[PATCH] modules: add version and srcversion to sysfs
This patch adds version and srcversion files to
/sys/module/${modulename} containing the version and srcversion fields
of the module's modinfo section (if present).
/sys/module/e1000
|-- srcversion
`-- version
This patch differs slightly from the version posted in January, as it
now uses the new kstrdup() call in -mm.
Why put this in sysfs?
a) Tools like DKMS, which deal with changing out individual kernel
modules without replacing the whole kernel, can behave smarter if they
can tell the version of a given module. The autoinstaller feature, for
example, which determines if your system has a "good" version of a
driver (i.e. if the one provided by DKMS has a newer verson than that
provided by the kernel package installed), and to automatically compile
and install a newer version if DKMS has it but your kernel doesn't yet
have that version.
b) Because sysadmins manually, or with tools like DKMS, can switch out
modules on the file system, you can't count on 'modinfo foo.ko', which
looks at /lib/modules/${kernelver}/... actually matching what is loaded
into the kernel already. Hence asking sysfs for this.
c) as the unbind-driver-from-device work takes shape, it will be
possible to rebind a driver that's built-in (no .ko to modinfo for the
version) to a newly loaded module. sysfs will have the
currently-built-in version info, for comparison.
d) tech support scripts can then easily grab the version info for what's
running presently - a question I get often.
There has been renewed interest in this patch on linux-scsi by driver
authors.
As the idea originated from GregKH, I leave his Signed-off-by: intact,
though the implementation is nearly completely new. Compiled and run on
x86 and x86_64.
From: Matthew Dobson <colpatch@us.ibm.com>
build fix
From: Thierry Vignaud <tvignaud@mandriva.com>
build fix
From: Matthew Dobson <colpatch@us.ibm.com>
warning fix
Signed-off-by: Greg Kroah-Hartman <greg@kroah.com>
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-24 13:05:15 +08:00
|
|
|
void (*setup)(struct module *, const char *);
|
|
|
|
int (*test)(struct module *);
|
|
|
|
void (*free)(struct module *);
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2010-12-16 06:00:19 +08:00
|
|
|
struct module_version_attribute {
|
|
|
|
struct module_attribute mattr;
|
|
|
|
const char *module_name;
|
|
|
|
const char *version;
|
2020-11-23 18:23:14 +08:00
|
|
|
};
|
2010-12-16 06:00:19 +08:00
|
|
|
|
2011-02-08 08:02:27 +08:00
|
|
|
extern ssize_t __modver_version_show(struct module_attribute *,
|
2011-07-24 20:36:04 +08:00
|
|
|
struct module_kobject *, char *);
|
2011-02-08 08:02:27 +08:00
|
|
|
|
2011-07-24 20:36:04 +08:00
|
|
|
extern struct module_attribute module_uevent;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* These are either module local, or the kernel's dummy ones. */
|
|
|
|
extern int init_module(void);
|
|
|
|
extern void cleanup_module(void);
|
|
|
|
|
2015-05-02 08:13:42 +08:00
|
|
|
#ifndef MODULE
|
|
|
|
/**
|
|
|
|
* module_init() - driver initialization entry point
|
|
|
|
* @x: function to be run at kernel boot time or module insertion
|
|
|
|
*
|
|
|
|
* module_init() will either be called during do_initcalls() (if
|
|
|
|
* builtin) or at module insertion time (if a module). There can only
|
|
|
|
* be one per module.
|
|
|
|
*/
|
|
|
|
#define module_init(x) __initcall(x);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* module_exit() - driver exit entry point
|
|
|
|
* @x: function to be run when driver is removed
|
|
|
|
*
|
|
|
|
* module_exit() will wrap the driver clean-up code
|
|
|
|
* with cleanup_module() when used with rmmod when
|
|
|
|
* the driver is a module. If the driver is statically
|
|
|
|
* compiled into the kernel, module_exit() has no effect.
|
|
|
|
* There can only be one per module.
|
|
|
|
*/
|
|
|
|
#define module_exit(x) __exitcall(x);
|
|
|
|
|
|
|
|
#else /* MODULE */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In most cases loadable modules do not need custom
|
|
|
|
* initcall levels. There are still some valid cases where
|
|
|
|
* a driver may be needed early if built in, and does not
|
|
|
|
* matter when built as a loadable module. Like bus
|
|
|
|
* snooping debug drivers.
|
|
|
|
*/
|
|
|
|
#define early_initcall(fn) module_init(fn)
|
|
|
|
#define core_initcall(fn) module_init(fn)
|
|
|
|
#define core_initcall_sync(fn) module_init(fn)
|
|
|
|
#define postcore_initcall(fn) module_init(fn)
|
|
|
|
#define postcore_initcall_sync(fn) module_init(fn)
|
|
|
|
#define arch_initcall(fn) module_init(fn)
|
|
|
|
#define subsys_initcall(fn) module_init(fn)
|
|
|
|
#define subsys_initcall_sync(fn) module_init(fn)
|
|
|
|
#define fs_initcall(fn) module_init(fn)
|
|
|
|
#define fs_initcall_sync(fn) module_init(fn)
|
|
|
|
#define rootfs_initcall(fn) module_init(fn)
|
|
|
|
#define device_initcall(fn) module_init(fn)
|
|
|
|
#define device_initcall_sync(fn) module_init(fn)
|
|
|
|
#define late_initcall(fn) module_init(fn)
|
|
|
|
#define late_initcall_sync(fn) module_init(fn)
|
|
|
|
|
|
|
|
#define console_initcall(fn) module_init(fn)
|
|
|
|
|
|
|
|
/* Each module must use one module_init(). */
|
|
|
|
#define module_init(initfn) \
|
2017-02-02 01:00:14 +08:00
|
|
|
static inline initcall_t __maybe_unused __inittest(void) \
|
2015-05-02 08:13:42 +08:00
|
|
|
{ return initfn; } \
|
add support for Clang CFI
This change adds support for Clang’s forward-edge Control Flow
Integrity (CFI) checking. With CONFIG_CFI_CLANG, the compiler
injects a runtime check before each indirect function call to ensure
the target is a valid function with the correct static type. This
restricts possible call targets and makes it more difficult for
an attacker to exploit bugs that allow the modification of stored
function pointers. For more details, see:
https://clang.llvm.org/docs/ControlFlowIntegrity.html
Clang requires CONFIG_LTO_CLANG to be enabled with CFI to gain
visibility to possible call targets. Kernel modules are supported
with Clang’s cross-DSO CFI mode, which allows checking between
independently compiled components.
With CFI enabled, the compiler injects a __cfi_check() function into
the kernel and each module for validating local call targets. For
cross-module calls that cannot be validated locally, the compiler
calls the global __cfi_slowpath_diag() function, which determines
the target module and calls the correct __cfi_check() function. This
patch includes a slowpath implementation that uses __module_address()
to resolve call targets, and with CONFIG_CFI_CLANG_SHADOW enabled, a
shadow map that speeds up module look-ups by ~3x.
Clang implements indirect call checking using jump tables and
offers two methods of generating them. With canonical jump tables,
the compiler renames each address-taken function to <function>.cfi
and points the original symbol to a jump table entry, which passes
__cfi_check() validation. This isn’t compatible with stand-alone
assembly code, which the compiler doesn’t instrument, and would
result in indirect calls to assembly code to fail. Therefore, we
default to using non-canonical jump tables instead, where the compiler
generates a local jump table entry <function>.cfi_jt for each
address-taken function, and replaces all references to the function
with the address of the jump table entry.
Note that because non-canonical jump table addresses are local
to each component, they break cross-module function address
equality. Specifically, the address of a global function will be
different in each module, as it's replaced with the address of a local
jump table entry. If this address is passed to a different module,
it won’t match the address of the same function taken there. This
may break code that relies on comparing addresses passed from other
components.
CFI checking can be disabled in a function with the __nocfi attribute.
Additionally, CFI can be disabled for an entire compilation unit by
filtering out CC_FLAGS_CFI.
By default, CFI failures result in a kernel panic to stop a potential
exploit. CONFIG_CFI_PERMISSIVE enables a permissive mode, where the
kernel prints out a rate-limited warning instead, and allows execution
to continue. This option is helpful for locating type mismatches, but
should only be enabled during development.
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-2-samitolvanen@google.com
2021-04-09 02:28:26 +08:00
|
|
|
int init_module(void) __copy(initfn) \
|
|
|
|
__attribute__((alias(#initfn))); \
|
|
|
|
__CFI_ADDRESSABLE(init_module, __initdata);
|
2015-05-02 08:13:42 +08:00
|
|
|
|
|
|
|
/* This is only required if you want to be unloadable. */
|
|
|
|
#define module_exit(exitfn) \
|
2017-02-02 01:00:14 +08:00
|
|
|
static inline exitcall_t __maybe_unused __exittest(void) \
|
2015-05-02 08:13:42 +08:00
|
|
|
{ return exitfn; } \
|
add support for Clang CFI
This change adds support for Clang’s forward-edge Control Flow
Integrity (CFI) checking. With CONFIG_CFI_CLANG, the compiler
injects a runtime check before each indirect function call to ensure
the target is a valid function with the correct static type. This
restricts possible call targets and makes it more difficult for
an attacker to exploit bugs that allow the modification of stored
function pointers. For more details, see:
https://clang.llvm.org/docs/ControlFlowIntegrity.html
Clang requires CONFIG_LTO_CLANG to be enabled with CFI to gain
visibility to possible call targets. Kernel modules are supported
with Clang’s cross-DSO CFI mode, which allows checking between
independently compiled components.
With CFI enabled, the compiler injects a __cfi_check() function into
the kernel and each module for validating local call targets. For
cross-module calls that cannot be validated locally, the compiler
calls the global __cfi_slowpath_diag() function, which determines
the target module and calls the correct __cfi_check() function. This
patch includes a slowpath implementation that uses __module_address()
to resolve call targets, and with CONFIG_CFI_CLANG_SHADOW enabled, a
shadow map that speeds up module look-ups by ~3x.
Clang implements indirect call checking using jump tables and
offers two methods of generating them. With canonical jump tables,
the compiler renames each address-taken function to <function>.cfi
and points the original symbol to a jump table entry, which passes
__cfi_check() validation. This isn’t compatible with stand-alone
assembly code, which the compiler doesn’t instrument, and would
result in indirect calls to assembly code to fail. Therefore, we
default to using non-canonical jump tables instead, where the compiler
generates a local jump table entry <function>.cfi_jt for each
address-taken function, and replaces all references to the function
with the address of the jump table entry.
Note that because non-canonical jump table addresses are local
to each component, they break cross-module function address
equality. Specifically, the address of a global function will be
different in each module, as it's replaced with the address of a local
jump table entry. If this address is passed to a different module,
it won’t match the address of the same function taken there. This
may break code that relies on comparing addresses passed from other
components.
CFI checking can be disabled in a function with the __nocfi attribute.
Additionally, CFI can be disabled for an entire compilation unit by
filtering out CC_FLAGS_CFI.
By default, CFI failures result in a kernel panic to stop a potential
exploit. CONFIG_CFI_PERMISSIVE enables a permissive mode, where the
kernel prints out a rate-limited warning instead, and allows execution
to continue. This option is helpful for locating type mismatches, but
should only be enabled during development.
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-2-samitolvanen@google.com
2021-04-09 02:28:26 +08:00
|
|
|
void cleanup_module(void) __copy(exitfn) \
|
|
|
|
__attribute__((alias(#exitfn))); \
|
|
|
|
__CFI_ADDRESSABLE(cleanup_module, __exitdata);
|
2015-05-02 08:13:42 +08:00
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* This means "can be init if no module support, otherwise module load
|
|
|
|
may call it." */
|
|
|
|
#ifdef CONFIG_MODULES
|
|
|
|
#define __init_or_module
|
|
|
|
#define __initdata_or_module
|
|
|
|
#define __initconst_or_module
|
|
|
|
#define __INIT_OR_MODULE .text
|
|
|
|
#define __INITDATA_OR_MODULE .data
|
|
|
|
#define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits
|
|
|
|
#else
|
|
|
|
#define __init_or_module __init
|
|
|
|
#define __initdata_or_module __initdata
|
|
|
|
#define __initconst_or_module __initconst
|
|
|
|
#define __INIT_OR_MODULE __INIT
|
|
|
|
#define __INITDATA_OR_MODULE __INITDATA
|
|
|
|
#define __INITRODATA_OR_MODULE __INITRODATA
|
|
|
|
#endif /*CONFIG_MODULES*/
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Generic info of form tag = "info" */
|
|
|
|
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)
|
|
|
|
|
|
|
|
/* For userspace: you can also call me... */
|
|
|
|
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)
|
|
|
|
|
2013-08-20 14:03:19 +08:00
|
|
|
/* Soft module dependencies. See man modprobe.d for details.
|
|
|
|
* Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz")
|
|
|
|
*/
|
|
|
|
#define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep)
|
|
|
|
|
kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf
Commit bc081dd6e9f6 ("kbuild: generate modules.builtin") added
infrastructure to generate modules.builtin, the list of all
builtin modules.
Basically, it works like this:
- Kconfig generates include/config/tristate.conf, the list of
tristate CONFIG options with a value in a capital letter.
- scripts/Makefile.modbuiltin makes Kbuild descend into
directories to collect the information of builtin modules.
I am not a big fan of it because Kbuild ends up with traversing
the source tree twice.
I am not sure how perfectly it should work, but this approach cannot
avoid false positives; even if the relevant CONFIG option is tristate,
some Makefiles forces obj-m to obj-y.
Some examples are:
arch/powerpc/platforms/powermac/Makefile:
obj-$(CONFIG_NVRAM:m=y) += nvram.o
net/ipv6/Makefile:
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
net/netlabel/Makefile:
obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o
Nobody has complained about (or noticed) it, so it is probably fine to
have false positives in modules.builtin.
This commit simplifies the implementation. Let's exploit the fact
that every module has MODULE_LICENSE(). (modpost shows a warning if
MODULE_LICENSE is missing. If so, 0-day bot would already have blocked
such a module.)
I added MODULE_FILE to <linux/module.h>. When the code is being compiled
as builtin, it will be filled with the file path of the module, and
collected into modules.builtin.info. Then, scripts/link-vmlinux.sh
extracts the list of builtin modules out of it.
This new approach fixes the false-positives above, but adds another
type of false-positives; non-modular code may have MODULE_LICENSE()
by mistake. This is not a big deal, it is just the code is always
orphan. We can clean it up if we like. You can see cleanup examples by:
$ git log --grep='make.* explicitly non-modular'
To sum up, this commits deletes lots of code, but still produces almost
equivalent results. Please note it does not increase the vmlinux size at
all. As you can see in include/asm-generic/vmlinux.lds.h, the .modinfo
section is discarded in the link stage.
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
2019-12-19 16:33:29 +08:00
|
|
|
/*
|
|
|
|
* MODULE_FILE is used for generating modules.builtin
|
|
|
|
* So, make it no-op when this is being built as a module
|
|
|
|
*/
|
|
|
|
#ifdef MODULE
|
|
|
|
#define MODULE_FILE
|
|
|
|
#else
|
|
|
|
#define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE);
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* The following license idents are currently accepted as indicating free
|
|
|
|
* software modules
|
|
|
|
*
|
2019-02-09 00:02:56 +08:00
|
|
|
* "GPL" [GNU Public License v2]
|
2005-04-17 06:20:36 +08:00
|
|
|
* "GPL v2" [GNU Public License v2]
|
|
|
|
* "GPL and additional rights" [GNU Public License v2 rights and more]
|
|
|
|
* "Dual BSD/GPL" [GNU Public License v2
|
|
|
|
* or BSD license choice]
|
2006-06-23 17:05:13 +08:00
|
|
|
* "Dual MIT/GPL" [GNU Public License v2
|
|
|
|
* or MIT license choice]
|
2005-04-17 06:20:36 +08:00
|
|
|
* "Dual MPL/GPL" [GNU Public License v2
|
|
|
|
* or Mozilla license choice]
|
|
|
|
*
|
|
|
|
* The following other idents are available
|
|
|
|
*
|
|
|
|
* "Proprietary" [Non free products]
|
|
|
|
*
|
2019-02-09 00:02:56 +08:00
|
|
|
* Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are
|
|
|
|
* merely stating that the module is licensed under the GPL v2, but are not
|
|
|
|
* telling whether "GPL v2 only" or "GPL v2 or later". The reason why there
|
|
|
|
* are two variants is a historic and failed attempt to convey more
|
|
|
|
* information in the MODULE_LICENSE string. For module loading the
|
|
|
|
* "only/or later" distinction is completely irrelevant and does neither
|
|
|
|
* replace the proper license identifiers in the corresponding source file
|
|
|
|
* nor amends them in any way. The sole purpose is to make the
|
|
|
|
* 'Proprietary' flagging work and to refuse to bind symbols which are
|
|
|
|
* exported with EXPORT_SYMBOL_GPL when a non free module is loaded.
|
|
|
|
*
|
|
|
|
* In the same way "BSD" is not a clear license information. It merely
|
|
|
|
* states, that the module is licensed under one of the compatible BSD
|
|
|
|
* license variants. The detailed and correct license information is again
|
|
|
|
* to be found in the corresponding source files.
|
|
|
|
*
|
2005-04-17 06:20:36 +08:00
|
|
|
* There are dual licensed components, but when running with Linux it is the
|
|
|
|
* GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL
|
|
|
|
* is a GPL combined work.
|
|
|
|
*
|
|
|
|
* This exists for several reasons
|
2014-01-16 07:48:49 +08:00
|
|
|
* 1. So modinfo can show license info for users wanting to vet their setup
|
2005-04-17 06:20:36 +08:00
|
|
|
* is free
|
|
|
|
* 2. So the community can ignore bug reports including proprietary modules
|
|
|
|
* 3. So vendors can do likewise based on their own policies
|
|
|
|
*/
|
kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf
Commit bc081dd6e9f6 ("kbuild: generate modules.builtin") added
infrastructure to generate modules.builtin, the list of all
builtin modules.
Basically, it works like this:
- Kconfig generates include/config/tristate.conf, the list of
tristate CONFIG options with a value in a capital letter.
- scripts/Makefile.modbuiltin makes Kbuild descend into
directories to collect the information of builtin modules.
I am not a big fan of it because Kbuild ends up with traversing
the source tree twice.
I am not sure how perfectly it should work, but this approach cannot
avoid false positives; even if the relevant CONFIG option is tristate,
some Makefiles forces obj-m to obj-y.
Some examples are:
arch/powerpc/platforms/powermac/Makefile:
obj-$(CONFIG_NVRAM:m=y) += nvram.o
net/ipv6/Makefile:
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
net/netlabel/Makefile:
obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o
Nobody has complained about (or noticed) it, so it is probably fine to
have false positives in modules.builtin.
This commit simplifies the implementation. Let's exploit the fact
that every module has MODULE_LICENSE(). (modpost shows a warning if
MODULE_LICENSE is missing. If so, 0-day bot would already have blocked
such a module.)
I added MODULE_FILE to <linux/module.h>. When the code is being compiled
as builtin, it will be filled with the file path of the module, and
collected into modules.builtin.info. Then, scripts/link-vmlinux.sh
extracts the list of builtin modules out of it.
This new approach fixes the false-positives above, but adds another
type of false-positives; non-modular code may have MODULE_LICENSE()
by mistake. This is not a big deal, it is just the code is always
orphan. We can clean it up if we like. You can see cleanup examples by:
$ git log --grep='make.* explicitly non-modular'
To sum up, this commits deletes lots of code, but still produces almost
equivalent results. Please note it does not increase the vmlinux size at
all. As you can see in include/asm-generic/vmlinux.lds.h, the .modinfo
section is discarded in the link stage.
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
2019-12-19 16:33:29 +08:00
|
|
|
#define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-25 14:32:58 +08:00
|
|
|
/*
|
|
|
|
* Author(s), use "Name <email>" or just "Name", for multiple
|
|
|
|
* authors use multiple MODULE_AUTHOR() statements/lines.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)
|
2014-01-16 07:48:49 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* What your module does. */
|
|
|
|
#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)
|
|
|
|
|
2014-02-03 08:45:13 +08:00
|
|
|
#ifdef MODULE
|
|
|
|
/* Creates an alias so file2alias.c can find device table. */
|
|
|
|
#define MODULE_DEVICE_TABLE(type, name) \
|
2017-07-25 09:27:25 +08:00
|
|
|
extern typeof(name) __mod_##type##__##name##_device_table \
|
2014-02-03 08:45:13 +08:00
|
|
|
__attribute__ ((unused, alias(__stringify(name))))
|
|
|
|
#else /* !MODULE */
|
|
|
|
#define MODULE_DEVICE_TABLE(type, name)
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Version of form [<epoch>:]<version>[-<extra-version>].
|
2014-01-16 07:48:49 +08:00
|
|
|
* Or for CVS/RCS ID version, everything but the number is stripped.
|
|
|
|
* <epoch>: A (small) unsigned integer which allows you to start versions
|
|
|
|
* anew. If not mentioned, it's zero. eg. "2:1.0" is after
|
|
|
|
* "1:2.0".
|
|
|
|
|
|
|
|
* <version>: The <version> may contain only alphanumerics and the
|
|
|
|
* character `.'. Ordered by numeric sort for numeric parts,
|
|
|
|
* ascii sort for ascii parts (as per RPM or DEB algorithm).
|
|
|
|
|
|
|
|
* <extraversion>: Like <version>, but inserted for local
|
|
|
|
* customizations, eg "rh3" or "rusty1".
|
|
|
|
|
|
|
|
* Using this automatically adds a checksum of the .c files and the
|
|
|
|
* local headers in "srcversion".
|
|
|
|
*/
|
2010-12-16 06:00:19 +08:00
|
|
|
|
2011-01-25 04:32:51 +08:00
|
|
|
#if defined(MODULE) || !defined(CONFIG_SYSFS)
|
2005-04-17 06:20:36 +08:00
|
|
|
#define MODULE_VERSION(_version) MODULE_INFO(version, _version)
|
2010-12-16 06:00:19 +08:00
|
|
|
#else
|
|
|
|
#define MODULE_VERSION(_version) \
|
2019-04-30 00:11:14 +08:00
|
|
|
MODULE_INFO(version, _version); \
|
2020-11-23 18:23:15 +08:00
|
|
|
static struct module_version_attribute __modver_attr \
|
|
|
|
__used __section("__modver") \
|
|
|
|
__aligned(__alignof__(struct module_version_attribute)) \
|
|
|
|
= { \
|
|
|
|
.mattr = { \
|
|
|
|
.attr = { \
|
|
|
|
.name = "version", \
|
|
|
|
.mode = S_IRUGO, \
|
|
|
|
}, \
|
|
|
|
.show = __modver_version_show, \
|
2010-12-16 06:00:19 +08:00
|
|
|
}, \
|
2020-11-23 18:23:15 +08:00
|
|
|
.module_name = KBUILD_MODNAME, \
|
|
|
|
.version = _version, \
|
2020-12-07 17:13:08 +08:00
|
|
|
}
|
2010-12-16 06:00:19 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-08-29 06:08:21 +08:00
|
|
|
/* Optional firmware file (or files) needed by the module
|
|
|
|
* format is simply firmware file name. Multiple firmware
|
|
|
|
* files require multiple MODULE_FIRMWARE() specifiers */
|
|
|
|
#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)
|
|
|
|
|
2022-01-08 22:06:57 +08:00
|
|
|
#define _MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns)
|
|
|
|
#define MODULE_IMPORT_NS(ns) _MODULE_IMPORT_NS(ns)
|
2019-09-23 02:15:14 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct notifier_block;
|
|
|
|
|
|
|
|
#ifdef CONFIG_MODULES
|
|
|
|
|
2010-03-11 07:24:06 +08:00
|
|
|
extern int modules_disabled; /* for sysctl */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Get/put a kernel symbol (calls must be symmetric) */
|
|
|
|
void *__symbol_get(const char *symbol);
|
|
|
|
void *__symbol_get_gpl(const char *symbol);
|
2018-06-23 23:37:44 +08:00
|
|
|
#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x))))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-06-06 01:17:35 +08:00
|
|
|
/* modules using other modules: kdb wants to see this. */
|
|
|
|
struct module_use {
|
|
|
|
struct list_head source_list;
|
|
|
|
struct list_head target_list;
|
|
|
|
struct module *source, *target;
|
|
|
|
};
|
|
|
|
|
2013-01-12 09:08:44 +08:00
|
|
|
enum module_state {
|
|
|
|
MODULE_STATE_LIVE, /* Normal state. */
|
|
|
|
MODULE_STATE_COMING, /* Full formed, running module_init. */
|
|
|
|
MODULE_STATE_GOING, /* Going away. */
|
|
|
|
MODULE_STATE_UNFORMED, /* Still setting it up. */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2015-05-27 09:39:37 +08:00
|
|
|
struct mod_tree_node {
|
|
|
|
struct module *mod;
|
|
|
|
struct latch_tree_node node;
|
|
|
|
};
|
|
|
|
|
2015-11-26 07:14:08 +08:00
|
|
|
struct module_layout {
|
|
|
|
/* The actual code + data. */
|
|
|
|
void *base;
|
|
|
|
/* Total size. */
|
|
|
|
unsigned int size;
|
|
|
|
/* The size of the executable code. */
|
|
|
|
unsigned int text_size;
|
|
|
|
/* Size of RO section of the module (text+rodata) */
|
|
|
|
unsigned int ro_size;
|
2016-07-27 10:36:21 +08:00
|
|
|
/* Size of RO after init section */
|
|
|
|
unsigned int ro_after_init_size;
|
2015-11-26 07:14:08 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MODULES_TREE_LOOKUP
|
|
|
|
struct mod_tree_node mtn;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef CONFIG_MODULES_TREE_LOOKUP
|
|
|
|
/* Only touch one cacheline for common rbtree-for-core-layout case. */
|
|
|
|
#define __module_layout_align ____cacheline_aligned
|
|
|
|
#else
|
|
|
|
#define __module_layout_align
|
|
|
|
#endif
|
|
|
|
|
modules: fix longstanding /proc/kallsyms vs module insertion race.
For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section. There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.
After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions. We do this under the module_mutex.
However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path. It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.
By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe). We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).
Reported-by: Weilong Chen <chenweilong@huawei.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2016-02-03 14:25:26 +08:00
|
|
|
struct mod_kallsyms {
|
|
|
|
Elf_Sym *symtab;
|
|
|
|
unsigned int num_symtab;
|
|
|
|
char *strtab;
|
2019-02-26 03:59:58 +08:00
|
|
|
char *typetab;
|
modules: fix longstanding /proc/kallsyms vs module insertion race.
For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section. There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.
After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions. We do this under the module_mutex.
However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path. It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.
By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe). We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).
Reported-by: Weilong Chen <chenweilong@huawei.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2016-02-03 14:25:26 +08:00
|
|
|
};
|
|
|
|
|
2016-03-23 08:03:16 +08:00
|
|
|
#ifdef CONFIG_LIVEPATCH
|
|
|
|
struct klp_modinfo {
|
|
|
|
Elf_Ehdr hdr;
|
|
|
|
Elf_Shdr *sechdrs;
|
|
|
|
char *secstrings;
|
|
|
|
unsigned int symndx;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2014-01-16 07:48:49 +08:00
|
|
|
struct module {
|
2005-04-17 06:20:36 +08:00
|
|
|
enum module_state state;
|
|
|
|
|
|
|
|
/* Member of list of modules */
|
|
|
|
struct list_head list;
|
|
|
|
|
|
|
|
/* Unique handle for this module */
|
|
|
|
char name[MODULE_NAME_LEN];
|
|
|
|
|
2021-07-08 09:09:20 +08:00
|
|
|
#ifdef CONFIG_STACKTRACE_BUILD_ID
|
|
|
|
/* Module build ID */
|
|
|
|
unsigned char build_id[BUILD_ID_SIZE_MAX];
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Sysfs stuff. */
|
|
|
|
struct module_kobject mkobj;
|
2006-02-17 05:50:23 +08:00
|
|
|
struct module_attribute *modinfo_attrs;
|
[PATCH] modules: add version and srcversion to sysfs
This patch adds version and srcversion files to
/sys/module/${modulename} containing the version and srcversion fields
of the module's modinfo section (if present).
/sys/module/e1000
|-- srcversion
`-- version
This patch differs slightly from the version posted in January, as it
now uses the new kstrdup() call in -mm.
Why put this in sysfs?
a) Tools like DKMS, which deal with changing out individual kernel
modules without replacing the whole kernel, can behave smarter if they
can tell the version of a given module. The autoinstaller feature, for
example, which determines if your system has a "good" version of a
driver (i.e. if the one provided by DKMS has a newer verson than that
provided by the kernel package installed), and to automatically compile
and install a newer version if DKMS has it but your kernel doesn't yet
have that version.
b) Because sysadmins manually, or with tools like DKMS, can switch out
modules on the file system, you can't count on 'modinfo foo.ko', which
looks at /lib/modules/${kernelver}/... actually matching what is loaded
into the kernel already. Hence asking sysfs for this.
c) as the unbind-driver-from-device work takes shape, it will be
possible to rebind a driver that's built-in (no .ko to modinfo for the
version) to a newly loaded module. sysfs will have the
currently-built-in version info, for comparison.
d) tech support scripts can then easily grab the version info for what's
running presently - a question I get often.
There has been renewed interest in this patch on linux-scsi by driver
authors.
As the idea originated from GregKH, I leave his Signed-off-by: intact,
though the implementation is nearly completely new. Compiled and run on
x86 and x86_64.
From: Matthew Dobson <colpatch@us.ibm.com>
build fix
From: Thierry Vignaud <tvignaud@mandriva.com>
build fix
From: Matthew Dobson <colpatch@us.ibm.com>
warning fix
Signed-off-by: Greg Kroah-Hartman <greg@kroah.com>
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-24 13:05:15 +08:00
|
|
|
const char *version;
|
|
|
|
const char *srcversion;
|
2007-01-18 20:26:15 +08:00
|
|
|
struct kobject *holders_dir;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Exported symbols */
|
|
|
|
const struct kernel_symbol *syms;
|
modversions: treat symbol CRCs as 32 bit quantities
The modversion symbol CRCs are emitted as ELF symbols, which allows us
to easily populate the kcrctab sections by relying on the linker to
associate each kcrctab slot with the correct value.
This has a couple of downsides:
- Given that the CRCs are treated as memory addresses, we waste 4 bytes
for each CRC on 64 bit architectures,
- On architectures that support runtime relocation, a R_<arch>_RELATIVE
relocation entry is emitted for each CRC value, which identifies it
as a quantity that requires fixing up based on the actual runtime
load offset of the kernel. This results in corrupted CRCs unless we
explicitly undo the fixup (and this is currently being handled in the
core module code)
- Such runtime relocation entries take up 24 bytes of __init space
each, resulting in a x8 overhead in [uncompressed] kernel size for
CRCs.
Switching to explicit 32 bit values on 64 bit architectures fixes most
of these issues, given that 32 bit values are not treated as quantities
that require fixing up based on the actual runtime load offset. Note
that on some ELF64 architectures [such as PPC64], these 32-bit values
are still emitted as [absolute] runtime relocatable quantities, even if
the value resolves to a build time constant. Since relative relocations
are always resolved at build time, this patch enables MODULE_REL_CRCS on
powerpc when CONFIG_RELOCATABLE=y, which turns the absolute CRC
references into relative references into .rodata where the actual CRC
value is stored.
So redefine all CRC fields and variables as u32, and redefine the
__CRC_SYMBOL() macro for 64 bit builds to emit the CRC reference using
inline assembler (which is necessary since 64-bit C code cannot use
32-bit types to hold memory addresses, even if they are ultimately
resolved using values that do not exceed 0xffffffff). To avoid
potential problems with legacy 32-bit architectures using legacy
toolchains, the equivalent C definition of the kcrctab entry is retained
for 32-bit architectures.
Note that this mostly reverts commit d4703aefdbc8 ("module: handle ppc64
relocating kcrctabs when CONFIG_RELOCATABLE=y")
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-03 17:54:06 +08:00
|
|
|
const s32 *crcs;
|
2008-07-23 08:24:26 +08:00
|
|
|
unsigned int num_syms;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
add support for Clang CFI
This change adds support for Clang’s forward-edge Control Flow
Integrity (CFI) checking. With CONFIG_CFI_CLANG, the compiler
injects a runtime check before each indirect function call to ensure
the target is a valid function with the correct static type. This
restricts possible call targets and makes it more difficult for
an attacker to exploit bugs that allow the modification of stored
function pointers. For more details, see:
https://clang.llvm.org/docs/ControlFlowIntegrity.html
Clang requires CONFIG_LTO_CLANG to be enabled with CFI to gain
visibility to possible call targets. Kernel modules are supported
with Clang’s cross-DSO CFI mode, which allows checking between
independently compiled components.
With CFI enabled, the compiler injects a __cfi_check() function into
the kernel and each module for validating local call targets. For
cross-module calls that cannot be validated locally, the compiler
calls the global __cfi_slowpath_diag() function, which determines
the target module and calls the correct __cfi_check() function. This
patch includes a slowpath implementation that uses __module_address()
to resolve call targets, and with CONFIG_CFI_CLANG_SHADOW enabled, a
shadow map that speeds up module look-ups by ~3x.
Clang implements indirect call checking using jump tables and
offers two methods of generating them. With canonical jump tables,
the compiler renames each address-taken function to <function>.cfi
and points the original symbol to a jump table entry, which passes
__cfi_check() validation. This isn’t compatible with stand-alone
assembly code, which the compiler doesn’t instrument, and would
result in indirect calls to assembly code to fail. Therefore, we
default to using non-canonical jump tables instead, where the compiler
generates a local jump table entry <function>.cfi_jt for each
address-taken function, and replaces all references to the function
with the address of the jump table entry.
Note that because non-canonical jump table addresses are local
to each component, they break cross-module function address
equality. Specifically, the address of a global function will be
different in each module, as it's replaced with the address of a local
jump table entry. If this address is passed to a different module,
it won’t match the address of the same function taken there. This
may break code that relies on comparing addresses passed from other
components.
CFI checking can be disabled in a function with the __nocfi attribute.
Additionally, CFI can be disabled for an entire compilation unit by
filtering out CC_FLAGS_CFI.
By default, CFI failures result in a kernel panic to stop a potential
exploit. CONFIG_CFI_PERMISSIVE enables a permissive mode, where the
kernel prints out a rate-limited warning instead, and allows execution
to continue. This option is helpful for locating type mismatches, but
should only be enabled during development.
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-2-samitolvanen@google.com
2021-04-09 02:28:26 +08:00
|
|
|
#ifdef CONFIG_CFI_CLANG
|
|
|
|
cfi_check_fn cfi_check;
|
|
|
|
#endif
|
|
|
|
|
2009-04-01 03:05:29 +08:00
|
|
|
/* Kernel parameters. */
|
2015-06-26 05:14:38 +08:00
|
|
|
#ifdef CONFIG_SYSFS
|
module: add per-module param_lock
Add a "param_lock" mutex to each module, and update params.c to use
the correct built-in or module mutex while locking kernel params.
Remove the kparam_block_sysfs_r/w() macros, replace them with direct
calls to kernel_param_[un]lock(module).
The kernel param code currently uses a single mutex to protect
modification of any and all kernel params. While this generally works,
there is one specific problem with it; a module callback function
cannot safely load another module, i.e. with request_module() or even
with indirect calls such as crypto_has_alg(). If the module to be
loaded has any of its params configured (e.g. with a /etc/modprobe.d/*
config file), then the attempt will result in a deadlock between the
first module param callback waiting for modprobe, and modprobe trying to
lock the single kernel param mutex to set the new module's param.
This fixes that by using per-module mutexes, so that each individual module
is protected against concurrent changes in its own kernel params, but is
not blocked by changes to other module params. All built-in modules
continue to use the built-in mutex, since they will always be loaded at
runtime and references (e.g. request_module(), crypto_has_alg()) to them
will never cause load-time param changing.
This also simplifies the interface used by modules to block sysfs access
to their params; while there are currently functions to block and unblock
sysfs param access which are split up by read and write and expect a single
kernel param to be passed, their actual operation is identical and applies
to all params, not just the one passed to them; they simply lock and unlock
the global param mutex. They are replaced with direct calls to
kernel_param_[un]lock(THIS_MODULE), which locks THIS_MODULE's param_lock, or
if the module is built-in, it locks the built-in mutex.
Suggested-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2015-06-17 04:48:52 +08:00
|
|
|
struct mutex param_lock;
|
2015-06-26 05:14:38 +08:00
|
|
|
#endif
|
2009-04-01 03:05:29 +08:00
|
|
|
struct kernel_param *kp;
|
|
|
|
unsigned int num_kp;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* GPL-only exported symbols. */
|
|
|
|
unsigned int num_gpl_syms;
|
2008-07-23 08:24:26 +08:00
|
|
|
const struct kernel_symbol *gpl_syms;
|
modversions: treat symbol CRCs as 32 bit quantities
The modversion symbol CRCs are emitted as ELF symbols, which allows us
to easily populate the kcrctab sections by relying on the linker to
associate each kcrctab slot with the correct value.
This has a couple of downsides:
- Given that the CRCs are treated as memory addresses, we waste 4 bytes
for each CRC on 64 bit architectures,
- On architectures that support runtime relocation, a R_<arch>_RELATIVE
relocation entry is emitted for each CRC value, which identifies it
as a quantity that requires fixing up based on the actual runtime
load offset of the kernel. This results in corrupted CRCs unless we
explicitly undo the fixup (and this is currently being handled in the
core module code)
- Such runtime relocation entries take up 24 bytes of __init space
each, resulting in a x8 overhead in [uncompressed] kernel size for
CRCs.
Switching to explicit 32 bit values on 64 bit architectures fixes most
of these issues, given that 32 bit values are not treated as quantities
that require fixing up based on the actual runtime load offset. Note
that on some ELF64 architectures [such as PPC64], these 32-bit values
are still emitted as [absolute] runtime relocatable quantities, even if
the value resolves to a build time constant. Since relative relocations
are always resolved at build time, this patch enables MODULE_REL_CRCS on
powerpc when CONFIG_RELOCATABLE=y, which turns the absolute CRC
references into relative references into .rodata where the actual CRC
value is stored.
So redefine all CRC fields and variables as u32, and redefine the
__CRC_SYMBOL() macro for 64 bit builds to emit the CRC reference using
inline assembler (which is necessary since 64-bit C code cannot use
32-bit types to hold memory addresses, even if they are ultimately
resolved using values that do not exceed 0xffffffff). To avoid
potential problems with legacy 32-bit architectures using legacy
toolchains, the equivalent C definition of the kcrctab entry is retained
for 32-bit architectures.
Note that this mostly reverts commit d4703aefdbc8 ("module: handle ppc64
relocating kcrctabs when CONFIG_RELOCATABLE=y")
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-03 17:54:06 +08:00
|
|
|
const s32 *gpl_crcs;
|
2020-07-29 05:33:33 +08:00
|
|
|
bool using_gplonly_symbols;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-09-26 17:09:40 +08:00
|
|
|
#ifdef CONFIG_MODULE_SIG
|
|
|
|
/* Signature was verified. */
|
|
|
|
bool sig_ok;
|
|
|
|
#endif
|
|
|
|
|
2015-03-31 07:20:05 +08:00
|
|
|
bool async_probe_requested;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Exception table */
|
|
|
|
unsigned int num_exentries;
|
2008-10-22 23:00:13 +08:00
|
|
|
struct exception_table_entry *extable;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Startup function. */
|
|
|
|
int (*init)(void);
|
|
|
|
|
2015-11-26 07:14:08 +08:00
|
|
|
/* Core layout: rbtree is accessed frequently, so keep together. */
|
|
|
|
struct module_layout core_layout __module_layout_align;
|
|
|
|
struct module_layout init_layout;
|
2010-11-17 05:35:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Arch-specific module values */
|
|
|
|
struct mod_arch_specific arch;
|
|
|
|
|
2016-09-21 19:47:22 +08:00
|
|
|
unsigned long taints; /* same bits as kernel:taint_flags */
|
2006-10-02 17:17:02 +08:00
|
|
|
|
[PATCH] Generic BUG implementation
This patch adds common handling for kernel BUGs, for use by architectures as
they wish. The code is derived from arch/powerpc.
The advantages of having common BUG handling are:
- consistent BUG reporting across architectures
- shared implementation of out-of-line file/line data
- implement CONFIG_DEBUG_BUGVERBOSE consistently
This means that in inline impact of BUG is just the illegal instruction
itself, which is an improvement for i386 and x86-64.
A BUG is represented in the instruction stream as an illegal instruction,
which has file/line information associated with it. This extra information is
stored in the __bug_table section in the ELF file.
When the kernel gets an illegal instruction, it first confirms it might
possibly be from a BUG (ie, in kernel mode, the right illegal instruction).
It then calls report_bug(). This searches __bug_table for a matching
instruction pointer, and if found, prints the corresponding file/line
information. If report_bug() determines that it wasn't a BUG which caused the
trap, it returns BUG_TRAP_TYPE_NONE.
Some architectures (powerpc) implement WARN using the same mechanism; if the
illegal instruction was the result of a WARN, then report_bug(Q) returns
CONFIG_DEBUG_BUGVERBOSE; otherwise it returns BUG_TRAP_TYPE_BUG.
lib/bug.c keeps a list of loaded modules which can be searched for __bug_table
entries. The architecture must call
module_bug_finalize()/module_bug_cleanup() from its corresponding
module_finalize/cleanup functions.
Unsetting CONFIG_DEBUG_BUGVERBOSE will reduce the kernel size by some amount.
At the very least, filename and line information will not be recorded for each
but, but architectures may decide to store no extra information per BUG at
all.
Unfortunately, gcc doesn't have a general way to mark an asm() as noreturn, so
architectures will generally have to include an infinite loop (or similar) in
the BUG code, so that gcc knows execution won't continue beyond that point.
gcc does have a __builtin_trap() operator which may be useful to achieve the
same effect, unfortunately it cannot be used to actually implement the BUG
itself, because there's no way to get the instruction's address for use in
generating the __bug_table entry.
[randy.dunlap@oracle.com: Handle BUG=n, GENERIC_BUG=n to prevent build errors]
[bunk@stusta.de: include/linux/bug.h must always #include <linux/module.h]
Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: Michael Ellerman <michael@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:36:19 +08:00
|
|
|
#ifdef CONFIG_GENERIC_BUG
|
|
|
|
/* Support for BUG */
|
2008-07-23 08:24:26 +08:00
|
|
|
unsigned num_bugs;
|
[PATCH] Generic BUG implementation
This patch adds common handling for kernel BUGs, for use by architectures as
they wish. The code is derived from arch/powerpc.
The advantages of having common BUG handling are:
- consistent BUG reporting across architectures
- shared implementation of out-of-line file/line data
- implement CONFIG_DEBUG_BUGVERBOSE consistently
This means that in inline impact of BUG is just the illegal instruction
itself, which is an improvement for i386 and x86-64.
A BUG is represented in the instruction stream as an illegal instruction,
which has file/line information associated with it. This extra information is
stored in the __bug_table section in the ELF file.
When the kernel gets an illegal instruction, it first confirms it might
possibly be from a BUG (ie, in kernel mode, the right illegal instruction).
It then calls report_bug(). This searches __bug_table for a matching
instruction pointer, and if found, prints the corresponding file/line
information. If report_bug() determines that it wasn't a BUG which caused the
trap, it returns BUG_TRAP_TYPE_NONE.
Some architectures (powerpc) implement WARN using the same mechanism; if the
illegal instruction was the result of a WARN, then report_bug(Q) returns
CONFIG_DEBUG_BUGVERBOSE; otherwise it returns BUG_TRAP_TYPE_BUG.
lib/bug.c keeps a list of loaded modules which can be searched for __bug_table
entries. The architecture must call
module_bug_finalize()/module_bug_cleanup() from its corresponding
module_finalize/cleanup functions.
Unsetting CONFIG_DEBUG_BUGVERBOSE will reduce the kernel size by some amount.
At the very least, filename and line information will not be recorded for each
but, but architectures may decide to store no extra information per BUG at
all.
Unfortunately, gcc doesn't have a general way to mark an asm() as noreturn, so
architectures will generally have to include an infinite loop (or similar) in
the BUG code, so that gcc knows execution won't continue beyond that point.
gcc does have a __builtin_trap() operator which may be useful to achieve the
same effect, unfortunately it cannot be used to actually implement the BUG
itself, because there's no way to get the instruction's address for use in
generating the __bug_table entry.
[randy.dunlap@oracle.com: Handle BUG=n, GENERIC_BUG=n to prevent build errors]
[bunk@stusta.de: include/linux/bug.h must always #include <linux/module.h]
Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: Michael Ellerman <michael@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:36:19 +08:00
|
|
|
struct list_head bug_list;
|
|
|
|
struct bug_entry *bug_table;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_KALLSYMS
|
modules: fix longstanding /proc/kallsyms vs module insertion race.
For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section. There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.
After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions. We do this under the module_mutex.
However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path. It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.
By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe). We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).
Reported-by: Weilong Chen <chenweilong@huawei.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2016-02-03 14:25:26 +08:00
|
|
|
/* Protected by RCU and/or module_mutex: use rcu_dereference() */
|
2020-01-23 01:04:47 +08:00
|
|
|
struct mod_kallsyms __rcu *kallsyms;
|
modules: fix longstanding /proc/kallsyms vs module insertion race.
For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section. There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.
After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions. We do this under the module_mutex.
However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path. It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.
By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe). We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).
Reported-by: Weilong Chen <chenweilong@huawei.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2016-02-03 14:25:26 +08:00
|
|
|
struct mod_kallsyms core_kallsyms;
|
2016-10-20 07:12:18 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Section attributes */
|
|
|
|
struct module_sect_attrs *sect_attrs;
|
2007-10-17 14:26:40 +08:00
|
|
|
|
|
|
|
/* Notes attributes */
|
|
|
|
struct module_notes_attrs *notes_attrs;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
2011-05-20 06:55:25 +08:00
|
|
|
/* The command line arguments (may be mangled). People like
|
|
|
|
keeping pointers to this stuff */
|
|
|
|
char *args;
|
|
|
|
|
2010-03-10 17:56:10 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Per-cpu data. */
|
2010-03-10 17:56:10 +08:00
|
|
|
void __percpu *percpu;
|
|
|
|
unsigned int percpu_size;
|
|
|
|
#endif
|
2020-03-10 21:04:34 +08:00
|
|
|
void *noinstr_text_start;
|
|
|
|
unsigned int noinstr_text_size;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
tracing: Kernel Tracepoints
Implementation of kernel tracepoints. Inspired from the Linux Kernel
Markers. Allows complete typing verification by declaring both tracing
statement inline functions and probe registration/unregistration static
inline functions within the same macro "DEFINE_TRACE". No format string
is required. See the tracepoint Documentation and Samples patches for
usage examples.
Taken from the documentation patch :
"A tracepoint placed in code provides a hook to call a function (probe)
that you can provide at runtime. A tracepoint can be "on" (a probe is
connected to it) or "off" (no probe is attached). When a tracepoint is
"off" it has no effect, except for adding a tiny time penalty (checking
a condition for a branch) and space penalty (adding a few bytes for the
function call at the end of the instrumented function and adds a data
structure in a separate section). When a tracepoint is "on", the
function you provide is called each time the tracepoint is executed, in
the execution context of the caller. When the function provided ends its
execution, it returns to the caller (continuing from the tracepoint
site).
You can put tracepoints at important locations in the code. They are
lightweight hooks that can pass an arbitrary number of parameters, which
prototypes are described in a tracepoint declaration placed in a header
file."
Addition and removal of tracepoints is synchronized by RCU using the
scheduler (and preempt_disable) as guarantees to find a quiescent state
(this is really RCU "classic"). The update side uses rcu_barrier_sched()
with call_rcu_sched() and the read/execute side uses
"preempt_disable()/preempt_enable()".
We make sure the previous array containing probes, which has been
scheduled for deletion by the rcu callback, is indeed freed before we
proceed to the next update. It therefore limits the rate of modification
of a single tracepoint to one update per RCU period. The objective here
is to permit fast batch add/removal of probes on _different_
tracepoints.
Changelog :
- Use #name ":" #proto as string to identify the tracepoint in the
tracepoint table. This will make sure not type mismatch happens due to
connexion of a probe with the wrong type to a tracepoint declared with
the same name in a different header.
- Add tracepoint_entry_free_old.
- Change __TO_TRACE to get rid of the 'i' iterator.
Masami Hiramatsu <mhiramat@redhat.com> :
Tested on x86-64.
Performance impact of a tracepoint : same as markers, except that it
adds about 70 bytes of instructions in an unlikely branch of each
instrumented function (the for loop, the stack setup and the function
call). It currently adds a memory read, a test and a conditional branch
at the instrumentation site (in the hot path). Immediate values will
eventually change this into a load immediate, test and branch, which
removes the memory read which will make the i-cache impact smaller
(changing the memory read for a load immediate removes 3-4 bytes per
site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it
also saves the d-cache hit).
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added.
Quoting Hideo Aoki about Markers :
I evaluated overhead of kernel marker using linux-2.6-sched-fixes git
tree, which includes several markers for LTTng, using an ia64 server.
While the immediate trace mark feature isn't implemented on ia64, there
is no major performance regression. So, I think that we don't have any
issues to propose merging marker point patches into Linus's tree from
the viewpoint of performance impact.
I prepared two kernels to evaluate. The first one was compiled without
CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS.
I downloaded the original hackbench from the following URL:
http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c
I ran hackbench 5 times in each condition and calculated the average and
difference between the kernels.
The parameter of hackbench: every 50 from 50 to 800
The number of CPUs of the server: 2, 4, and 8
Below is the results. As you can see, major performance regression
wasn't found in any case. Even if number of processes increases,
differences between marker-enabled kernel and marker- disabled kernel
doesn't increase. Moreover, if number of CPUs increases, the differences
doesn't increase either.
Curiously, marker-enabled kernel is better than marker-disabled kernel
in more than half cases, although I guess it comes from the difference
of memory access pattern.
* 2 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 4.811 | 4.872 | +0.061 | +1.27 |
100 | 9.854 | 10.309 | +0.454 | +4.61 |
150 | 15.602 | 15.040 | -0.562 | -3.6 |
200 | 20.489 | 20.380 | -0.109 | -0.53 |
250 | 25.798 | 25.652 | -0.146 | -0.56 |
300 | 31.260 | 30.797 | -0.463 | -1.48 |
350 | 36.121 | 35.770 | -0.351 | -0.97 |
400 | 42.288 | 42.102 | -0.186 | -0.44 |
450 | 47.778 | 47.253 | -0.526 | -1.1 |
500 | 51.953 | 52.278 | +0.325 | +0.63 |
550 | 58.401 | 57.700 | -0.701 | -1.2 |
600 | 63.334 | 63.222 | -0.112 | -0.18 |
650 | 68.816 | 68.511 | -0.306 | -0.44 |
700 | 74.667 | 74.088 | -0.579 | -0.78 |
750 | 78.612 | 79.582 | +0.970 | +1.23 |
800 | 85.431 | 85.263 | -0.168 | -0.2 |
--------------------------------------------------------------
* 4 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 2.586 | 2.584 | -0.003 | -0.1 |
100 | 5.254 | 5.283 | +0.030 | +0.56 |
150 | 8.012 | 8.074 | +0.061 | +0.76 |
200 | 11.172 | 11.000 | -0.172 | -1.54 |
250 | 13.917 | 14.036 | +0.119 | +0.86 |
300 | 16.905 | 16.543 | -0.362 | -2.14 |
350 | 19.901 | 20.036 | +0.135 | +0.68 |
400 | 22.908 | 23.094 | +0.186 | +0.81 |
450 | 26.273 | 26.101 | -0.172 | -0.66 |
500 | 29.554 | 29.092 | -0.461 | -1.56 |
550 | 32.377 | 32.274 | -0.103 | -0.32 |
600 | 35.855 | 35.322 | -0.533 | -1.49 |
650 | 39.192 | 38.388 | -0.804 | -2.05 |
700 | 41.744 | 41.719 | -0.025 | -0.06 |
750 | 45.016 | 44.496 | -0.520 | -1.16 |
800 | 48.212 | 47.603 | -0.609 | -1.26 |
--------------------------------------------------------------
* 8 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 2.094 | 2.072 | -0.022 | -1.07 |
100 | 4.162 | 4.273 | +0.111 | +2.66 |
150 | 6.485 | 6.540 | +0.055 | +0.84 |
200 | 8.556 | 8.478 | -0.078 | -0.91 |
250 | 10.458 | 10.258 | -0.200 | -1.91 |
300 | 12.425 | 12.750 | +0.325 | +2.62 |
350 | 14.807 | 14.839 | +0.032 | +0.22 |
400 | 16.801 | 16.959 | +0.158 | +0.94 |
450 | 19.478 | 19.009 | -0.470 | -2.41 |
500 | 21.296 | 21.504 | +0.208 | +0.98 |
550 | 23.842 | 23.979 | +0.137 | +0.57 |
600 | 26.309 | 26.111 | -0.198 | -0.75 |
650 | 28.705 | 28.446 | -0.259 | -0.9 |
700 | 31.233 | 31.394 | +0.161 | +0.52 |
750 | 34.064 | 33.720 | -0.344 | -1.01 |
800 | 36.320 | 36.114 | -0.206 | -0.57 |
--------------------------------------------------------------
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-19 00:16:16 +08:00
|
|
|
#ifdef CONFIG_TRACEPOINTS
|
|
|
|
unsigned int num_tracepoints;
|
tracepoint: Fix tracepoint array element size mismatch
commit 46e0c9be206f ("kernel: tracepoints: add support for relative
references") changes the layout of the __tracepoint_ptrs section on
architectures supporting relative references. However, it does so
without turning struct tracepoint * const into const int elsewhere in
the tracepoint code, which has the following side-effect:
Setting mod->num_tracepoints is done in by module.c:
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
Basically, since sizeof(*mod->tracepoints_ptrs) is a pointer size
(rather than sizeof(int)), num_tracepoints is erroneously set to half the
size it should be on 64-bit arch. So a module with an odd number of
tracepoints misses the last tracepoint due to effect of integer
division.
So in the module going notifier:
for_each_tracepoint_range(mod->tracepoints_ptrs,
mod->tracepoints_ptrs + mod->num_tracepoints,
tp_module_going_check_quiescent, NULL);
the expression (mod->tracepoints_ptrs + mod->num_tracepoints) actually
evaluates to something within the bounds of the array, but miss the
last tracepoint if the number of tracepoints is odd on 64-bit arch.
Fix this by introducing a new typedef: tracepoint_ptr_t, which
is either "const int" on architectures that have PREL32 relocations,
or "struct tracepoint * const" on architectures that does not have
this feature.
Also provide a new tracepoint_ptr_defer() static inline to
encapsulate deferencing this type rather than duplicate code and
ugly idefs within the for_each_tracepoint_range() implementation.
This issue appears in 4.19-rc kernels, and should ideally be fixed
before the end of the rc cycle.
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Jessica Yu <jeyu@kernel.org>
Link: http://lkml.kernel.org/r/20181013191050.22389-1-mathieu.desnoyers@efficios.com
Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-10-14 03:10:50 +08:00
|
|
|
tracepoint_ptr_t *tracepoints_ptrs;
|
tracing: Kernel Tracepoints
Implementation of kernel tracepoints. Inspired from the Linux Kernel
Markers. Allows complete typing verification by declaring both tracing
statement inline functions and probe registration/unregistration static
inline functions within the same macro "DEFINE_TRACE". No format string
is required. See the tracepoint Documentation and Samples patches for
usage examples.
Taken from the documentation patch :
"A tracepoint placed in code provides a hook to call a function (probe)
that you can provide at runtime. A tracepoint can be "on" (a probe is
connected to it) or "off" (no probe is attached). When a tracepoint is
"off" it has no effect, except for adding a tiny time penalty (checking
a condition for a branch) and space penalty (adding a few bytes for the
function call at the end of the instrumented function and adds a data
structure in a separate section). When a tracepoint is "on", the
function you provide is called each time the tracepoint is executed, in
the execution context of the caller. When the function provided ends its
execution, it returns to the caller (continuing from the tracepoint
site).
You can put tracepoints at important locations in the code. They are
lightweight hooks that can pass an arbitrary number of parameters, which
prototypes are described in a tracepoint declaration placed in a header
file."
Addition and removal of tracepoints is synchronized by RCU using the
scheduler (and preempt_disable) as guarantees to find a quiescent state
(this is really RCU "classic"). The update side uses rcu_barrier_sched()
with call_rcu_sched() and the read/execute side uses
"preempt_disable()/preempt_enable()".
We make sure the previous array containing probes, which has been
scheduled for deletion by the rcu callback, is indeed freed before we
proceed to the next update. It therefore limits the rate of modification
of a single tracepoint to one update per RCU period. The objective here
is to permit fast batch add/removal of probes on _different_
tracepoints.
Changelog :
- Use #name ":" #proto as string to identify the tracepoint in the
tracepoint table. This will make sure not type mismatch happens due to
connexion of a probe with the wrong type to a tracepoint declared with
the same name in a different header.
- Add tracepoint_entry_free_old.
- Change __TO_TRACE to get rid of the 'i' iterator.
Masami Hiramatsu <mhiramat@redhat.com> :
Tested on x86-64.
Performance impact of a tracepoint : same as markers, except that it
adds about 70 bytes of instructions in an unlikely branch of each
instrumented function (the for loop, the stack setup and the function
call). It currently adds a memory read, a test and a conditional branch
at the instrumentation site (in the hot path). Immediate values will
eventually change this into a load immediate, test and branch, which
removes the memory read which will make the i-cache impact smaller
(changing the memory read for a load immediate removes 3-4 bytes per
site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it
also saves the d-cache hit).
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added.
Quoting Hideo Aoki about Markers :
I evaluated overhead of kernel marker using linux-2.6-sched-fixes git
tree, which includes several markers for LTTng, using an ia64 server.
While the immediate trace mark feature isn't implemented on ia64, there
is no major performance regression. So, I think that we don't have any
issues to propose merging marker point patches into Linus's tree from
the viewpoint of performance impact.
I prepared two kernels to evaluate. The first one was compiled without
CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS.
I downloaded the original hackbench from the following URL:
http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c
I ran hackbench 5 times in each condition and calculated the average and
difference between the kernels.
The parameter of hackbench: every 50 from 50 to 800
The number of CPUs of the server: 2, 4, and 8
Below is the results. As you can see, major performance regression
wasn't found in any case. Even if number of processes increases,
differences between marker-enabled kernel and marker- disabled kernel
doesn't increase. Moreover, if number of CPUs increases, the differences
doesn't increase either.
Curiously, marker-enabled kernel is better than marker-disabled kernel
in more than half cases, although I guess it comes from the difference
of memory access pattern.
* 2 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 4.811 | 4.872 | +0.061 | +1.27 |
100 | 9.854 | 10.309 | +0.454 | +4.61 |
150 | 15.602 | 15.040 | -0.562 | -3.6 |
200 | 20.489 | 20.380 | -0.109 | -0.53 |
250 | 25.798 | 25.652 | -0.146 | -0.56 |
300 | 31.260 | 30.797 | -0.463 | -1.48 |
350 | 36.121 | 35.770 | -0.351 | -0.97 |
400 | 42.288 | 42.102 | -0.186 | -0.44 |
450 | 47.778 | 47.253 | -0.526 | -1.1 |
500 | 51.953 | 52.278 | +0.325 | +0.63 |
550 | 58.401 | 57.700 | -0.701 | -1.2 |
600 | 63.334 | 63.222 | -0.112 | -0.18 |
650 | 68.816 | 68.511 | -0.306 | -0.44 |
700 | 74.667 | 74.088 | -0.579 | -0.78 |
750 | 78.612 | 79.582 | +0.970 | +1.23 |
800 | 85.431 | 85.263 | -0.168 | -0.2 |
--------------------------------------------------------------
* 4 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 2.586 | 2.584 | -0.003 | -0.1 |
100 | 5.254 | 5.283 | +0.030 | +0.56 |
150 | 8.012 | 8.074 | +0.061 | +0.76 |
200 | 11.172 | 11.000 | -0.172 | -1.54 |
250 | 13.917 | 14.036 | +0.119 | +0.86 |
300 | 16.905 | 16.543 | -0.362 | -2.14 |
350 | 19.901 | 20.036 | +0.135 | +0.68 |
400 | 22.908 | 23.094 | +0.186 | +0.81 |
450 | 26.273 | 26.101 | -0.172 | -0.66 |
500 | 29.554 | 29.092 | -0.461 | -1.56 |
550 | 32.377 | 32.274 | -0.103 | -0.32 |
600 | 35.855 | 35.322 | -0.533 | -1.49 |
650 | 39.192 | 38.388 | -0.804 | -2.05 |
700 | 41.744 | 41.719 | -0.025 | -0.06 |
750 | 45.016 | 44.496 | -0.520 | -1.16 |
800 | 48.212 | 47.603 | -0.609 | -1.26 |
--------------------------------------------------------------
* 8 CPUs
Number of | without | with | diff | diff |
processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] |
--------------------------------------------------------------
50 | 2.094 | 2.072 | -0.022 | -1.07 |
100 | 4.162 | 4.273 | +0.111 | +2.66 |
150 | 6.485 | 6.540 | +0.055 | +0.84 |
200 | 8.556 | 8.478 | -0.078 | -0.91 |
250 | 10.458 | 10.258 | -0.200 | -1.91 |
300 | 12.425 | 12.750 | +0.325 | +2.62 |
350 | 14.807 | 14.839 | +0.032 | +0.22 |
400 | 16.801 | 16.959 | +0.158 | +0.94 |
450 | 19.478 | 19.009 | -0.470 | -2.41 |
500 | 21.296 | 21.504 | +0.208 | +0.98 |
550 | 23.842 | 23.979 | +0.137 | +0.57 |
600 | 26.309 | 26.111 | -0.198 | -0.75 |
650 | 28.705 | 28.446 | -0.259 | -0.9 |
700 | 31.233 | 31.394 | +0.161 | +0.52 |
750 | 34.064 | 33.720 | -0.344 | -1.01 |
800 | 36.320 | 36.114 | -0.206 | -0.57 |
--------------------------------------------------------------
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-19 00:16:16 +08:00
|
|
|
#endif
|
2019-04-06 07:15:00 +08:00
|
|
|
#ifdef CONFIG_TREE_SRCU
|
|
|
|
unsigned int num_srcu_structs;
|
|
|
|
struct srcu_struct **srcu_struct_ptrs;
|
|
|
|
#endif
|
2018-12-13 08:42:37 +08:00
|
|
|
#ifdef CONFIG_BPF_EVENTS
|
|
|
|
unsigned int num_bpf_raw_events;
|
|
|
|
struct bpf_raw_event_map *bpf_raw_events;
|
|
|
|
#endif
|
2020-11-10 09:19:31 +08:00
|
|
|
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
|
|
|
|
unsigned int btf_data_size;
|
|
|
|
void *btf_data;
|
|
|
|
#endif
|
2018-12-30 23:14:15 +08:00
|
|
|
#ifdef CONFIG_JUMP_LABEL
|
2010-09-17 23:09:00 +08:00
|
|
|
struct jump_entry *jump_entries;
|
|
|
|
unsigned int num_jump_entries;
|
|
|
|
#endif
|
2009-03-07 00:21:49 +08:00
|
|
|
#ifdef CONFIG_TRACING
|
2009-03-07 00:21:48 +08:00
|
|
|
unsigned int num_trace_bprintk_fmt;
|
2011-05-20 06:55:25 +08:00
|
|
|
const char **trace_bprintk_fmt_start;
|
2009-03-07 00:21:48 +08:00
|
|
|
#endif
|
2009-04-11 02:53:50 +08:00
|
|
|
#ifdef CONFIG_EVENT_TRACING
|
2015-05-05 23:45:27 +08:00
|
|
|
struct trace_event_call **trace_events;
|
2009-04-11 02:53:50 +08:00
|
|
|
unsigned int num_trace_events;
|
2017-06-01 05:56:44 +08:00
|
|
|
struct trace_eval_map **trace_evals;
|
|
|
|
unsigned int num_trace_evals;
|
2009-04-11 02:53:50 +08:00
|
|
|
#endif
|
2009-04-16 01:24:06 +08:00
|
|
|
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
|
|
|
|
unsigned int num_ftrace_callsites;
|
2011-05-20 06:55:25 +08:00
|
|
|
unsigned long *ftrace_callsites;
|
2009-04-16 01:24:06 +08:00
|
|
|
#endif
|
2020-03-26 22:49:48 +08:00
|
|
|
#ifdef CONFIG_KPROBES
|
|
|
|
void *kprobes_text_start;
|
|
|
|
unsigned int kprobes_text_size;
|
2020-03-26 22:50:00 +08:00
|
|
|
unsigned long *kprobe_blacklist;
|
|
|
|
unsigned int num_kprobe_blacklist;
|
2020-03-26 22:49:48 +08:00
|
|
|
#endif
|
2020-08-18 21:57:42 +08:00
|
|
|
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
|
|
|
|
int num_static_call_sites;
|
|
|
|
struct static_call_site *static_call_sites;
|
|
|
|
#endif
|
2009-03-07 00:21:48 +08:00
|
|
|
|
livepatch: Fix subtle race with coming and going modules
There is a notifier that handles live patches for coming and going modules.
It takes klp_mutex lock to avoid races with coming and going patches but
it does not keep the lock all the time. Therefore the following races are
possible:
1. The notifier is called sometime in STATE_MODULE_COMING. The module
is visible by find_module() in this state all the time. It means that
new patch can be registered and enabled even before the notifier is
called. It might create wrong order of stacked patches, see below
for an example.
2. New patch could still see the module in the GOING state even after
the notifier has been called. It will try to initialize the related
object structures but the module could disappear at any time. There
will stay mess in the structures. It might even cause an invalid
memory access.
This patch solves the problem by adding a boolean variable into struct module.
The value is true after the coming and before the going handler is called.
New patches need to be applied when the value is true and they need to ignore
the module when the value is false.
Note that we need to know state of all modules on the system. The races are
related to new patches. Therefore we do not know what modules will get
patched.
Also note that we could not simply ignore going modules. The code from the
module could be called even in the GOING state until mod->exit() finishes.
If we start supporting patches with semantic changes between function
calls, we need to apply new patches to any still usable code.
See below for an example.
Finally note that the patch solves only the situation when a new patch is
registered. There are no such problems when the patch is being removed.
It does not matter who disable the patch first, whether the normal
disable_patch() or the module notifier. There is nothing to do
once the patch is disabled.
Alternative solutions:
======================
+ reject new patches when a patched module is coming or going; this is ugly
+ wait with adding new patch until the module leaves the COMING and GOING
states; this might be dangerous and complicated; we would need to release
kgr_lock in the middle of the patch registration to avoid a deadlock
with the coming and going handlers; also we might need a waitqueue for
each module which seems to be even bigger overhead than the boolean
+ stop modules from entering COMING and GOING states; wait until modules
leave these states when they are already there; looks complicated; we would
need to ignore the module that asked to stop the others to avoid a deadlock;
also it is unclear what to do when two modules asked to stop others and
both are in COMING state (situation when two new patches are applied)
+ always register/enable new patches and fix up the potential mess (registered
patches order) in klp_module_init(); this is nasty and prone to regressions
in the future development
+ add another MODULE_STATE where the kallsyms are visible but the module is not
used yet; this looks too complex; the module states are checked on "many"
locations
Example of patch stacking breakage:
===================================
The notifier could _not_ _simply_ ignore already initialized module objects.
For example, let's have three patches (P1, P2, P3) for functions a() and b()
where a() is from vmcore and b() is from a module M. Something like:
a() b()
P1 a1() b1()
P2 a2() b2()
P3 a3() b3(3)
If you load the module M after all patches are registered and enabled.
The ftrace ops for function a() and b() has listed the functions in this
order:
ops_a->func_stack -> list(a3,a2,a1)
ops_b->func_stack -> list(b3,b2,b1)
, so the pointer to b3() is the first and will be used.
Then you might have the following scenario. Let's start with state when patches
P1 and P2 are registered and enabled but the module M is not loaded. Then ftrace
ops for b() does not exist. Then we get into the following race:
CPU0 CPU1
load_module(M)
complete_formation()
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
klp_register_patch(P3);
klp_enable_patch(P3);
# STATE 1
klp_module_notify(M)
klp_module_notify_coming(P1);
klp_module_notify_coming(P2);
klp_module_notify_coming(P3);
# STATE 2
The ftrace ops for a() and b() then looks:
STATE1:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b3);
STATE2:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b2,b1,b3);
therefore, b2() is used for the module but a3() is used for vmcore
because they were the last added.
Example of the race with going modules:
=======================================
CPU0 CPU1
delete_module() #SYSCALL
try_stop_module()
mod->state = MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
klp_register_patch()
klp_enable_patch()
#save place to switch universe
b() # from module that is going
a() # from core (patched)
mod->exit();
Note that the function b() can be called until we call mod->exit().
If we do not apply patch against b() because it is in MODULE_STATE_GOING,
it will call patched a() with modified semantic and things might get wrong.
[jpoimboe@redhat.com: use one boolean instead of two]
Signed-off-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2015-03-12 19:55:13 +08:00
|
|
|
#ifdef CONFIG_LIVEPATCH
|
2016-03-23 08:03:16 +08:00
|
|
|
bool klp; /* Is this a livepatch module? */
|
livepatch: Fix subtle race with coming and going modules
There is a notifier that handles live patches for coming and going modules.
It takes klp_mutex lock to avoid races with coming and going patches but
it does not keep the lock all the time. Therefore the following races are
possible:
1. The notifier is called sometime in STATE_MODULE_COMING. The module
is visible by find_module() in this state all the time. It means that
new patch can be registered and enabled even before the notifier is
called. It might create wrong order of stacked patches, see below
for an example.
2. New patch could still see the module in the GOING state even after
the notifier has been called. It will try to initialize the related
object structures but the module could disappear at any time. There
will stay mess in the structures. It might even cause an invalid
memory access.
This patch solves the problem by adding a boolean variable into struct module.
The value is true after the coming and before the going handler is called.
New patches need to be applied when the value is true and they need to ignore
the module when the value is false.
Note that we need to know state of all modules on the system. The races are
related to new patches. Therefore we do not know what modules will get
patched.
Also note that we could not simply ignore going modules. The code from the
module could be called even in the GOING state until mod->exit() finishes.
If we start supporting patches with semantic changes between function
calls, we need to apply new patches to any still usable code.
See below for an example.
Finally note that the patch solves only the situation when a new patch is
registered. There are no such problems when the patch is being removed.
It does not matter who disable the patch first, whether the normal
disable_patch() or the module notifier. There is nothing to do
once the patch is disabled.
Alternative solutions:
======================
+ reject new patches when a patched module is coming or going; this is ugly
+ wait with adding new patch until the module leaves the COMING and GOING
states; this might be dangerous and complicated; we would need to release
kgr_lock in the middle of the patch registration to avoid a deadlock
with the coming and going handlers; also we might need a waitqueue for
each module which seems to be even bigger overhead than the boolean
+ stop modules from entering COMING and GOING states; wait until modules
leave these states when they are already there; looks complicated; we would
need to ignore the module that asked to stop the others to avoid a deadlock;
also it is unclear what to do when two modules asked to stop others and
both are in COMING state (situation when two new patches are applied)
+ always register/enable new patches and fix up the potential mess (registered
patches order) in klp_module_init(); this is nasty and prone to regressions
in the future development
+ add another MODULE_STATE where the kallsyms are visible but the module is not
used yet; this looks too complex; the module states are checked on "many"
locations
Example of patch stacking breakage:
===================================
The notifier could _not_ _simply_ ignore already initialized module objects.
For example, let's have three patches (P1, P2, P3) for functions a() and b()
where a() is from vmcore and b() is from a module M. Something like:
a() b()
P1 a1() b1()
P2 a2() b2()
P3 a3() b3(3)
If you load the module M after all patches are registered and enabled.
The ftrace ops for function a() and b() has listed the functions in this
order:
ops_a->func_stack -> list(a3,a2,a1)
ops_b->func_stack -> list(b3,b2,b1)
, so the pointer to b3() is the first and will be used.
Then you might have the following scenario. Let's start with state when patches
P1 and P2 are registered and enabled but the module M is not loaded. Then ftrace
ops for b() does not exist. Then we get into the following race:
CPU0 CPU1
load_module(M)
complete_formation()
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
klp_register_patch(P3);
klp_enable_patch(P3);
# STATE 1
klp_module_notify(M)
klp_module_notify_coming(P1);
klp_module_notify_coming(P2);
klp_module_notify_coming(P3);
# STATE 2
The ftrace ops for a() and b() then looks:
STATE1:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b3);
STATE2:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b2,b1,b3);
therefore, b2() is used for the module but a3() is used for vmcore
because they were the last added.
Example of the race with going modules:
=======================================
CPU0 CPU1
delete_module() #SYSCALL
try_stop_module()
mod->state = MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
klp_register_patch()
klp_enable_patch()
#save place to switch universe
b() # from module that is going
a() # from core (patched)
mod->exit();
Note that the function b() can be called until we call mod->exit().
If we do not apply patch against b() because it is in MODULE_STATE_GOING,
it will call patched a() with modified semantic and things might get wrong.
[jpoimboe@redhat.com: use one boolean instead of two]
Signed-off-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2015-03-12 19:55:13 +08:00
|
|
|
bool klp_alive;
|
2016-03-23 08:03:16 +08:00
|
|
|
|
|
|
|
/* Elf information */
|
|
|
|
struct klp_modinfo *klp_info;
|
livepatch: Fix subtle race with coming and going modules
There is a notifier that handles live patches for coming and going modules.
It takes klp_mutex lock to avoid races with coming and going patches but
it does not keep the lock all the time. Therefore the following races are
possible:
1. The notifier is called sometime in STATE_MODULE_COMING. The module
is visible by find_module() in this state all the time. It means that
new patch can be registered and enabled even before the notifier is
called. It might create wrong order of stacked patches, see below
for an example.
2. New patch could still see the module in the GOING state even after
the notifier has been called. It will try to initialize the related
object structures but the module could disappear at any time. There
will stay mess in the structures. It might even cause an invalid
memory access.
This patch solves the problem by adding a boolean variable into struct module.
The value is true after the coming and before the going handler is called.
New patches need to be applied when the value is true and they need to ignore
the module when the value is false.
Note that we need to know state of all modules on the system. The races are
related to new patches. Therefore we do not know what modules will get
patched.
Also note that we could not simply ignore going modules. The code from the
module could be called even in the GOING state until mod->exit() finishes.
If we start supporting patches with semantic changes between function
calls, we need to apply new patches to any still usable code.
See below for an example.
Finally note that the patch solves only the situation when a new patch is
registered. There are no such problems when the patch is being removed.
It does not matter who disable the patch first, whether the normal
disable_patch() or the module notifier. There is nothing to do
once the patch is disabled.
Alternative solutions:
======================
+ reject new patches when a patched module is coming or going; this is ugly
+ wait with adding new patch until the module leaves the COMING and GOING
states; this might be dangerous and complicated; we would need to release
kgr_lock in the middle of the patch registration to avoid a deadlock
with the coming and going handlers; also we might need a waitqueue for
each module which seems to be even bigger overhead than the boolean
+ stop modules from entering COMING and GOING states; wait until modules
leave these states when they are already there; looks complicated; we would
need to ignore the module that asked to stop the others to avoid a deadlock;
also it is unclear what to do when two modules asked to stop others and
both are in COMING state (situation when two new patches are applied)
+ always register/enable new patches and fix up the potential mess (registered
patches order) in klp_module_init(); this is nasty and prone to regressions
in the future development
+ add another MODULE_STATE where the kallsyms are visible but the module is not
used yet; this looks too complex; the module states are checked on "many"
locations
Example of patch stacking breakage:
===================================
The notifier could _not_ _simply_ ignore already initialized module objects.
For example, let's have three patches (P1, P2, P3) for functions a() and b()
where a() is from vmcore and b() is from a module M. Something like:
a() b()
P1 a1() b1()
P2 a2() b2()
P3 a3() b3(3)
If you load the module M after all patches are registered and enabled.
The ftrace ops for function a() and b() has listed the functions in this
order:
ops_a->func_stack -> list(a3,a2,a1)
ops_b->func_stack -> list(b3,b2,b1)
, so the pointer to b3() is the first and will be used.
Then you might have the following scenario. Let's start with state when patches
P1 and P2 are registered and enabled but the module M is not loaded. Then ftrace
ops for b() does not exist. Then we get into the following race:
CPU0 CPU1
load_module(M)
complete_formation()
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
klp_register_patch(P3);
klp_enable_patch(P3);
# STATE 1
klp_module_notify(M)
klp_module_notify_coming(P1);
klp_module_notify_coming(P2);
klp_module_notify_coming(P3);
# STATE 2
The ftrace ops for a() and b() then looks:
STATE1:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b3);
STATE2:
ops_a->func_stack -> list(a3,a2,a1);
ops_b->func_stack -> list(b2,b1,b3);
therefore, b2() is used for the module but a3() is used for vmcore
because they were the last added.
Example of the race with going modules:
=======================================
CPU0 CPU1
delete_module() #SYSCALL
try_stop_module()
mod->state = MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
klp_register_patch()
klp_enable_patch()
#save place to switch universe
b() # from module that is going
a() # from core (patched)
mod->exit();
Note that the function b() can be called until we call mod->exit().
If we do not apply patch against b() because it is in MODULE_STATE_GOING,
it will call patched a() with modified semantic and things might get wrong.
[jpoimboe@redhat.com: use one boolean instead of two]
Signed-off-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2015-03-12 19:55:13 +08:00
|
|
|
#endif
|
|
|
|
|
printk: Userspace format indexing support
We have a number of systems industry-wide that have a subset of their
functionality that works as follows:
1. Receive a message from local kmsg, serial console, or netconsole;
2. Apply a set of rules to classify the message;
3. Do something based on this classification (like scheduling a
remediation for the machine), rinse, and repeat.
As a couple of examples of places we have this implemented just inside
Facebook, although this isn't a Facebook-specific problem, we have this
inside our netconsole processing (for alarm classification), and as part
of our machine health checking. We use these messages to determine
fairly important metrics around production health, and it's important
that we get them right.
While for some kinds of issues we have counters, tracepoints, or metrics
with a stable interface which can reliably indicate the issue, in order
to react to production issues quickly we need to work with the interface
which most kernel developers naturally use when developing: printk.
Most production issues come from unexpected phenomena, and as such
usually the code in question doesn't have easily usable tracepoints or
other counters available for the specific problem being mitigated. We
have a number of lines of monitoring defence against problems in
production (host metrics, process metrics, service metrics, etc), and
where it's not feasible to reliably monitor at another level, this kind
of pragmatic netconsole monitoring is essential.
As one would expect, monitoring using printk is rather brittle for a
number of reasons -- most notably that the message might disappear
entirely in a new version of the kernel, or that the message may change
in some way that the regex or other classification methods start to
silently fail.
One factor that makes this even harder is that, under normal operation,
many of these messages are never expected to be hit. For example, there
may be a rare hardware bug which one wants to detect if it was to ever
happen again, but its recurrence is not likely or anticipated. This
precludes using something like checking whether the printk in question
was printed somewhere fleetwide recently to determine whether the
message in question is still present or not, since we don't anticipate
that it should be printed anywhere, but still need to monitor for its
future presence in the long-term.
This class of issue has happened on a number of occasions, causing
unhealthy machines with hardware issues to remain in production for
longer than ideal. As a recent example, some monitoring around
blk_update_request fell out of date and caused semi-broken machines to
remain in production for longer than would be desirable.
Searching through the codebase to find the message is also extremely
fragile, because many of the messages are further constructed beyond
their callsite (eg. btrfs_printk and other module-specific wrappers,
each with their own functionality). Even if they aren't, guessing the
format and formulation of the underlying message based on the aesthetics
of the message emitted is not a recipe for success at scale, and our
previous issues with fleetwide machine health checking demonstrate as
much.
This provides a solution to the issue of silently changed or deleted
printks: we record pointers to all printk format strings known at
compile time into a new .printk_index section, both in vmlinux and
modules. At runtime, this can then be iterated by looking at
<debugfs>/printk/index/<module>, which emits the following format, both
readable by humans and able to be parsed by machines:
$ head -1 vmlinux; shuf -n 5 vmlinux
# <level[,flags]> filename:line function "format"
<5> block/blk-settings.c:661 disk_stack_limits "%s: Warning: Device %s is misaligned\n"
<4> kernel/trace/trace.c:8296 trace_create_file "Could not create tracefs '%s' entry\n"
<6> arch/x86/kernel/hpet.c:144 _hpet_print_config "hpet: %s(%d):\n"
<6> init/do_mounts.c:605 prepare_namespace "Waiting for root device %s...\n"
<6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: auto-serialization disabled\n"
This mitigates the majority of cases where we have a highly-specific
printk which we want to match on, as we can now enumerate and check
whether the format changed or the printk callsite disappeared entirely
in userspace. This allows us to catch changes to printks we monitor
earlier and decide what to do about it before it becomes problematic.
There is no additional runtime cost for printk callers or printk itself,
and the assembly generated is exactly the same.
Signed-off-by: Chris Down <chris@chrisdown.name>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kees Cook <keescook@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Jessica Yu <jeyu@kernel.org> # for module.{c,h}
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/e42070983637ac5e384f17fbdbe86d19c7b212a5.1623775748.git.chris@chrisdown.name
2021-06-16 00:52:53 +08:00
|
|
|
#ifdef CONFIG_PRINTK_INDEX
|
|
|
|
unsigned int printk_index_size;
|
|
|
|
struct pi_entry **printk_index_start;
|
|
|
|
#endif
|
|
|
|
|
2008-07-23 08:24:26 +08:00
|
|
|
#ifdef CONFIG_MODULE_UNLOAD
|
|
|
|
/* What modules depend on me? */
|
2010-06-01 03:19:37 +08:00
|
|
|
struct list_head source_list;
|
|
|
|
/* What modules do I depend on? */
|
|
|
|
struct list_head target_list;
|
2008-07-23 08:24:26 +08:00
|
|
|
|
|
|
|
/* Destruction function. */
|
|
|
|
void (*exit)(void);
|
|
|
|
|
2014-11-10 06:59:29 +08:00
|
|
|
atomic_t refcnt;
|
2008-07-23 08:24:26 +08:00
|
|
|
#endif
|
2009-06-18 07:28:03 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_CONSTRUCTORS
|
|
|
|
/* Constructor functions. */
|
|
|
|
ctor_fn_t *ctors;
|
|
|
|
unsigned int num_ctors;
|
|
|
|
#endif
|
2017-12-12 00:36:46 +08:00
|
|
|
|
2018-01-13 01:55:03 +08:00
|
|
|
#ifdef CONFIG_FUNCTION_ERROR_INJECTION
|
2018-01-13 01:55:33 +08:00
|
|
|
struct error_injection_entry *ei_funcs;
|
2018-01-13 01:55:03 +08:00
|
|
|
unsigned int num_ei_funcs;
|
2017-12-12 00:36:46 +08:00
|
|
|
#endif
|
2016-10-28 16:22:25 +08:00
|
|
|
} ____cacheline_aligned __randomize_layout;
|
2007-05-09 17:35:15 +08:00
|
|
|
#ifndef MODULE_ARCH_INIT
|
|
|
|
#define MODULE_ARCH_INIT {}
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
ARM: module: Fix function kallsyms on Thumb-2
Thumb-2 functions have the lowest bit set in the symbol value in the
symtab. When kallsyms are generated for the vmlinux, the kallsyms are
generated from the output of nm, and nm clears the lowest bit.
$ arm-linux-gnueabihf-readelf -a vmlinux | grep show_interrupts
95947: 8015dc89 686 FUNC GLOBAL DEFAULT 2 show_interrupts
$ arm-linux-gnueabihf-nm vmlinux | grep show_interrupts
8015dc88 T show_interrupts
$ cat /proc/kallsyms | grep show_interrupts
8015dc88 T show_interrupts
However, for modules, the kallsyms uses the values in the symbol table
without modification, so for functions in modules, the lowest bit is set
in kallsyms.
$ arm-linux-gnueabihf-readelf -a drivers/net/tun.ko | grep tun_get_socket
333: 00002d4d 36 FUNC GLOBAL DEFAULT 1 tun_get_socket
$ arm-linux-gnueabihf-nm drivers/net/tun.ko | grep tun_get_socket
00002d4c T tun_get_socket
$ cat /proc/kallsyms | grep tun_get_socket
7f802d4d t tun_get_socket [tun]
Because of this, the symbol+offset of the crashing instruction shown in
oopses is incorrect when the crash is in a module. For example, given a
tun_get_socket which starts like this,
00002d4c <tun_get_socket>:
2d4c: 6943 ldr r3, [r0, #20]
2d4e: 4a07 ldr r2, [pc, #28]
2d50: 4293 cmp r3, r2
a crash when tun_get_socket is called with NULL results in:
PC is at tun_xdp+0xa3/0xa4 [tun]
pc : [<7f802d4c>]
As can be seen, the "PC is at" line reports the wrong symbol name, and
the symbol+offset will point to the wrong source line if it is passed to
gdb.
To solve this, add a way for archs to fixup the reading of these module
kallsyms values, and use that to clear the lowest bit for function
symbols on Thumb-2.
After the fix:
# cat /proc/kallsyms | grep tun_get_socket
7f802d4c t tun_get_socket [tun]
PC is at tun_get_socket+0x0/0x24 [tun]
pc : [<7f802d4c>]
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
2018-12-15 00:05:55 +08:00
|
|
|
#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
|
|
|
|
static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
|
|
|
|
{
|
|
|
|
return sym->st_value;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* FIXME: It'd be nice to isolate modules during init, too, so they
|
|
|
|
aren't used before they (may) fail. But presently too much code
|
|
|
|
(IDE & SCSI) require entry into the module during init.*/
|
2018-02-07 07:41:31 +08:00
|
|
|
static inline bool module_is_live(struct module *mod)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return mod->state != MODULE_STATE_GOING;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct module *__module_text_address(unsigned long addr);
|
2009-04-01 03:05:31 +08:00
|
|
|
struct module *__module_address(unsigned long addr);
|
|
|
|
bool is_module_address(unsigned long addr);
|
2017-02-27 22:37:36 +08:00
|
|
|
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
|
2010-03-10 17:57:54 +08:00
|
|
|
bool is_module_percpu_address(unsigned long addr);
|
2009-04-01 03:05:31 +08:00
|
|
|
bool is_module_text_address(unsigned long addr);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-07-27 05:55:01 +08:00
|
|
|
static inline bool within_module_core(unsigned long addr,
|
|
|
|
const struct module *mod)
|
2009-01-07 06:41:49 +08:00
|
|
|
{
|
2015-11-26 07:14:08 +08:00
|
|
|
return (unsigned long)mod->core_layout.base <= addr &&
|
|
|
|
addr < (unsigned long)mod->core_layout.base + mod->core_layout.size;
|
2009-01-07 06:41:49 +08:00
|
|
|
}
|
|
|
|
|
2014-07-27 05:55:01 +08:00
|
|
|
static inline bool within_module_init(unsigned long addr,
|
|
|
|
const struct module *mod)
|
2009-01-07 06:41:49 +08:00
|
|
|
{
|
2015-11-26 07:14:08 +08:00
|
|
|
return (unsigned long)mod->init_layout.base <= addr &&
|
|
|
|
addr < (unsigned long)mod->init_layout.base + mod->init_layout.size;
|
2009-01-07 06:41:49 +08:00
|
|
|
}
|
|
|
|
|
2014-07-27 05:55:01 +08:00
|
|
|
static inline bool within_module(unsigned long addr, const struct module *mod)
|
2014-07-27 05:54:01 +08:00
|
|
|
{
|
|
|
|
return within_module_init(addr, mod) || within_module_core(addr, mod);
|
|
|
|
}
|
|
|
|
|
2021-02-02 20:13:25 +08:00
|
|
|
/* Search for module by name: must be in a RCU-sched critical section. */
|
2008-12-06 08:03:59 +08:00
|
|
|
struct module *find_module(const char *name);
|
|
|
|
|
2007-05-08 15:28:39 +08:00
|
|
|
/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
|
2005-04-17 06:20:36 +08:00
|
|
|
symnum out of range. */
|
2007-05-08 15:28:39 +08:00
|
|
|
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
|
|
|
|
char *name, char *module_name, int *exported);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Look for this name: can be of form module:name. */
|
|
|
|
unsigned long module_kallsyms_lookup_name(const char *name);
|
|
|
|
|
2021-12-04 01:00:19 +08:00
|
|
|
extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
|
2016-04-12 03:32:09 +08:00
|
|
|
long code);
|
2021-12-04 01:00:19 +08:00
|
|
|
#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MODULE_UNLOAD
|
2015-01-22 08:43:14 +08:00
|
|
|
int module_refcount(struct module *mod);
|
2005-04-17 06:20:36 +08:00
|
|
|
void __symbol_put(const char *symbol);
|
2018-06-23 23:37:44 +08:00
|
|
|
#define symbol_put(x) __symbol_put(__stringify(x))
|
2005-04-17 06:20:36 +08:00
|
|
|
void symbol_put_addr(void *addr);
|
|
|
|
|
|
|
|
/* Sometimes we know we already have a refcount, and it's easier not
|
|
|
|
to handle the error case (which only happens with rmmod --wait). */
|
2012-03-26 10:20:52 +08:00
|
|
|
extern void __module_get(struct module *module);
|
2010-01-05 14:34:50 +08:00
|
|
|
|
2012-03-26 10:20:52 +08:00
|
|
|
/* This is the Right Way to get a module: if it fails, it's being removed,
|
|
|
|
* so pretend it's not there. */
|
|
|
|
extern bool try_module_get(struct module *module);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-10-18 13:47:25 +08:00
|
|
|
extern void module_put(struct module *module);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#else /*!CONFIG_MODULE_UNLOAD*/
|
2017-04-19 09:47:22 +08:00
|
|
|
static inline bool try_module_get(struct module *module)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return !module || module_is_live(module);
|
|
|
|
}
|
|
|
|
static inline void module_put(struct module *module)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void __module_get(struct module *module)
|
|
|
|
{
|
|
|
|
}
|
2014-01-16 07:48:49 +08:00
|
|
|
#define symbol_put(x) do { } while (0)
|
|
|
|
#define symbol_put_addr(p) do { } while (0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#endif /* CONFIG_MODULE_UNLOAD */
|
|
|
|
|
|
|
|
/* This is a #define so the string doesn't get put in every .o file */
|
|
|
|
#define module_name(mod) \
|
|
|
|
({ \
|
|
|
|
struct module *__mod = (mod); \
|
|
|
|
__mod ? __mod->name : "kernel"; \
|
|
|
|
})
|
|
|
|
|
sections: split dereference_function_descriptor()
There are two format specifiers to print out a pointer in symbolic
format: '%pS/%ps' and '%pF/%pf'. On most architectures, the two
mean exactly the same thing, but some architectures (ia64, ppc64,
parisc64) use an indirect pointer for C function pointers, where
the function pointer points to a function descriptor (which in
turn contains the actual pointer to the code). The '%pF/%pf, when
used appropriately, automatically does the appropriate function
descriptor dereference on such architectures.
The "when used appropriately" part is tricky. Basically this is
a subtle ABI detail, specific to some platforms, that made it to
the API level and people can be unaware of it and miss the whole
"we need to dereference the function" business out. [1] proves
that point (note that it fixes only '%pF' and '%pS', there might
be '%pf' and '%ps' cases as well).
It appears that we can handle everything within the affected
arches and make '%pS/%ps' smart enough to retire '%pF/%pf'.
Function descriptors live in .opd elf section and all affected
arches (ia64, ppc64, parisc64) handle it properly for kernel
and modules. So we, technically, can decide if the dereference
is needed by simply looking at the pointer: if it belongs to
.opd section then we need to dereference it.
The kernel and modules have their own .opd sections, obviously,
that's why we need to split dereference_function_descriptor()
and use separate kernel and module dereference arch callbacks.
This patch does the first step, it
a) adds dereference_kernel_function_descriptor() function.
b) adds a weak alias to dereference_module_function_descriptor()
function.
So, for the time being, we will have:
1) dereference_function_descriptor()
A generic function, that simply dereferences the pointer. There is
bunch of places that call it: kgdbts, init/main.c, extable, etc.
2) dereference_kernel_function_descriptor()
A function to call on kernel symbols that does kernel .opd section
address range test.
3) dereference_module_function_descriptor()
A function to call on modules' symbols that does modules' .opd
section address range test.
[1] https://marc.info/?l=linux-kernel&m=150472969730573
Link: http://lkml.kernel.org/r/20171109234830.5067-2-sergey.senozhatsky@gmail.com
To: Fenghua Yu <fenghua.yu@intel.com>
To: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: Paul Mackerras <paulus@samba.org>
To: Michael Ellerman <mpe@ellerman.id.au>
To: James Bottomley <jejb@parisc-linux.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-ia64@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Tested-by: Tony Luck <tony.luck@intel.com> #ia64
Tested-by: Santosh Sivaraj <santosh@fossix.org> #powerpc
Tested-by: Helge Deller <deller@gmx.de> #parisc64
Signed-off-by: Petr Mladek <pmladek@suse.com>
2017-11-10 07:48:25 +08:00
|
|
|
/* Dereference module function descriptor */
|
|
|
|
void *dereference_module_function_descriptor(struct module *mod, void *ptr);
|
|
|
|
|
2008-01-30 06:13:22 +08:00
|
|
|
/* For kallsyms to ask for address resolution. namebuf should be at
|
|
|
|
* least KSYM_NAME_LEN long: a pointer to namebuf is returned if
|
|
|
|
* found, otherwise NULL. */
|
2008-02-08 20:18:43 +08:00
|
|
|
const char *module_address_lookup(unsigned long addr,
|
2008-01-30 06:13:22 +08:00
|
|
|
unsigned long *symbolsize,
|
|
|
|
unsigned long *offset,
|
2021-07-08 09:09:20 +08:00
|
|
|
char **modname, const unsigned char **modbuildid,
|
2008-01-30 06:13:22 +08:00
|
|
|
char *namebuf);
|
2007-05-08 15:28:43 +08:00
|
|
|
int lookup_module_symbol_name(unsigned long addr, char *symname);
|
2007-05-08 15:28:47 +08:00
|
|
|
int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-01-16 07:48:49 +08:00
|
|
|
int register_module_notifier(struct notifier_block *nb);
|
|
|
|
int unregister_module_notifier(struct notifier_block *nb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void print_modules(void);
|
|
|
|
|
2015-05-22 06:49:37 +08:00
|
|
|
static inline bool module_requested_async_probing(struct module *module)
|
|
|
|
{
|
|
|
|
return module && module->async_probe_requested;
|
|
|
|
}
|
|
|
|
|
2016-03-23 08:03:16 +08:00
|
|
|
#ifdef CONFIG_LIVEPATCH
|
|
|
|
static inline bool is_livepatch_module(struct module *mod)
|
|
|
|
{
|
|
|
|
return mod->klp;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_LIVEPATCH */
|
|
|
|
static inline bool is_livepatch_module(struct module *mod)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_LIVEPATCH */
|
|
|
|
|
2017-10-25 01:37:00 +08:00
|
|
|
bool is_module_sig_enforced(void);
|
2019-01-28 08:03:45 +08:00
|
|
|
void set_module_sig_enforced(void);
|
2017-10-25 01:37:00 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#else /* !CONFIG_MODULES... */
|
|
|
|
|
2009-04-01 03:05:31 +08:00
|
|
|
static inline struct module *__module_address(unsigned long addr)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline struct module *__module_text_address(unsigned long addr)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2009-04-01 03:05:31 +08:00
|
|
|
static inline bool is_module_address(unsigned long addr)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-03-31 10:33:42 +08:00
|
|
|
static inline bool is_module_percpu_address(unsigned long addr)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-02-27 22:37:36 +08:00
|
|
|
static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-04-01 03:05:31 +08:00
|
|
|
static inline bool is_module_text_address(unsigned long addr)
|
2006-07-03 15:24:24 +08:00
|
|
|
{
|
2009-04-01 03:05:31 +08:00
|
|
|
return false;
|
2006-07-03 15:24:24 +08:00
|
|
|
}
|
|
|
|
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. These can be extracted by
userspace by reading from an fd created by fsopen().
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
Inside the kernel, formatted messages are malloc'd but unformatted messages
are not copied if they're either in the core .rodata section or in the
.rodata section of the filesystem module pinned by fs_context::fs_type.
The messages are only good till the fs_type is released.
Note that the logging object is shared between duplicated fs_context
structures. This is so that such as NFS which do a mount within a mount
can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:34:29 +08:00
|
|
|
static inline bool within_module_core(unsigned long addr,
|
|
|
|
const struct module *mod)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-04-16 02:18:33 +08:00
|
|
|
static inline bool within_module_init(unsigned long addr,
|
|
|
|
const struct module *mod)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool within_module(unsigned long addr, const struct module *mod)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Get/put a kernel symbol (calls should be symmetric) */
|
2020-10-27 23:11:32 +08:00
|
|
|
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
|
2014-01-16 07:48:49 +08:00
|
|
|
#define symbol_put(x) do { } while (0)
|
|
|
|
#define symbol_put_addr(x) do { } while (0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline void __module_get(struct module *module)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2017-04-19 09:47:22 +08:00
|
|
|
static inline bool try_module_get(struct module *module)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2017-04-19 09:47:22 +08:00
|
|
|
return true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void module_put(struct module *module)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#define module_name(mod) "kernel"
|
|
|
|
|
|
|
|
/* For kallsyms to ask for address resolution. NULL means not found. */
|
2008-02-08 20:18:43 +08:00
|
|
|
static inline const char *module_address_lookup(unsigned long addr,
|
2008-01-30 06:13:22 +08:00
|
|
|
unsigned long *symbolsize,
|
|
|
|
unsigned long *offset,
|
|
|
|
char **modname,
|
2021-07-08 09:09:20 +08:00
|
|
|
const unsigned char **modbuildid,
|
2008-01-30 06:13:22 +08:00
|
|
|
char *namebuf)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2007-05-08 15:28:43 +08:00
|
|
|
static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
|
|
|
|
{
|
|
|
|
return -ERANGE;
|
|
|
|
}
|
|
|
|
|
2007-05-08 15:28:47 +08:00
|
|
|
static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name)
|
|
|
|
{
|
|
|
|
return -ERANGE;
|
|
|
|
}
|
|
|
|
|
2007-05-08 15:28:39 +08:00
|
|
|
static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
|
|
|
|
char *type, char *name,
|
|
|
|
char *module_name, int *exported)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-05-08 15:28:39 +08:00
|
|
|
return -ERANGE;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long module_kallsyms_lookup_name(const char *name)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-01-16 07:48:49 +08:00
|
|
|
static inline int register_module_notifier(struct notifier_block *nb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
/* no events will happen anyway, so this can always succeed */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-01-16 07:48:49 +08:00
|
|
|
static inline int unregister_module_notifier(struct notifier_block *nb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-12-04 01:00:19 +08:00
|
|
|
#define module_put_and_kthread_exit(code) kthread_exit(code)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline void print_modules(void)
|
|
|
|
{
|
|
|
|
}
|
2015-05-22 06:49:37 +08:00
|
|
|
|
|
|
|
static inline bool module_requested_async_probing(struct module *module)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-10-25 01:37:00 +08:00
|
|
|
static inline bool is_module_sig_enforced(void)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-01-28 08:03:45 +08:00
|
|
|
static inline void set_module_sig_enforced(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
sections: split dereference_function_descriptor()
There are two format specifiers to print out a pointer in symbolic
format: '%pS/%ps' and '%pF/%pf'. On most architectures, the two
mean exactly the same thing, but some architectures (ia64, ppc64,
parisc64) use an indirect pointer for C function pointers, where
the function pointer points to a function descriptor (which in
turn contains the actual pointer to the code). The '%pF/%pf, when
used appropriately, automatically does the appropriate function
descriptor dereference on such architectures.
The "when used appropriately" part is tricky. Basically this is
a subtle ABI detail, specific to some platforms, that made it to
the API level and people can be unaware of it and miss the whole
"we need to dereference the function" business out. [1] proves
that point (note that it fixes only '%pF' and '%pS', there might
be '%pf' and '%ps' cases as well).
It appears that we can handle everything within the affected
arches and make '%pS/%ps' smart enough to retire '%pF/%pf'.
Function descriptors live in .opd elf section and all affected
arches (ia64, ppc64, parisc64) handle it properly for kernel
and modules. So we, technically, can decide if the dereference
is needed by simply looking at the pointer: if it belongs to
.opd section then we need to dereference it.
The kernel and modules have their own .opd sections, obviously,
that's why we need to split dereference_function_descriptor()
and use separate kernel and module dereference arch callbacks.
This patch does the first step, it
a) adds dereference_kernel_function_descriptor() function.
b) adds a weak alias to dereference_module_function_descriptor()
function.
So, for the time being, we will have:
1) dereference_function_descriptor()
A generic function, that simply dereferences the pointer. There is
bunch of places that call it: kgdbts, init/main.c, extable, etc.
2) dereference_kernel_function_descriptor()
A function to call on kernel symbols that does kernel .opd section
address range test.
3) dereference_module_function_descriptor()
A function to call on modules' symbols that does modules' .opd
section address range test.
[1] https://marc.info/?l=linux-kernel&m=150472969730573
Link: http://lkml.kernel.org/r/20171109234830.5067-2-sergey.senozhatsky@gmail.com
To: Fenghua Yu <fenghua.yu@intel.com>
To: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: Paul Mackerras <paulus@samba.org>
To: Michael Ellerman <mpe@ellerman.id.au>
To: James Bottomley <jejb@parisc-linux.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-ia64@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Tested-by: Tony Luck <tony.luck@intel.com> #ia64
Tested-by: Santosh Sivaraj <santosh@fossix.org> #powerpc
Tested-by: Helge Deller <deller@gmx.de> #parisc64
Signed-off-by: Petr Mladek <pmladek@suse.com>
2017-11-10 07:48:25 +08:00
|
|
|
/* Dereference module function descriptor */
|
|
|
|
static inline
|
|
|
|
void *dereference_module_function_descriptor(struct module *mod, void *ptr)
|
|
|
|
{
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
|
2007-02-14 07:19:06 +08:00
|
|
|
#endif /* CONFIG_MODULES */
|
|
|
|
|
|
|
|
#ifdef CONFIG_SYSFS
|
2007-11-02 01:39:50 +08:00
|
|
|
extern struct kset *module_kset;
|
|
|
|
extern struct kobj_type module_ktype;
|
|
|
|
extern int module_sysfs_initialized;
|
2007-02-14 07:19:06 +08:00
|
|
|
#endif /* CONFIG_SYSFS */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)
|
|
|
|
|
|
|
|
/* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */
|
|
|
|
|
|
|
|
#define __MODULE_STRING(x) __stringify(x)
|
|
|
|
|
2009-06-17 06:33:37 +08:00
|
|
|
#ifdef CONFIG_GENERIC_BUG
|
2010-10-06 02:29:27 +08:00
|
|
|
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
|
2009-06-17 06:33:37 +08:00
|
|
|
struct module *);
|
|
|
|
void module_bug_cleanup(struct module *);
|
|
|
|
|
|
|
|
#else /* !CONFIG_GENERIC_BUG */
|
|
|
|
|
2010-10-06 02:29:27 +08:00
|
|
|
static inline void module_bug_finalize(const Elf_Ehdr *hdr,
|
2009-06-17 06:33:37 +08:00
|
|
|
const Elf_Shdr *sechdrs,
|
|
|
|
struct module *mod)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void module_bug_cleanup(struct module *mod) {}
|
|
|
|
#endif /* CONFIG_GENERIC_BUG */
|
|
|
|
|
2018-12-11 00:37:25 +08:00
|
|
|
#ifdef CONFIG_RETPOLINE
|
2018-01-26 07:50:28 +08:00
|
|
|
extern bool retpoline_module_ok(bool has_retpoline);
|
|
|
|
#else
|
|
|
|
static inline bool retpoline_module_ok(bool has_retpoline)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-04-22 11:28:46 +08:00
|
|
|
#ifdef CONFIG_MODULE_SIG
|
|
|
|
static inline bool module_sig_ok(struct module *module)
|
|
|
|
{
|
|
|
|
return module->sig_ok;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_MODULE_SIG */
|
|
|
|
static inline bool module_sig_ok(struct module *module)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_MODULE_SIG */
|
|
|
|
|
2021-02-02 20:13:27 +08:00
|
|
|
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
|
|
|
|
struct module *, unsigned long),
|
|
|
|
void *data);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* _LINUX_MODULE_H */
|