2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_KERNEL_H
|
|
|
|
#define _LINUX_KERNEL_H
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdarg.h>
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/bitops.h>
|
2006-12-08 18:37:49 +08:00
|
|
|
#include <linux/log2.h>
|
2008-07-25 16:45:24 +08:00
|
|
|
#include <linux/typecheck.h>
|
2010-11-16 05:37:37 +08:00
|
|
|
#include <linux/printk.h>
|
kernel.h: handle pointers to arrays better in container_of()
If the first parameter of container_of() is a pointer to a
non-const-qualified array type (and the third parameter names a
non-const-qualified array member), the local variable __mptr will be
defined with a const-qualified array type. In ISO C, these types are
incompatible. They work as expected in GNU C, but some versions will
issue warnings. For example, GCC 4.9 produces the warning
"initialization from incompatible pointer type".
Here is an example of where the problem occurs:
-------------------------------------------------------
#include <linux/kernel.h>
#include <linux/module.h>
MODULE_LICENSE("GPL");
struct st {
int a;
char b[16];
};
static int __init example_init(void) {
struct st t = { .a = 101, .b = "hello" };
char (*p)[16] = &t.b;
struct st *x = container_of(p, struct st, b);
printk(KERN_DEBUG "%p %p\n", (void *)&t, (void *)x);
return 0;
}
static void __exit example_exit(void) {
}
module_init(example_init);
module_exit(example_exit);
-------------------------------------------------------
Building the module with gcc-4.9 results in these warnings (where '{m}'
is the module source and '{k}' is the kernel source):
-------------------------------------------------------
In file included from {m}/example.c:1:0:
{m}/example.c: In function `example_init':
{k}/include/linux/kernel.h:854:48: warning: initialization from incompatible pointer type
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
^
{m}/example.c:14:17: note: in expansion of macro `container_of'
struct st *x = container_of(p, struct st, b);
^
{k}/include/linux/kernel.h:854:48: warning: (near initialization for `x')
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
^
{m}/example.c:14:17: note: in expansion of macro `container_of'
struct st *x = container_of(p, struct st, b);
^
-------------------------------------------------------
Replace the type checking performed by the macro to avoid these
warnings. Make sure `*(ptr)` either has type compatible with the
member, or has type compatible with `void`, ignoring qualifiers. Raise
compiler errors if this is not true. This is stronger than the previous
behaviour, which only resulted in compiler warnings for a type mismatch.
[arnd@arndb.de: fix new warnings for container_of()]
Link: http://lkml.kernel.org/r/20170620200940.90557-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/20170525120316.24473-7-abbotti@mev.co.uk
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-13 05:33:04 +08:00
|
|
|
#include <linux/build_bug.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/byteorder.h>
|
2012-10-13 17:46:48 +08:00
|
|
|
#include <uapi/linux/kernel.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-05-25 05:33:03 +08:00
|
|
|
#define USHRT_MAX ((u16)(~0U))
|
|
|
|
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
|
|
|
|
#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
|
2005-04-17 06:20:36 +08:00
|
|
|
#define INT_MAX ((int)(~0U>>1))
|
|
|
|
#define INT_MIN (-INT_MAX - 1)
|
|
|
|
#define UINT_MAX (~0U)
|
|
|
|
#define LONG_MAX ((long)(~0UL>>1))
|
|
|
|
#define LONG_MIN (-LONG_MAX - 1)
|
|
|
|
#define ULONG_MAX (~0UL)
|
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:26 +08:00
|
|
|
#define LLONG_MAX ((long long)(~0ULL>>1))
|
|
|
|
#define LLONG_MIN (-LLONG_MAX - 1)
|
|
|
|
#define ULLONG_MAX (~0ULL)
|
2012-06-01 07:26:04 +08:00
|
|
|
#define SIZE_MAX (~(size_t)0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-01-24 07:54:00 +08:00
|
|
|
#define U8_MAX ((u8)~0U)
|
|
|
|
#define S8_MAX ((s8)(U8_MAX>>1))
|
|
|
|
#define S8_MIN ((s8)(-S8_MAX - 1))
|
|
|
|
#define U16_MAX ((u16)~0U)
|
|
|
|
#define S16_MAX ((s16)(U16_MAX>>1))
|
|
|
|
#define S16_MIN ((s16)(-S16_MAX - 1))
|
|
|
|
#define U32_MAX ((u32)~0U)
|
|
|
|
#define S32_MAX ((s32)(U32_MAX>>1))
|
|
|
|
#define S32_MIN ((s32)(-S32_MAX - 1))
|
|
|
|
#define U64_MAX ((u64)~0ULL)
|
|
|
|
#define S64_MAX ((s64)(U64_MAX>>1))
|
|
|
|
#define S64_MIN ((s64)(-S64_MAX - 1))
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define STACK_MAGIC 0xdeadbeef
|
|
|
|
|
2012-05-24 11:12:50 +08:00
|
|
|
/* Multiply @x by 0x01...01 (an unsigned long with the value 1 in every byte). */
#define REPEAT_BYTE(x) ((x) * (~0UL / 0xFF))
|
|
|
|
|
percpu: ensure the requested alignment is power of two
The percpu allocator expectedly assumes that the requested alignment
is power of two but hasn't been verifying the input. If the specified
alignment isn't power of two, the allocator can malfunction. Add the
sanity check.
The following is detailed analysis of the effects of alignments which
aren't power of two.
The alignment must at least be even, since the LSB of a chunk->map
element is used as the free/in-use flag of an area; besides, the
alignment must also be a power of 2, since ALIGN() does not work
correctly for other alignments yet is used by pcpu_fit_in_area(). IOW,
the current allocator only works well for power-of-2 aligned area
allocations.
See below opposite example for why an odd alignment doesn't work.
Let's assume area [16, 36) is free but its previous one is in-use, we
want to allocate a @size == 8 and @align == 7 area. The larger area
[16, 36) is split to three areas [16, 21), [21, 29), [29, 36)
eventually. However, due to the usage for a chunk->map element, the
actual offset of the aim area [21, 29) is 21 but is recorded in
relevant element as 20; moreover, the residual tail free area [29,
36) is mistook as in-use and is lost silently
Unlike macro roundup(), ALIGN(x, a) doesn't work if @a isn't a power
of 2 for example, roundup(10, 6) == 12 but ALIGN(10, 6) == 10, and
the latter result isn't desired obviously.
tj: Code style and patch description updates.
Signed-off-by: zijun_hu <zijun_hu@htc.com>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-10-14 15:12:54 +08:00
|
|
|
/* @a is a power of 2 value */
|
2010-04-13 17:21:46 +08:00
|
|
|
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
|
2017-04-12 02:08:34 +08:00
|
|
|
/*
 * ALIGN_DOWN(x, a) hands (x) - ((a) - 1) to __ALIGN_KERNEL() with the same
 * alignment @a. NOTE(review): __ALIGN_KERNEL() is defined elsewhere;
 * presumably it rounds up to a multiple of @a, which makes this expression
 * round @x down, assuming @a is a power of 2 - confirm against its definition.
 */
#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a))
|
2010-04-13 20:09:15 +08:00
|
|
|
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
|
2007-09-12 06:23:47 +08:00
|
|
|
#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
|
2008-02-06 17:37:05 +08:00
|
|
|
/*
 * Non-zero (1) when none of the bits selected by (a) - 1 are set in @x,
 * otherwise 0. @a is converted to the type of @x before the mask is built.
 */
#define IS_ALIGNED(x, a) (!((x) & ((typeof(x))(a) - 1)))
|
2006-11-27 11:05:22 +08:00
|
|
|
|
2016-11-01 21:40:11 +08:00
|
|
|
/* generic data direction definitions */
|
|
|
|
#define READ 0
|
|
|
|
#define WRITE 1
|
|
|
|
|
2007-05-07 05:51:05 +08:00
|
|
|
/*
 * ARRAY_SIZE - element count of @arr: its total size divided by the size of
 * its first element. NOTE(review): __must_be_array() is defined elsewhere;
 * presumably it evaluates to 0 and fails the build when @arr is not a true
 * array - confirm against its definition.
 */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
|
|
|
|
|
2016-04-26 23:32:27 +08:00
|
|
|
/*
 * u64_to_user_ptr - convert a u64 value into a __user pointer
 * @x: the value to convert; typecheck() is asked to verify that it is a u64
 *
 * The value is cast to uintptr_t first and then to void __user *.
 * Every use of @x is parenthesized so that the casts apply to the whole
 * argument expression and not merely to its first operand (for example
 * when the caller passes "a | b" or "cond ? a : b").
 */
#define u64_to_user_ptr(x) (		\
{					\
	typecheck(u64, (x));		\
	(void __user *)(uintptr_t)(x);	\
}					\
)
|
|
|
|
|
2010-02-10 17:20:29 +08:00
|
|
|
/*
|
|
|
|
* This looks more complex than it should be. But we need to
|
|
|
|
* get the type for the ~ right in round_down (it needs to be
|
|
|
|
* as wide as the result!), and we want to evaluate the macro
|
|
|
|
* arguments just once each.
|
|
|
|
*/
|
|
|
|
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
|
|
|
|
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
|
|
|
|
#define round_down(x, y) ((x) & ~__round_mask(x, y))
|
|
|
|
|
2006-06-26 19:57:28 +08:00
|
|
|
/*
 * FIELD_SIZEOF - size in bytes of member @f of type @t.
 * The member is named through a null pointer cast to (t *); the expression
 * is used only as the operand of sizeof.
 */
#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
|
2016-03-04 18:52:16 +08:00
|
|
|
#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
|
2017-09-09 07:13:45 +08:00
|
|
|
|
|
|
|
/*
 * DIV_ROUND_DOWN_ULL - divide @ll by @d, rounding down.
 * The dividend is copied into an unsigned long long temporary which is then
 * passed to do_div(); the temporary is the value of the macro (do_div() is
 * expected to leave the quotient in it).
 */
#define DIV_ROUND_DOWN_ULL(ll, d) \
	({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })

/*
 * DIV_ROUND_UP_ULL - divide @ll by @d, rounding up.
 * @ll is converted to unsigned long long before (d) - 1 is added, so the
 * addition is carried out in unsigned long long arithmetic instead of
 * wrapping at the width of a narrower argument type (e.g. two 32-bit
 * operands whose sum does not fit in 32 bits).
 */
#define DIV_ROUND_UP_ULL(ll, d) \
	DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d))
|
2011-07-26 15:35:26 +08:00
|
|
|
|
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d)
|
|
|
|
#else
|
|
|
|
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d)
|
|
|
|
#endif
|
2010-11-10 06:01:31 +08:00
|
|
|
|
|
|
|
/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */
|
2010-10-14 05:50:08 +08:00
|
|
|
#define roundup(x, y) ( \
|
|
|
|
{ \
|
2010-11-08 10:20:49 +08:00
|
|
|
const typeof(y) __y = y; \
|
2010-10-14 05:50:08 +08:00
|
|
|
(((x) + (__y - 1)) / __y) * __y; \
|
|
|
|
} \
|
|
|
|
)
|
2010-10-14 05:50:02 +08:00
|
|
|
#define rounddown(x, y) ( \
|
|
|
|
{ \
|
|
|
|
typeof(x) __x = (x); \
|
|
|
|
__x - (__x % (y)); \
|
|
|
|
} \
|
|
|
|
)
|
2012-08-25 08:25:01 +08:00
|
|
|
|
|
|
|
/*
|
linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors
While working on a thermal driver I encountered a scenario where the
divisor could be negative. Instead of adding local code to handle this, I
thought I would first try to add support for this in DIV_ROUND_CLOSEST.
Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend
and divisor variable types are signed. This should not alter current
behavior for users of the macro as previously negative divisors were
not supported.
Before:
DIV_ROUND_CLOSEST( 59, 4) = 15
DIV_ROUND_CLOSEST( 59, -4) = -14
DIV_ROUND_CLOSEST( -59, 4) = -15
DIV_ROUND_CLOSEST( -59, -4) = 14
After:
DIV_ROUND_CLOSEST( 59, 4) = 15
DIV_ROUND_CLOSEST( 59, -4) = -15
DIV_ROUND_CLOSEST( -59, 4) = -15
DIV_ROUND_CLOSEST( -59, -4) = 15
[akpm@linux-foundation.org: fix comment, per Guenter]
Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-25 07:01:01 +08:00
|
|
|
* Divide positive or negative dividend by positive or negative divisor
|
|
|
|
* and round to closest integer. Result is undefined for negative
|
|
|
|
* divisors if the dividend variable type is unsigned and for negative
|
|
|
|
* dividends if the divisor variable type is unsigned.
|
2012-08-25 08:25:01 +08:00
|
|
|
*/
|
2009-01-07 06:40:51 +08:00
|
|
|
#define DIV_ROUND_CLOSEST(x, divisor)( \
|
|
|
|
{ \
|
2012-08-25 08:25:01 +08:00
|
|
|
typeof(x) __x = x; \
|
|
|
|
typeof(divisor) __d = divisor; \
|
linux/kernel.h: fix DIV_ROUND_CLOSEST with unsigned divisors
Commit 263a523d18bc ("linux/kernel.h: Fix warning seen with W=1 due to
change in DIV_ROUND_CLOSEST") fixes a warning seen with W=1 due to
change in DIV_ROUND_CLOSEST.
Unfortunately, the C compiler converts divide operations with unsigned
divisors to unsigned, even if the dividend is signed and negative (for
example, -10 / 5U = 858993457). The C standard says "If one operand has
unsigned int type, the other operand is converted to unsigned int", so
the compiler is not to blame. As a result, DIV_ROUND_CLOSEST(0, 2U) and
similar operations now return bad values, since the automatic conversion
of expressions such as "0 - 2U/2" to unsigned was not taken into
account.
Fix by checking for the divisor variable type when deciding which
operation to perform. This fixes DIV_ROUND_CLOSEST(0, 2U), but still
returns bad values for negative dividends divided by unsigned divisors.
Mark the latter case as unsupported.
One observed effect of this problem is that the s2c_hwmon driver reports
a value of 4198403 instead of 0 if the ADC reads 0.
Other impact is unpredictable. Problem is seen if the divisor is an
unsigned variable or constant and the dividend is less than (divisor/2).
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Reported-by: Juergen Beisert <jbe@pengutronix.de>
Tested-by: Juergen Beisert <jbe@pengutronix.de>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: <stable@vger.kernel.org> [3.7.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 07:05:42 +08:00
|
|
|
(((typeof(x))-1) > 0 || \
|
linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors
While working on a thermal driver I encountered a scenario where the
divisor could be negative. Instead of adding local code to handle this, I
thought I would first try to add support for this in DIV_ROUND_CLOSEST.
Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend
and divisor variable types are signed. This should not alter current
behavior for users of the macro as previously negative divisors were
not supported.
Before:
DIV_ROUND_CLOSEST( 59, 4) = 15
DIV_ROUND_CLOSEST( 59, -4) = -14
DIV_ROUND_CLOSEST( -59, 4) = -15
DIV_ROUND_CLOSEST( -59, -4) = 14
After:
DIV_ROUND_CLOSEST( 59, 4) = 15
DIV_ROUND_CLOSEST( 59, -4) = -15
DIV_ROUND_CLOSEST( -59, 4) = -15
DIV_ROUND_CLOSEST( -59, -4) = 15
[akpm@linux-foundation.org: fix comment, per Guenter]
Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-25 07:01:01 +08:00
|
|
|
((typeof(divisor))-1) > 0 || \
|
|
|
|
(((__x) > 0) == ((__d) > 0))) ? \
|
2012-08-25 08:25:01 +08:00
|
|
|
(((__x) + ((__d) / 2)) / (__d)) : \
|
|
|
|
(((__x) - ((__d) / 2)) / (__d)); \
|
2009-01-07 06:40:51 +08:00
|
|
|
} \
|
|
|
|
)
|
2015-04-17 03:43:45 +08:00
|
|
|
/*
|
|
|
|
* Same as above but for u64 dividends. divisor must be a 32-bit
|
|
|
|
* number.
|
|
|
|
*/
|
|
|
|
#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \
|
|
|
|
{ \
|
|
|
|
typeof(divisor) __d = divisor; \
|
|
|
|
unsigned long long _tmp = (x) + (__d) / 2; \
|
|
|
|
do_div(_tmp, __d); \
|
|
|
|
_tmp; \
|
|
|
|
} \
|
|
|
|
)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-03-10 08:41:01 +08:00
|
|
|
/*
 * Multiplies an integer by a fraction, while avoiding unnecessary
 * overflow or loss of precision.
 *
 * @x:     the integer to scale
 * @numer: numerator of the fraction
 * @denom: denominator of the fraction
 *
 * @x is split into its quotient and remainder with respect to @denom; the
 * quotient is multiplied by @numer and the scaled remainder is added back
 * after being divided by @denom, so that x * numer is never formed.
 *
 * Each argument is evaluated exactly once, and the temporaries carry a
 * trailing underscore so that an argument named "quot" or "rem" in the
 * caller is not shadowed by the macro's own locals.
 */
#define mult_frac(x, numer, denom)(			\
{							\
	__typeof__(x) x_ = (x);				\
	__typeof__(numer) n_ = (numer);			\
	__typeof__(denom) d_ = (denom);			\
	__typeof__(x_) q_ = x_ / d_;			\
	__typeof__(x_) r_ = x_ % d_;			\
	(q_ * n_) + ((r_ * n_) / d_);			\
}							\
)
|
|
|
|
|
|
|
|
|
2008-07-05 17:14:23 +08:00
|
|
|
/*
 * _RET_IP_ - __builtin_return_address(0) converted to unsigned long.
 * The expansion is fully parenthesized so that it behaves as a single
 * operand wherever it is used (e.g. as the operand of sizeof).
 */
#define _RET_IP_ ((unsigned long)__builtin_return_address(0))
|
|
|
|
/*
 * _THIS_IP_ - address of the point of use, as an unsigned long.
 * A statement expression declares a block-local label (__label__), places
 * it, and yields the label's address (GNU "&&label") cast to unsigned long.
 */
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
|
|
|
|
|
2009-06-19 14:08:50 +08:00
|
|
|
/*
 * sector_div(n, b) - divide @n by @b in place and yield the remainder.
 *
 * With CONFIG_LBDAF the work is delegated to do_div(). Otherwise plain
 * C division is used: the quotient is stored back into @n (which must
 * therefore be an lvalue) and the remainder is returned as an int.
 *
 * The divisor is captured in a local first so that @b is evaluated only
 * once even though both the remainder and the quotient are computed from it.
 */
#ifdef CONFIG_LBDAF
# include <asm/div64.h>
# define sector_div(a, b) do_div(a, b)
#else
# define sector_div(n, b)( \
{ \
	__typeof__(b) _div = (b); \
	int _res; \
	_res = (n) % _div; \
	(n) /= _div; \
	_res; \
} \
)
#endif
|
|
|
|
|
2007-05-10 18:15:18 +08:00
|
|
|
/**
|
|
|
|
* upper_32_bits - return bits 32-63 of a number
|
|
|
|
* @n: the number we're accessing
|
|
|
|
*
|
|
|
|
* A basic shift-right of a 64- or 32-bit quantity. Use this to suppress
|
|
|
|
* the "right shift count >= width of type" warning when that quantity is
|
|
|
|
* 32-bits.
|
|
|
|
*/
|
|
|
|
#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16))
|
|
|
|
|
2008-07-30 13:33:42 +08:00
|
|
|
/**
|
|
|
|
* lower_32_bits - return bits 0-31 of a number
|
|
|
|
* @n: the number we're accessing
|
|
|
|
*/
|
|
|
|
#define lower_32_bits(n) ((u32)(n))
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct completion;
|
2006-01-10 12:51:37 +08:00
|
|
|
struct pt_regs;
|
|
|
|
struct user;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-08-13 06:08:59 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_VOLUNTARY
|
|
|
|
extern int _cond_resched(void);
|
|
|
|
# define might_resched() _cond_resched()
|
|
|
|
#else
|
|
|
|
# define might_resched() do { } while (0)
|
|
|
|
#endif
|
|
|
|
|
2011-06-09 01:31:56 +08:00
|
|
|
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
2014-09-24 16:18:56 +08:00
|
|
|
void ___might_sleep(const char *file, int line, int preempt_offset);
|
2009-12-23 18:08:18 +08:00
|
|
|
void __might_sleep(const char *file, int line, int preempt_offset);
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* might_sleep - annotation for functions that can sleep
|
|
|
|
*
|
|
|
|
* this macro will print a stack trace if it is executed in an atomic
|
|
|
|
* context (spinlock, irq-handler, ...).
|
|
|
|
*
|
|
|
|
* This is a useful debugging help to be able to catch problems early and not
|
2006-11-30 11:46:13 +08:00
|
|
|
* be bitten later when the calling function happens to sleep when it is not
|
2005-04-17 06:20:36 +08:00
|
|
|
* supposed to.
|
|
|
|
*/
|
2005-06-26 05:57:39 +08:00
|
|
|
# define might_sleep() \
|
2009-07-16 21:44:29 +08:00
|
|
|
do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
|
sched: don't cause task state changes in nested sleep debugging
Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report
on nested sleep conditions, which we generally want to avoid because the
inner sleeping operation can re-set the thread state to TASK_RUNNING,
but that will then cause the outer sleep loop not actually sleep when it
calls schedule.
However, that's actually valid traditional behavior, with the inner
sleep being some fairly rare case (like taking a sleeping lock that
normally doesn't actually need to sleep).
And the debug code would actually change the state of the task to
TASK_RUNNING internally, which makes that kind of traditional and
working code not work at all, because now the nested sleep doesn't just
sometimes cause the outer one to not block, but will cause it to happen
every time.
In particular, it will cause the cardbus kernel daemon (pccardd) to
basically busy-loop doing scheduling, converting a laptop into a heater,
as reported by Bruno Prémont. But there may be other legacy uses of
that nested sleep model in other drivers that are also likely to never
get converted to the new model.
This fixes both cases:
- don't set TASK_RUNNING when the nested condition happens (note: even
if WARN_ONCE() only _warns_ once, the return value isn't whether the
warning happened, but whether the condition for the warning was true.
So despite the warning only happening once, the "if (WARN_ON(..))"
would trigger for every nested sleep.
- in the cases where we knowingly disable the warning by using
"sched_annotate_sleep()", don't change the task state (that is used
for all core scheduling decisions), instead use '->task_state_change'
that is used for the debugging decision itself.
(Credit for the second part of the fix goes to Oleg Nesterov: "Can't we
avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the
suggested change to use 'task_state_change' as part of the test)
Reported-and-bisected-by: Bruno Prémont <bonbons@linux-vserver.org>
Tested-by: Rafael J Wysocki <rjw@rjwysocki.net>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>,
Cc: Ilya Dryomov <ilya.dryomov@inktank.com>,
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Hurley <peter@hurleysoftware.com>,
Cc: Davidlohr Bueso <dave@stgolabs.net>,
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-02 04:23:32 +08:00
|
|
|
# define sched_annotate_sleep() (current->task_state_change = 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2014-09-24 16:18:56 +08:00
|
|
|
static inline void ___might_sleep(const char *file, int line,
|
|
|
|
int preempt_offset) { }
|
2009-12-23 18:08:18 +08:00
|
|
|
static inline void __might_sleep(const char *file, int line,
|
|
|
|
int preempt_offset) { }
|
2005-06-26 05:57:39 +08:00
|
|
|
# define might_sleep() do { might_resched(); } while (0)
|
2014-09-24 16:18:49 +08:00
|
|
|
# define sched_annotate_sleep() do { } while (0)
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
2006-06-23 17:05:42 +08:00
|
|
|
/* might_sleep_if - invoke might_sleep() only when @cond evaluates to true. */
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
|
2005-06-26 05:57:39 +08:00
|
|
|
|
kernel.h: make abs() work with 64-bit types
For 64-bit arguments, the abs macro casts it to an int which leads to
lost precision and may cause incorrect results. To deal with 64-bit
types abs64 macro has been introduced but still there are places where
abs macro is used incorrectly.
To deal with the problem, expand abs macro such that it operates on s64
type when dealing with 64-bit types while still returning long when
dealing with smaller types.
This fixes one known bug (per John):
The internal clocksteering done for fine-grained error correction uses a
: logarithmic approximation, so any time adjtimex() adjusts the clock
: steering, timekeeping_freqadjust() quickly approximates the correct clock
: frequency over a series of ticks.
:
: Unfortunately, the logic in timekeeping_freqadjust(), introduced in commit
: dc491596f639438 (Rework frequency adjustments to work better w/ nohz),
: used the abs() function with a s64 error value to calculate the size of
: the approximated adjustment to be made.
:
: Per include/linux/kernel.h: "abs() should not be used for 64-bit types
: (s64, u64, long long) - use abs64()".
:
: Thus on 32-bit platforms, this resulted in the clocksteering taking a
: quite dampened random walk trying to converge on the proper frequency,
: which caused the adjustments to be made much slower than intended (most
: easily observed when large adjustments are made).
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reported-by: John Stultz <john.stultz@linaro.org>
Tested-by: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-10 06:58:10 +08:00
|
|
|
/**
|
|
|
|
* abs - return absolute value of an argument
|
include/linux/kernel.h: change abs() macro so it uses consistent return type
Rewrite abs() so that its return type does not depend on the
architecture and no unexpected type conversions happen inside of it. The
only conversion is from unsigned to signed type. char is left as a
return type but treated as a signed type regardless of its actual
signedness.
With the old version, int arguments were promoted to long and depending
on architecture a long argument might result in s64 or long return type
(which may or may not be the same).
This came after some back and forth with Nicolas. The current macro has
a different return type (for the same input type) depending on
architecture, which might be mildly irritating.
An alternative version would promote to int like so:
#define abs(x) __abs_choose_expr(x, long long, \
__abs_choose_expr(x, long, \
__builtin_choose_expr( \
sizeof(x) <= sizeof(int), \
({ int __x = (x); __x<0?-__x:__x; }), \
((void)0))))
I have no preference but imagine Linus might. :] Nicolas' argument against
it is that promoting to int causes inconsistent behaviour:
int main(void) {
unsigned short a = 0, b = 1, c = a - b;
unsigned short d = abs(a - b);
unsigned short e = abs(c);
printf("%u %u\n", d, e); // prints: 1 65535
}
Then again, no sane person expects consistent behaviour from C integer
arithmetic. ;)
Note:
__builtin_types_compatible_p(unsigned char, char) is always false, and
__builtin_types_compatible_p(signed char, char) is also always false.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:57:58 +08:00
|
|
|
* @x: the value. If it is unsigned type, it is converted to signed type first.
|
|
|
|
* char is treated as if it was signed (regardless of whether it really is)
|
|
|
|
* but the macro's return type is preserved as char.
|
kernel.h: make abs() work with 64-bit types
For 64-bit arguments, the abs macro casts it to an int which leads to
lost precision and may cause incorrect results. To deal with 64-bit
types abs64 macro has been introduced but still there are places where
abs macro is used incorrectly.
To deal with the problem, expand abs macro such that it operates on s64
type when dealing with 64-bit types while still returning long when
dealing with smaller types.
This fixes one known bug (per John):
The internal clocksteering done for fine-grained error correction uses a
: logarithmic approximation, so any time adjtimex() adjusts the clock
: steering, timekeeping_freqadjust() quickly approximates the correct clock
: frequency over a series of ticks.
:
: Unfortunately, the logic in timekeeping_freqadjust(), introduced in commit
: dc491596f639438 (Rework frequency adjustments to work better w/ nohz),
: used the abs() function with a s64 error value to calculate the size of
: the approximated adjustment to be made.
:
: Per include/linux/kernel.h: "abs() should not be used for 64-bit types
: (s64, u64, long long) - use abs64()".
:
: Thus on 32-bit platforms, this resulted in the clocksteering taking a
: quite dampened random walk trying to converge on the proper frequency,
: which caused the adjustments to be made much slower than intended (most
: easily observed when large adjustments are made).
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reported-by: John Stultz <john.stultz@linaro.org>
Tested-by: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-10 06:58:10 +08:00
|
|
|
*
|
include/linux/kernel.h: change abs() macro so it uses consistent return type
Rewrite abs() so that its return type does not depend on the
architecture and no unexpected type conversion happen inside of it. The
only conversion is from unsigned to signed type. char is left as a
return type but treated as a signed type regradless of it's actual
signedness.
With the old version, int arguments were promoted to long and depending
on architecture a long argument might result in s64 or long return type
(which may or may not be the same).
This came after some back and forth with Nicolas. The current macro has
different return type (for the same input type) depending on
architecture which might be midly iritating.
An alternative version would promote to int like so:
#define abs(x) __abs_choose_expr(x, long long, \
__abs_choose_expr(x, long, \
__builtin_choose_expr( \
sizeof(x) <= sizeof(int), \
({ int __x = (x); __x<0?-__x:__x; }), \
((void)0))))
I have no preference but imagine Linus might. :] Nicolas argument against
is that promoting to int causes iconsistent behaviour:
int main(void) {
unsigned short a = 0, b = 1, c = a - b;
unsigned short d = abs(a - b);
unsigned short e = abs(c);
printf("%u %u\n", d, e); // prints: 1 65535
}
Then again, no sane person expects consistent behaviour from C integer
arithmetic. ;)
Note:
__builtin_types_compatible_p(unsigned char, char) is always false, and
__builtin_types_compatible_p(signed char, char) is also always false.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:57:58 +08:00
|
|
|
* Return: an absolute value of x.
|
2011-01-13 08:59:35 +08:00
|
|
|
*/
|
include/linux/kernel.h: change abs() macro so it uses consistent return type
Rewrite abs() so that its return type does not depend on the
architecture and no unexpected type conversion happen inside of it. The
only conversion is from unsigned to signed type. char is left as a
return type but treated as a signed type regradless of it's actual
signedness.
With the old version, int arguments were promoted to long and depending
on architecture a long argument might result in s64 or long return type
(which may or may not be the same).
This came after some back and forth with Nicolas. The current macro has
different return type (for the same input type) depending on
architecture which might be midly iritating.
An alternative version would promote to int like so:
#define abs(x) __abs_choose_expr(x, long long, \
__abs_choose_expr(x, long, \
__builtin_choose_expr( \
sizeof(x) <= sizeof(int), \
({ int __x = (x); __x<0?-__x:__x; }), \
((void)0))))
I have no preference but imagine Linus might. :] Nicolas argument against
is that promoting to int causes iconsistent behaviour:
int main(void) {
unsigned short a = 0, b = 1, c = a - b;
unsigned short d = abs(a - b);
unsigned short e = abs(c);
printf("%u %u\n", d, e); // prints: 1 65535
}
Then again, no sane person expects consistent behaviour from C integer
arithmetic. ;)
Note:
__builtin_types_compatible_p(unsigned char, char) is always false, and
__builtin_types_compatible_p(signed char, char) is also always false.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:57:58 +08:00
|
|
|
/*
 * Implementation note: abs() tries each signed/unsigned integer type pair
 * from long long down to char via __abs_choose_expr(). Plain char is
 * compatible with neither signed char nor unsigned char, so it gets its own
 * final branch, which computes in signed char and casts the result back to
 * char. Any other argument type selects ((void)0), a void expression.
 */
#define abs(x) __abs_choose_expr(x, long long, \
|
|
|
|
__abs_choose_expr(x, long, \
|
|
|
|
__abs_choose_expr(x, int, \
|
|
|
|
__abs_choose_expr(x, short, \
|
|
|
|
__abs_choose_expr(x, char, \
|
|
|
|
__builtin_choose_expr( \
|
|
|
|
__builtin_types_compatible_p(typeof(x), char), \
|
|
|
|
(char)({ signed char __x = (x); __x<0?-__x:__x; }), \
|
|
|
|
((void)0)))))))
|
|
|
|
|
|
|
|
/*
 * If x has type "signed type" or "unsigned type", evaluate to the absolute
 * value of x computed in "signed type" (x is evaluated once, into __x);
 * otherwise evaluate to "other".
 */
#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
|
|
|
|
__builtin_types_compatible_p(typeof(x), signed type) || \
|
|
|
|
__builtin_types_compatible_p(typeof(x), unsigned type), \
|
|
|
|
({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
|
kernel.h: make abs() work with 64-bit types
For 64-bit arguments, the abs macro casts it to an int which leads to
lost precision and may cause incorrect results. To deal with 64-bit
types abs64 macro has been introduced but still there are places where
abs macro is used incorrectly.
To deal with the problem, expand abs macro such that it operates on s64
type when dealing with 64-bit types while still returning long when
dealing with smaller types.
This fixes one known bug (per John):
The internal clocksteering done for fine-grained error correction uses a
: logarithmic approximation, so any time adjtimex() adjusts the clock
: steering, timekeeping_freqadjust() quickly approximates the correct clock
: frequency over a series of ticks.
:
: Unfortunately, the logic in timekeeping_freqadjust(), introduced in commit
: dc491596f639438 (Rework frequency adjustments to work better w/ nohz),
: used the abs() function with a s64 error value to calculate the size of
: the approximated adjustment to be made.
:
: Per include/linux/kernel.h: "abs() should not be used for 64-bit types
: (s64, u64, long long) - use abs64()".
:
: Thus on 32-bit platforms, this resulted in the clocksteering to take a
: quite dampened random walk trying to converge on the proper frequency,
: which caused the adjustments to be made much slower than intended (most
: easily observed when large adjustments are made).
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Reported-by: John Stultz <john.stultz@linaro.org>
Tested-by: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-10 06:58:10 +08:00
|
|
|
|
2014-01-22 09:29:40 +08:00
|
|
|
/**
|
|
|
|
* reciprocal_scale - "scale" a value into range [0, ep_ro)
|
|
|
|
* @val: value
|
|
|
|
* @ep_ro: right open interval endpoint
|
|
|
|
*
|
|
|
|
* Perform a "reciprocal multiplication" in order to "scale" a value into
|
|
|
|
* range [0, ep_ro), where the upper interval endpoint is right-open.
|
|
|
|
* This is useful, e.g. for accessing an index of an array containing
|
|
|
|
* ep_ro elements. Think of it as a sort of modulus, only that
|
|
|
|
* the result isn't that of modulo. ;) Note that if initial input is a
|
|
|
|
* small value, then result will return 0.
|
|
|
|
*
|
|
|
|
* Return: a result based on val in interval [0, ep_ro).
|
|
|
|
*/
|
|
|
|
static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
{
	/* Widen before multiplying so the full 64-bit product is kept. */
	u64 product = (u64)val * ep_ro;

	/* The upper 32 bits of the product are the scaled value. */
	return (u32)(product >> 32);
}
|
|
|
|
|
2013-12-13 09:12:24 +08:00
|
|
|
#if defined(CONFIG_MMU) && \
|
|
|
|
(defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
|
sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults
Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with
pagefault_disable()") removed might_sleep() checks for all user access
code (that uses might_fault()).
The reason was to disable wrong "sleep in atomic" warnings in the
following scenario:
pagefault_disable()
rc = copy_to_user(...)
pagefault_enable()
Which is valid, as pagefault_disable() increments the preempt counter
and therefore disables the pagefault handler. copy_to_user() will not
sleep and return an error code if a page is not available.
However, as all might_sleep() checks are removed,
CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario:
spin_lock(&lock);
rc = copy_to_user(...)
spin_unlock(&lock)
If the kernel is compiled with preemption turned on, preempt_disable()
will make in_atomic() detect disabled preemption. The fault handler would
correctly never sleep on user access.
However, with preemption turned off, preempt_disable() is usually a NOP
(with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to
detect disabled preemption nor disabled pagefaults. The fault handler
could sleep.
We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access
functions again, otherwise we can end up with horrible deadlocks.
Root of all evil is that pagefault_disable() acts almost as
preempt_disable(), depending on preemption being turned on/off.
As we now have pagefault_disabled(), we can use it to distinguish
whether user access functions might sleep.
Convert might_fault() into a macro that calls __might_fault(), to
allow proper file + line messages in case of a might_sleep() warning.
Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-11 23:52:07 +08:00
|
|
|
#define might_fault() __might_fault(__FILE__, __LINE__)
|
|
|
|
void __might_fault(const char *file, int line);
|
2008-09-10 19:37:17 +08:00
|
|
|
#else
|
2013-05-26 22:32:23 +08:00
|
|
|
static inline void might_fault(void) { }
|
2008-09-10 19:37:17 +08:00
|
|
|
#endif
|
|
|
|
|
[PATCH] Notifier chain update: API changes
The kernel's implementation of notifier chains is unsafe. There is no
protection against entries being added to or removed from a chain while the
chain is in use. The issues were discussed in this thread:
http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2
We noticed that notifier chains in the kernel fall into two basic usage
classes:
"Blocking" chains are always called from a process context
and the callout routines are allowed to sleep;
"Atomic" chains can be called from an atomic context and
the callout routines are not allowed to sleep.
We decided to codify this distinction and make it part of the API. Therefore
this set of patches introduces three new, parallel APIs: one for blocking
notifiers, one for atomic notifiers, and one for "raw" notifiers (which is
really just the old API under a new name). New kinds of data structures are
used for the heads of the chains, and new routines are defined for
registration, unregistration, and calling a chain. The three APIs are
explained in include/linux/notifier.h and their implementation is in
kernel/sys.c.
With atomic and blocking chains, the implementation guarantees that the chain
links will not be corrupted and that chain callers will not get messed up by
entries being added or removed. For raw chains the implementation provides no
guarantees at all; users of this API must provide their own protections. (The
idea was that situations may come up where the assumptions of the atomic and
blocking APIs are not appropriate, so it should be possible for users to
handle these things in their own way.)
There are some limitations, which should not be too hard to live with. For
atomic/blocking chains, registration and unregistration must always be done in
a process context since the chain is protected by a mutex/rwsem. Also, a
callout routine for a non-raw chain must not try to register or unregister
entries on its own chain. (This did happen in a couple of places and the code
had to be changed to avoid it.)
Since atomic chains may be called from within an NMI handler, they cannot use
spinlocks for synchronization. Instead we use RCU. The overhead falls almost
entirely in the unregister routine, which is okay since unregistration is much
less frequent than calling a chain.
Here is the list of chains that we adjusted and their classifications. None
of them use the raw API, so for the moment it is only a placeholder.
ATOMIC CHAINS
-------------
arch/i386/kernel/traps.c: i386die_chain
arch/ia64/kernel/traps.c: ia64die_chain
arch/powerpc/kernel/traps.c: powerpc_die_chain
arch/sparc64/kernel/traps.c: sparc64die_chain
arch/x86_64/kernel/traps.c: die_chain
drivers/char/ipmi/ipmi_si_intf.c: xaction_notifier_list
kernel/panic.c: panic_notifier_list
kernel/profile.c: task_free_notifier
net/bluetooth/hci_core.c: hci_notifier
net/ipv4/netfilter/ip_conntrack_core.c: ip_conntrack_chain
net/ipv4/netfilter/ip_conntrack_core.c: ip_conntrack_expect_chain
net/ipv6/addrconf.c: inet6addr_chain
net/netfilter/nf_conntrack_core.c: nf_conntrack_chain
net/netfilter/nf_conntrack_core.c: nf_conntrack_expect_chain
net/netlink/af_netlink.c: netlink_chain
BLOCKING CHAINS
---------------
arch/powerpc/platforms/pseries/reconfig.c: pSeries_reconfig_chain
arch/s390/kernel/process.c: idle_chain
arch/x86_64/kernel/process.c idle_notifier
drivers/base/memory.c: memory_chain
drivers/cpufreq/cpufreq.c cpufreq_policy_notifier_list
drivers/cpufreq/cpufreq.c cpufreq_transition_notifier_list
drivers/macintosh/adb.c: adb_client_list
drivers/macintosh/via-pmu.c sleep_notifier_list
drivers/macintosh/via-pmu68k.c sleep_notifier_list
drivers/macintosh/windfarm_core.c wf_client_list
drivers/usb/core/notify.c usb_notifier_list
drivers/video/fbmem.c fb_notifier_list
kernel/cpu.c cpu_chain
kernel/module.c module_notify_list
kernel/profile.c munmap_notifier
kernel/profile.c task_exit_notifier
kernel/sys.c reboot_notifier_list
net/core/dev.c netdev_chain
net/decnet/dn_dev.c: dnaddr_chain
net/ipv4/devinet.c: inetaddr_chain
It's possible that some of these classifications are wrong. If they are,
please let us know or submit a patch to fix them. Note that any chain that
gets called very frequently should be atomic, because the rwsem read-locking
used for blocking chains is very likely to incur cache misses on SMP systems.
(However, if the chain's callout routines may sleep then the chain cannot be
atomic.)
The patch set was written by Alan Stern and Chandra Seetharaman, incorporating
material written by Keith Owens and suggestions from Paul McKenney and Andrew
Morton.
[jes@sgi.com: restructure the notifier chain initialization macros]
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-27 17:16:30 +08:00
|
|
|
extern struct atomic_notifier_head panic_notifier_list;
|
2010-08-11 09:03:28 +08:00
|
|
|
extern long (*panic_blink)(int state);
|
2012-01-13 09:17:17 +08:00
|
|
|
__printf(1, 2)
|
2016-09-14 00:37:29 +08:00
|
|
|
void panic(const char *fmt, ...) __noreturn __cold;
|
2016-03-23 05:27:17 +08:00
|
|
|
void nmi_panic(struct pt_regs *regs, const char *msg);
|
2006-03-23 19:00:57 +08:00
|
|
|
extern void oops_enter(void);
|
|
|
|
extern void oops_exit(void);
|
2010-08-11 09:03:30 +08:00
|
|
|
void print_oops_end_marker(void);
|
2006-03-23 19:00:57 +08:00
|
|
|
extern int oops_may_print(void);
|
2016-09-14 00:37:29 +08:00
|
|
|
void do_exit(long error_code) __noreturn;
|
|
|
|
void complete_and_exit(struct completion *, long) __noreturn;
|
2011-03-23 07:34:40 +08:00
|
|
|
|
locking/refcounts, x86/asm: Implement fast refcount overflow protection
This implements refcount_t overflow protection on x86 without a noticeable
performance impact, though without the fuller checking of REFCOUNT_FULL.
This is done by duplicating the existing atomic_t refcount implementation
but with normally a single instruction added to detect if the refcount
has gone negative (e.g. wrapped past INT_MAX or below zero). When detected,
the handler saturates the refcount_t to INT_MIN / 2. With this overflow
protection, the erroneous reference release that would follow a wrap back
to zero is blocked from happening, avoiding the class of refcount-overflow
use-after-free vulnerabilities entirely.
Only the overflow case of refcounting can be perfectly protected, since
it can be detected and stopped before the reference is freed and left to
be abused by an attacker. There isn't a way to block early decrements,
and while REFCOUNT_FULL stops increment-from-zero cases (which would
be the state _after_ an early decrement and stops potential double-free
conditions), this fast implementation does not, since it would require
the more expensive cmpxchg loops. Since the overflow case is much more
common (e.g. missing a "put" during an error path), this protection
provides real-world protection. For example, the two public refcount
overflow use-after-free exploits published in 2016 would have been
rendered unexploitable:
http://perception-point.io/2016/01/14/analysis-and-exploitation-of-a-linux-kernel-vulnerability-cve-2016-0728/
http://cyseclabs.com/page?n=02012016
This implementation does, however, notice an unchecked decrement to zero
(i.e. caller used refcount_dec() instead of refcount_dec_and_test() and it
resulted in a zero). Decrements under zero are noticed (since they will
have resulted in a negative value), though this only indicates that a
use-after-free may have already happened. Such notifications are likely
avoidable by an attacker that has already exploited a use-after-free
vulnerability, but it's better to have them reported than allow such
conditions to remain universally silent.
On first overflow detection, the refcount value is reset to INT_MIN / 2
(which serves as a saturation value) and a report and stack trace are
produced. When operations detect only negative value results (such as
changing an already saturated value), saturation still happens but no
notification is performed (since the value was already saturated).
On the matter of races, since the entire range beyond INT_MAX but before
0 is negative, every operation at INT_MIN / 2 will trap, leaving no
overflow-only race condition.
As for performance, this implementation adds a single "js" instruction
to the regular execution flow of a copy of the standard atomic_t refcount
operations. (The non-"and_test" refcount_dec() function, which is uncommon
in regular refcount design patterns, has an additional "jz" instruction
to detect reaching exactly zero.) Since this is a forward jump, it is by
default the non-predicted path, which will be reinforced by dynamic branch
prediction. The result is this protection having virtually no measurable
change in performance over standard atomic_t operations. The error path,
located in .text.unlikely, saves the refcount location and then uses UD0
to fire a refcount exception handler, which resets the refcount, handles
reporting, and returns to regular execution. This keeps the changes to
.text size minimal, avoiding return jumps and open-coded calls to the
error reporting routine.
Example assembly comparison:
refcount_inc() before:
.text:
ffffffff81546149: f0 ff 45 f4 lock incl -0xc(%rbp)
refcount_inc() after:
.text:
ffffffff81546149: f0 ff 45 f4 lock incl -0xc(%rbp)
ffffffff8154614d: 0f 88 80 d5 17 00 js ffffffff816c36d3
...
.text.unlikely:
ffffffff816c36d3: 48 8d 4d f4 lea -0xc(%rbp),%rcx
ffffffff816c36d7: 0f ff (bad)
These are the cycle counts comparing a loop of refcount_inc() from 1
to INT_MAX and back down to 0 (via refcount_dec_and_test()), between
unprotected refcount_t (atomic_t), fully protected REFCOUNT_FULL
(refcount_t-full), and this overflow-protected refcount (refcount_t-fast):
2147483646 refcount_inc()s and 2147483647 refcount_dec_and_test()s:
cycles protections
atomic_t 82249267387 none
refcount_t-fast 82211446892 overflow, untested dec-to-zero
refcount_t-full 144814735193 overflow, untested dec-to-zero, inc-from-zero
This code is a modified version of the x86 PAX_REFCOUNT atomic_t
overflow defense from the last public patch of PaX/grsecurity, based
on my understanding of the code. Changes or omissions from the original
code are mine and don't reflect the original grsecurity/PaX code. Thanks
to PaX Team for various suggestions for improvement for repurposing this
code to be a refcount-only protection.
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Hans Liljestrand <ishkamiel@gmail.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: Jann Horn <jannh@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arozansk@redhat.com
Cc: axboe@kernel.dk
Cc: kernel-hardening@lists.openwall.com
Cc: linux-arch <linux-arch@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170815161924.GA133115@beast
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-08-16 00:19:24 +08:00
|
|
|
/*
 * refcount_error_report() is only declared as a real (out-of-line) function
 * when CONFIG_ARCH_HAS_REFCOUNT is set. Otherwise it is an empty inline
 * stub, so callers do not need their own #ifdef.
 */
#ifdef CONFIG_ARCH_HAS_REFCOUNT
|
|
|
|
void refcount_error_report(struct pt_regs *regs, const char *err);
|
|
|
|
#else
|
|
|
|
static inline void refcount_error_report(struct pt_regs *regs, const char *err)
|
|
|
|
{ }
|
|
|
|
#endif
|
|
|
|
|
2011-03-23 07:34:40 +08:00
|
|
|
/* Internal, do not use. */
|
|
|
|
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
|
|
|
|
int __must_check _kstrtol(const char *s, unsigned int base, long *res);
|
|
|
|
|
|
|
|
int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
|
|
|
|
int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
|
2012-12-18 08:03:04 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* kstrtoul - convert a string to an unsigned long
|
|
|
|
* @s: The start of the string. The string must be null-terminated, and may also
|
|
|
|
* include a single newline before its terminating null. The first character
|
|
|
|
* may also be a plus sign, but not a minus sign.
|
|
|
|
* @base: The number base to use. The maximum supported base is 16. If base is
|
|
|
|
* given as 0, then the base of the string is automatically detected with the
|
|
|
|
* conventional semantics - If it begins with 0x the number will be parsed as a
|
|
|
|
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
|
|
|
|
* parsed as an octal number. Otherwise it will be parsed as a decimal.
|
|
|
|
* @res: Where to write the result of the conversion on success.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
|
|
|
|
* Used as a replacement for the obsolete simple_strtoul. Return code must
|
|
|
|
* be checked.
|
|
|
|
*/
|
2011-03-23 07:34:40 +08:00
|
|
|
static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
	/*
	 * The shortcut to kstrtoull() cannot be gated on
	 * __builtin_types_compatible_p(unsigned long, unsigned long long),
	 * which is 0, so size and alignment are compared instead.
	 */
	if (sizeof(unsigned long) != sizeof(unsigned long long) ||
	    __alignof__(unsigned long) != __alignof__(unsigned long long))
		return _kstrtoul(s, base, res);

	return kstrtoull(s, base, (unsigned long long *)res);
}
|
|
|
|
|
2012-12-18 08:03:04 +08:00
|
|
|
/**
|
|
|
|
* kstrtol - convert a string to a long
|
|
|
|
* @s: The start of the string. The string must be null-terminated, and may also
|
|
|
|
* include a single newline before its terminating null. The first character
|
|
|
|
* may also be a plus sign or a minus sign.
|
|
|
|
* @base: The number base to use. The maximum supported base is 16. If base is
|
|
|
|
* given as 0, then the base of the string is automatically detected with the
|
|
|
|
* conventional semantics - If it begins with 0x the number will be parsed as a
|
|
|
|
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
|
|
|
|
* parsed as an octal number. Otherwise it will be parsed as a decimal.
|
|
|
|
* @res: Where to write the result of the conversion on success.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
|
|
|
|
* Used as a replacement for the obsolete simple_strtol. Return code must
|
|
|
|
* be checked.
|
|
|
|
*/
|
2011-03-23 07:34:40 +08:00
|
|
|
static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
{
	/*
	 * The shortcut to kstrtoll() cannot be gated on
	 * __builtin_types_compatible_p(long, long long), which is 0, so size
	 * and alignment are compared instead.
	 */
	if (sizeof(long) != sizeof(long long) ||
	    __alignof__(long) != __alignof__(long long))
		return _kstrtol(s, base, res);

	return kstrtoll(s, base, (long long *)res);
}
|
|
|
|
|
|
|
|
int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
|
|
|
|
int __must_check kstrtoint(const char *s, unsigned int base, int *res);
|
|
|
|
|
|
|
|
/*
 * kstrtou64 - u64 flavour of kstrtoull(); all arguments are forwarded
 * unchanged and kstrtoull()'s return value is passed back to the caller.
 */
static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
{
	int ret = kstrtoull(s, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtos64 - s64 flavour of kstrtoll(); all arguments are forwarded
 * unchanged and kstrtoll()'s return value is passed back to the caller.
 */
static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
{
	int ret = kstrtoll(s, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtou32 - u32 flavour of kstrtouint(); all arguments are forwarded
 * unchanged and kstrtouint()'s return value is passed back to the caller.
 */
static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
{
	int ret = kstrtouint(s, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtos32 - s32 flavour of kstrtoint(); all arguments are forwarded
 * unchanged and kstrtoint()'s return value is passed back to the caller.
 */
static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
{
	int ret = kstrtoint(s, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
|
|
|
|
int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
|
|
|
|
int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
|
|
|
|
int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
|
2016-03-18 05:22:50 +08:00
|
|
|
int __must_check kstrtobool(const char *s, bool *res);
|
2011-03-23 07:34:40 +08:00
|
|
|
|
2011-05-25 08:13:31 +08:00
|
|
|
int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
|
|
|
|
int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
|
|
|
|
int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
|
|
|
|
int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
|
|
|
|
int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
|
|
|
|
int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
|
|
|
|
int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
|
|
|
|
int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
|
|
|
|
int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
|
|
|
|
int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
|
2016-03-18 05:22:50 +08:00
|
|
|
int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
|
2011-05-25 08:13:31 +08:00
|
|
|
|
|
|
|
/*
 * kstrtou64_from_user - u64 flavour of kstrtoull_from_user(); all arguments
 * are forwarded unchanged and its return value is passed back to the caller.
 */
static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
{
	int ret = kstrtoull_from_user(s, count, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtos64_from_user - s64 flavour of kstrtoll_from_user(); all arguments
 * are forwarded unchanged and its return value is passed back to the caller.
 */
static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
{
	int ret = kstrtoll_from_user(s, count, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtou32_from_user - u32 flavour of kstrtouint_from_user(); all arguments
 * are forwarded unchanged and its return value is passed back to the caller.
 */
static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
{
	int ret = kstrtouint_from_user(s, count, base, res);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * kstrtos32_from_user - s32 flavour of kstrtoint_from_user(); all arguments
 * are forwarded unchanged and its return value is passed back to the caller.
 */
static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
{
	int ret = kstrtoint_from_user(s, count, base, res);

	return ret;
}
|
|
|
|
|
2011-11-01 08:13:10 +08:00
|
|
|
/* Obsolete, do not use. Use kstrto<foo> instead */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern unsigned long simple_strtoul(const char *,char **,unsigned int);
|
|
|
|
extern long simple_strtol(const char *,char **,unsigned int);
|
|
|
|
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
|
|
|
|
extern long long simple_strtoll(const char *,char **,unsigned int);
|
2011-03-23 07:34:40 +08:00
|
|
|
|
2012-03-24 06:02:54 +08:00
|
|
|
extern int num_to_str(char *buf, int size, unsigned long long num);
|
|
|
|
|
2011-11-01 08:13:10 +08:00
|
|
|
/* lib/printf utilities */
|
|
|
|
|
2011-11-01 08:11:33 +08:00
|
|
|
extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...);
|
|
|
|
extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list);
|
|
|
|
extern __printf(3, 4)
|
|
|
|
int snprintf(char *buf, size_t size, const char *fmt, ...);
|
|
|
|
extern __printf(3, 0)
|
|
|
|
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
|
|
|
|
extern __printf(3, 4)
|
|
|
|
int scnprintf(char *buf, size_t size, const char *fmt, ...);
|
|
|
|
extern __printf(3, 0)
|
|
|
|
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
|
2016-05-20 08:10:55 +08:00
|
|
|
extern __printf(2, 3) __malloc
|
2011-11-01 08:11:33 +08:00
|
|
|
char *kasprintf(gfp_t gfp, const char *fmt, ...);
|
2016-05-20 08:10:55 +08:00
|
|
|
extern __printf(2, 0) __malloc
|
2015-07-18 07:23:42 +08:00
|
|
|
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
|
2015-11-07 08:31:20 +08:00
|
|
|
extern __printf(2, 0)
|
|
|
|
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-03-24 06:02:16 +08:00
|
|
|
extern __scanf(2, 3)
|
|
|
|
int sscanf(const char *, const char *, ...);
|
|
|
|
extern __scanf(2, 0)
|
|
|
|
int vsscanf(const char *, const char *, va_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern int get_option(char **str, int *pint);
|
|
|
|
extern char *get_options(const char *str, int nints, int *ints);
|
2008-07-25 07:27:46 +08:00
|
|
|
extern unsigned long long memparse(const char *ptr, char **retptr);
|
2014-08-14 17:15:27 +08:00
|
|
|
extern bool parse_option_str(const char *str, const char *option);
|
2017-04-17 21:34:56 +08:00
|
|
|
extern char *next_arg(char *args, char **param, char **val);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-05-16 00:44:06 +08:00
|
|
|
extern int core_kernel_text(unsigned long addr);
|
2011-05-06 09:14:55 +08:00
|
|
|
extern int core_kernel_data(unsigned long addr);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int __kernel_text_address(unsigned long addr);
|
|
|
|
extern int kernel_text_address(unsigned long addr);
|
2008-08-16 06:29:38 +08:00
|
|
|
extern int func_ptr_is_kernel_text(void *ptr);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long int_sqrt(unsigned long);
|
|
|
|
|
|
|
|
extern void bust_spinlocks(int yes);
|
|
|
|
extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
|
2006-04-11 13:53:59 +08:00
|
|
|
extern int panic_timeout;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int panic_on_oops;
|
2006-09-26 16:52:27 +08:00
|
|
|
extern int panic_on_unrecovered_nmi;
|
2009-06-25 05:32:11 +08:00
|
|
|
extern int panic_on_io_nmi;
|
2014-12-11 07:45:50 +08:00
|
|
|
extern int panic_on_warn;
|
2016-06-03 00:51:41 +08:00
|
|
|
extern int sysctl_panic_on_rcu_stall;
|
2011-11-29 14:08:36 +08:00
|
|
|
extern int sysctl_panic_on_stackoverflow;
|
2015-07-01 05:57:46 +08:00
|
|
|
|
|
|
|
extern bool crash_kexec_post_notifiers;
|
|
|
|
|
2015-12-14 18:19:09 +08:00
|
|
|
/*
|
|
|
|
* panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
|
|
|
|
* holds a CPU number which is executing panic() currently. A value of
|
|
|
|
* PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
|
|
|
|
*/
|
|
|
|
extern atomic_t panic_cpu;
|
|
|
|
/*
 * Value meaning "no CPU". Parenthesized so the negative constant remains a
 * single operand wherever the macro is expanded.
 */
#define PANIC_CPU_INVALID	(-1)
|
|
|
|
|
2013-11-26 07:23:04 +08:00
|
|
|
/*
|
|
|
|
* Only to be used by arch init code. If the user over-wrote the default
|
|
|
|
* CONFIG_PANIC_TIMEOUT, honor it.
|
|
|
|
*/
|
|
|
|
static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
|
|
|
|
{
|
|
|
|
if (panic_timeout == arch_default_timeout)
|
|
|
|
panic_timeout = timeout;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
extern const char *print_tainted(void);
|
2013-01-21 14:47:39 +08:00
|
|
|
/*
 * Type of the second argument of add_taint().
 * NOTE(review): presumably tells add_taint() whether lockdep output can still
 * be trusted after the taint - confirm against add_taint()'s definition.
 */
enum lockdep_ok {
|
|
|
|
LOCKDEP_STILL_OK,
|
|
|
|
LOCKDEP_NOW_UNRELIABLE
|
|
|
|
};
|
|
|
|
extern void add_taint(unsigned flag, enum lockdep_ok);
|
2008-10-16 13:01:41 +08:00
|
|
|
extern int test_taint(unsigned flag);
|
|
|
|
extern unsigned long get_taint(void);
|
2008-02-08 20:19:31 +08:00
|
|
|
extern int root_mountflags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-20 19:06:35 +08:00
|
|
|
extern bool early_boot_irqs_disabled;
|
|
|
|
|
2017-05-17 02:42:47 +08:00
|
|
|
/*
|
|
|
|
* Values used for system_state. Ordering of the states must not be changed
|
|
|
|
* as code checks for <, <=, >, >= STATE.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
extern enum system_states {
|
|
|
|
SYSTEM_BOOTING,
|
2017-05-17 02:42:47 +08:00
|
|
|
SYSTEM_SCHEDULING,
|
2005-04-17 06:20:36 +08:00
|
|
|
SYSTEM_RUNNING,
|
|
|
|
SYSTEM_HALT,
|
|
|
|
SYSTEM_POWER_OFF,
|
|
|
|
SYSTEM_RESTART,
|
|
|
|
} system_state;
|
|
|
|
|
2008-10-16 13:01:41 +08:00
|
|
|
#define TAINT_PROPRIETARY_MODULE 0
|
|
|
|
#define TAINT_FORCED_MODULE 1
|
2014-02-26 23:49:49 +08:00
|
|
|
#define TAINT_CPU_OUT_OF_SPEC 2
|
2008-10-16 13:01:41 +08:00
|
|
|
#define TAINT_FORCED_RMMOD 3
|
|
|
|
#define TAINT_MACHINE_CHECK 4
|
|
|
|
#define TAINT_BAD_PAGE 5
|
|
|
|
#define TAINT_USER 6
|
|
|
|
#define TAINT_DIE 7
|
|
|
|
#define TAINT_OVERRIDDEN_ACPI_TABLE 8
|
|
|
|
#define TAINT_WARN 9
|
2008-10-18 00:50:12 +08:00
|
|
|
#define TAINT_CRAP 10
|
2010-04-04 02:36:42 +08:00
|
|
|
#define TAINT_FIRMWARE_WORKAROUND 11
|
2011-10-24 21:12:28 +08:00
|
|
|
#define TAINT_OOT_MODULE 12
|
Fix: module signature vs tracepoints: add new TAINT_UNSIGNED_MODULE
Users have reported being unable to trace non-signed modules loaded
within a kernel supporting module signature.
This is caused by tracepoint.c:tracepoint_module_coming() refusing to
take into account tracepoints sitting within force-loaded modules
(TAINT_FORCED_MODULE). The reason for this check, in the first place, is
that a force-loaded module may have a struct module incompatible with
the layout expected by the kernel, and can thus cause a kernel crash
upon forced load of that module on a kernel with CONFIG_TRACEPOINTS=y.
Tracepoints, however, specifically accept TAINT_OOT_MODULE and
TAINT_CRAP, since those modules do not lead to the "very likely system
crash" issue cited above for force-loaded modules.
With kernels having CONFIG_MODULE_SIG=y (signed modules), a non-signed
module is tainted re-using the TAINT_FORCED_MODULE taint flag.
Unfortunately, this means that Tracepoints treat that module as a
force-loaded module, and thus silently refuse to consider any tracepoint
within this module.
Since an unsigned module does not fit within the "very likely system
crash" category of tainting, add a new TAINT_UNSIGNED_MODULE taint flag
to specifically address this taint behavior, and accept those modules
within Tracepoints. We use the letter 'X' as a taint flag character for
a module being loaded that doesn't know how to sign its name (proposed
by Steven Rostedt).
Also add the missing 'O' entry to trace event show_module_flags() list
for the sake of completeness.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
NAKed-by: Ingo Molnar <mingo@redhat.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: David Howells <dhowells@redhat.com>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2014-03-13 09:41:30 +08:00
|
|
|
/* Separate from TAINT_FORCED_MODULE so unsigned modules are not treated as force-loaded. */
#define TAINT_UNSIGNED_MODULE		13
#define TAINT_SOFTLOCKUP		14
#define TAINT_LIVEPATCH			15
/* One more than the highest flag number; used as the length of taint_flags[]. */
#define TAINT_FLAGS_COUNT		16
|
|
|
|
|
|
|
|
/* How a single taint flag is rendered. */
struct taint_flag {
	char c_true;	/* character printed when tainted */
	char c_false;	/* character printed when not tainted */
	bool module;	/* also show as a per-module taint flag */
};
|
|
|
|
|
|
|
|
/* Table of TAINT_FLAGS_COUNT entries; defined outside this header. */
extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-05-15 07:05:49 +08:00
|
|
|
/*
 * Character table indexed by a 4-bit value; defined outside this header.
 * NOTE(review): presumably the lowercase hex digits - confirm at the definition.
 */
extern const char hex_asc[];
/* Table entry for the low nibble (bits 0-3) of x. */
#define hex_asc_lo(x)	hex_asc[((x) & 0x0f)]
/* Table entry for the high nibble (bits 4-7) of x. */
#define hex_asc_hi(x)	hex_asc[((x) & 0xf0) >> 4]
|
|
|
|
|
2011-11-01 08:12:41 +08:00
|
|
|
/**
 * hex_byte_pack - write the two hex_asc[] characters for one byte
 * @buf: destination; exactly two characters are stored, no terminator
 * @byte: value to convert
 *
 * The character for the high nibble is stored first, then the low nibble.
 *
 * Return: pointer just past the two characters written.
 */
static inline char *hex_byte_pack(char *buf, u8 byte)
{
	*buf++ = hex_asc_hi(byte);
	*buf++ = hex_asc_lo(byte);
	return buf;
}
|
2007-05-11 13:22:39 +08:00
|
|
|
|
2013-09-14 01:37:12 +08:00
|
|
|
/*
 * Character table indexed by a 4-bit value; defined outside this header.
 * NOTE(review): presumably the uppercase hex digits - confirm at the definition.
 */
extern const char hex_asc_upper[];
/* Table entry for the low nibble (bits 0-3) of x. */
#define hex_asc_upper_lo(x)	hex_asc_upper[((x) & 0x0f)]
/* Table entry for the high nibble (bits 4-7) of x. */
#define hex_asc_upper_hi(x)	hex_asc_upper[((x) & 0xf0) >> 4]
|
|
|
|
|
|
|
|
/**
 * hex_byte_pack_upper - write the two hex_asc_upper[] characters for one byte
 * @buf: destination; exactly two characters are stored, no terminator
 * @byte: value to convert
 *
 * The character for the high nibble is stored first, then the low nibble.
 *
 * Return: pointer just past the two characters written.
 */
static inline char *hex_byte_pack_upper(char *buf, u8 byte)
{
	*buf++ = hex_asc_upper_hi(byte);
	*buf++ = hex_asc_upper_lo(byte);
	return buf;
}
|
|
|
|
|
2010-05-25 05:33:23 +08:00
|
|
|
extern int hex_to_bin(char ch);
|
2011-09-20 23:23:49 +08:00
|
|
|
extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
|
2014-09-17 00:36:01 +08:00
|
|
|
extern char *bin2hex(char *dst, const void *src, size_t count);
|
2010-05-25 05:33:23 +08:00
|
|
|
|
2014-06-25 02:20:48 +08:00
|
|
|
bool mac_pton(const char *s, u8 *mac);
|
2013-06-05 00:46:26 +08:00
|
|
|
|
2009-03-05 17:28:45 +08:00
|
|
|
/*
|
|
|
|
* General tracing related utility functions - trace_printk(),
|
2009-03-05 23:35:56 +08:00
|
|
|
* tracing_on/tracing_off and tracing_start()/tracing_stop
|
|
|
|
*
|
|
|
|
* Use tracing_on/tracing_off when you want to quickly turn on or off
|
|
|
|
* tracing. It simply enables or disables the recording of the trace events.
|
2009-06-02 14:01:37 +08:00
|
|
|
* This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on
|
2009-03-05 23:35:56 +08:00
|
|
|
* file, which gives a means for the kernel and userspace to interact.
|
|
|
|
* Place a tracing_off() in the kernel where you want tracing to end.
|
|
|
|
* From user space, examine the trace, and then echo 1 > tracing_on
|
|
|
|
* to continue tracing.
|
|
|
|
*
|
|
|
|
* tracing_stop/tracing_start has slightly more overhead. It is used
|
|
|
|
* by things like suspend to ram where disabling the recording of the
|
|
|
|
* trace is not enough, but tracing must actually stop because things
|
|
|
|
* like calling smp_processor_id() may crash the system.
|
|
|
|
*
|
|
|
|
* Most likely, you want to use tracing_on/tracing_off.
|
2009-03-05 17:28:45 +08:00
|
|
|
*/
|
2010-04-19 01:08:41 +08:00
|
|
|
|
|
|
|
/* Argument type of ftrace_dump(). */
enum ftrace_dump_mode {
	DUMP_NONE,
	DUMP_ALL,
	DUMP_ORIG,
};
|
|
|
|
|
2009-03-05 17:28:45 +08:00
|
|
|
#ifdef CONFIG_TRACING
|
2012-03-21 00:28:29 +08:00
|
|
|
/*
 * Quick on/off switch: enables or disables the recording of trace events
 * (CONFIG_TRACING builds; no-op stubs are provided otherwise).
 */
void tracing_on(void);
void tracing_off(void);
int tracing_is_on(void);
|
tracing: Add internal tracing_snapshot() functions
The new snapshot feature is quite handy. It's a way for the user
to take advantage of the spare buffer that, until then, only
the latency tracers used to "snapshot" the buffer when it hit
a max latency. Now users can trigger a "snapshot" manually when
some condition is hit in a program. But a snapshot currently can
not be triggered by a condition inside the kernel.
With the addition of tracing_snapshot() and tracing_snapshot_alloc(),
snapshots can now be taking when a condition is hit, and the
developer wants to snapshot the case without stopping the trace.
Note, any snapshot will overwrite the old one, so take care
in how this is done.
These new functions are to be used like tracing_on(), tracing_off()
and trace_printk() are. That is, they should never be called
in the mainline Linux kernel. They are solely for the purpose
of debugging.
The tracing_snapshot() will not allocate a buffer, but it is
safe to be called from any context (except NMIs). But if a
snapshot buffer isn't allocated when it is called, it will write
to the live buffer, complaining about the lack of a snapshot
buffer, and then stop tracing (giving you the "permanent snapshot").
tracing_snapshot_alloc() will allocate the snapshot buffer if
it was not already allocated and then take the snapshot. This routine
*may sleep*, and must be called from context that can sleep.
The allocation is done with GFP_KERNEL and not atomic.
If you need a snapshot in an atomic context, say in early boot,
then it is best to call the tracing_snapshot_alloc() before then,
where it will allocate the buffer, and then you can use the
tracing_snapshot() anywhere you want and still get snapshots.
Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-03-07 10:45:37 +08:00
|
|
|
/*
 * Debug-only snapshot helpers.  tracing_snapshot() never allocates;
 * tracing_snapshot_alloc() allocates the snapshot buffer on first use and
 * may therefore sleep.
 */
void tracing_snapshot(void);
void tracing_snapshot_alloc(void);

/*
 * Heavier than tracing_on()/tracing_off(): for cases such as suspend to RAM
 * where tracing must actually stop, not merely stop recording.
 */
extern void tracing_start(void);
extern void tracing_stop(void);
|
|
|
|
|
2011-11-01 08:11:33 +08:00
|
|
|
/*
 * Deliberately empty.  It exists only to carry the __printf(1, 2)
 * attribute, so the compiler checks @fmt against the variadic arguments.
 */
static inline __printf(1, 2)
void ____trace_printk_check_format(const char *fmt, ...)
{
}
|
|
|
|
/*
 * Compile-time format check for trace_printk().  The call sits under
 * "if (0)", so it is type-checked by the compiler but never executed.
 */
#define __trace_printk_check_format(fmt, args...)			\
do {									\
	if (0)								\
		____trace_printk_check_format(fmt, ##args);		\
} while (0)
|
|
|
|
|
2009-03-05 17:28:45 +08:00
|
|
|
/**
|
|
|
|
* trace_printk - printf formatting in the ftrace buffer
|
|
|
|
* @fmt: the printf format for printing
|
|
|
|
*
|
|
|
|
* Note: __trace_printk is an internal function for trace_printk and
|
|
|
|
* the @ip is passed in via the trace_printk macro.
|
|
|
|
*
|
|
|
|
* This function allows a kernel developer to debug fast path sections
|
|
|
|
* that printk is not appropriate for. By scattering in various
|
|
|
|
* printk like tracing in the code, a developer can quickly see
|
|
|
|
* where problems are occurring.
|
|
|
|
*
|
|
|
|
* This is intended as a debugging tool for the developer only.
|
|
|
|
* Please refrain from leaving trace_printks scattered around in
|
2013-03-09 10:02:34 +08:00
|
|
|
* your code. (Extra memory is used for special buffers that are
|
|
|
|
* allocated when trace_printk() is used)
|
tracing: Optimize trace_printk() with one arg to use trace_puts()
Although trace_printk() is extremely fast, especially when it uses
trace_bprintk() (writes args straight to buffer instead of inserting
into string), it still has the overhead of calling one of the printf
sprintf() functions, that need to scan the fmt string to determine
what, if any args it has.
This is a waste of precious CPU cycles if the printk format has no
args but a single constant string. It is better to use trace_puts()
which does not have the overhead of the fmt scanning.
But wouldn't it be nice if the developer didn't have to think about
such things, and the compile would just do it for them?
trace_printk("this string has no args\n");
[...]
trace_printk("this sting does %p %d\n", foo, bar);
As tracing is critical to have the least amount of overhead,
especially when dealing with race conditions, and you want to
eliminate any "Heisenbugs", you want the trace_printk() to use the
fastest possible means of tracing.
Currently the macro magic determines if it will use trace_bprintk()
or if the fmt is a dynamic string (a variable), it will fall
back to the slow trace_printk() method that does a full snprintf()
before copying it into the buffer, where as trace_bprintk() only
copys the pointer to the fmt and the args into the buffer.
Well, now there's a way to spend some more Hogwarts cash and come
up with new fancy macro magic.
#define trace_printk(fmt, ...) \
do { \
char _______STR[] = __stringify((__VA_ARGS__)); \
if (sizeof(_______STR) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
} while (0)
The above needs a bit of explaining (both here and in the comments).
By stringifying the __VA_ARGS__, we can, at compile time, determine
the number of args that are being passed to trace_printk(). The extra
parenthesis are required, otherwise the compiler complains about
too many parameters for __stringify if there is more than one arg.
When there are no args, the __stringify((__VA_ARGS__)) converts into
"()\0", a string of 3 characters. Anything else, will be a string
containing more than 3 characters. Now we assign that string to a
dynamic char array, and then take the sizeof() of that array.
If it is greater than 3 characters, we know trace_printk() has args
and we need to do the full "do_trace_printk()" on them, otherwise
it was only passed a single arg and we can optimize to use trace_puts().
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven "The King of Nasty Macros!" Rostedt <rostedt@goodmis.org>
2013-03-09 11:11:57 +08:00
|
|
|
*
|
|
|
|
* A little optimization trick is done here. If there's only one
|
|
|
|
* argument, there's no need to scan the string for printf formats.
|
|
|
|
* The trace_puts() will suffice. But how can we take advantage of
|
|
|
|
* using trace_puts() when trace_printk() has only one argument?
|
|
|
|
* By stringifying the args and checking the size we can tell
|
|
|
|
* whether or not there are args. __stringify((__VA_ARGS__)) will
|
|
|
|
* turn into "()\0" with a size of 3 when there are no args, anything
|
|
|
|
* else will be bigger. All we need to do is define a string to this,
|
|
|
|
* and then take its size and compare to 3. If it's bigger, use
|
|
|
|
* do_trace_printk() otherwise, optimize it to trace_puts(). Then just
|
|
|
|
* let gcc optimize the rest.
|
2009-03-05 17:28:45 +08:00
|
|
|
*/
|
2009-03-07 00:21:49 +08:00
|
|
|
|
tracing: Optimize trace_printk() with one arg to use trace_puts()
Although trace_printk() is extremely fast, especially when it uses
trace_bprintk() (writes args straight to buffer instead of inserting
into string), it still has the overhead of calling one of the printf
sprintf() functions, that need to scan the fmt string to determine
what, if any args it has.
This is a waste of precious CPU cycles if the printk format has no
args but a single constant string. It is better to use trace_puts()
which does not have the overhead of the fmt scanning.
But wouldn't it be nice if the developer didn't have to think about
such things, and the compile would just do it for them?
trace_printk("this string has no args\n");
[...]
trace_printk("this sting does %p %d\n", foo, bar);
As tracing is critical to have the least amount of overhead,
especially when dealing with race conditions, and you want to
eliminate any "Heisenbugs", you want the trace_printk() to use the
fastest possible means of tracing.
Currently the macro magic determines if it will use trace_bprintk()
or if the fmt is a dynamic string (a variable), it will fall
back to the slow trace_printk() method that does a full snprintf()
before copying it into the buffer, where as trace_bprintk() only
copys the pointer to the fmt and the args into the buffer.
Well, now there's a way to spend some more Hogwarts cash and come
up with new fancy macro magic.
#define trace_printk(fmt, ...) \
do { \
char _______STR[] = __stringify((__VA_ARGS__)); \
if (sizeof(_______STR) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
} while (0)
The above needs a bit of explaining (both here and in the comments).
By stringifying the __VA_ARGS__, we can, at compile time, determine
the number of args that are being passed to trace_printk(). The extra
parenthesis are required, otherwise the compiler complains about
too many parameters for __stringify if there is more than one arg.
When there are no args, the __stringify((__VA_ARGS__)) converts into
"()\0", a string of 3 characters. Anything else, will be a string
containing more than 3 characters. Now we assign that string to a
dynamic char array, and then take the sizeof() of that array.
If it is greater than 3 characters, we know trace_printk() has args
and we need to do the full "do_trace_printk()" on them, otherwise
it was only passed a single arg and we can optimize to use trace_puts().
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven "The King of Nasty Macros!" Rostedt <rostedt@goodmis.org>
2013-03-09 11:11:57 +08:00
|
|
|
/*
 * __stringify((__VA_ARGS__)) is "()" - sizeof 3 including the NUL - when
 * nothing follows @fmt.  Anything longer means real arguments are present
 * and the format has to be processed; otherwise trace_puts() is enough.
 * The size test is a compile-time constant, so only one branch survives.
 */
#define trace_printk(fmt, ...)				\
do {							\
	char _______STR[] = __stringify((__VA_ARGS__));	\
	if (sizeof(_______STR) > 3)			\
		do_trace_printk(fmt, ##__VA_ARGS__);	\
	else						\
		trace_puts(fmt);			\
} while (0)
|
|
|
|
|
|
|
|
/*
 * Worker behind trace_printk() when arguments are present.
 *
 * A static pointer is emitted into the "__trace_printk_fmt" section for
 * every call site: it holds @fmt when @fmt is a compile-time constant and
 * NULL otherwise.  __builtin_constant_p() is tested twice on purpose - once
 * in the static initializer, which must be constant, and once to choose
 * the call:
 *   constant @fmt -> __trace_bprintk() with the section-resident pointer
 *   dynamic @fmt  -> __trace_printk() with @fmt itself
 * The format is checked against the arguments at compile time either way.
 */
#define do_trace_printk(fmt, args...)					\
do {									\
	static const char *trace_printk_fmt __used			\
		__attribute__((section("__trace_printk_fmt"))) =	\
		__builtin_constant_p(fmt) ? fmt : NULL;			\
									\
	__trace_printk_check_format(fmt, ##args);			\
									\
	if (__builtin_constant_p(fmt))					\
		__trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args);	\
	else								\
		__trace_printk(_THIS_IP_, fmt, ##args);			\
} while (0)
|
|
|
|
|
2011-11-01 08:11:33 +08:00
|
|
|
/* Called by do_trace_printk() when the format is a compile-time constant. */
extern __printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...);

/* Called by do_trace_printk() when the format is only known at run time. */
extern __printf(2, 3)
int __trace_printk(unsigned long ip, const char *fmt, ...);
|
2009-03-07 00:21:49 +08:00
|
|
|
|
2013-03-09 10:02:34 +08:00
|
|
|
/**
 * trace_puts - write a string into the ftrace buffer
 * @str: the string to record
 *
 * Note: __trace_bputs is an internal function for trace_puts and
 *       the @ip is passed in via the trace_puts macro.
 *
 * This is similar to trace_printk() but is made for those really fast
 * paths that a developer wants the least amount of "Heisenbug" effects,
 * where the processing of the print format is still too much.
 *
 * This function allows a kernel developer to debug fast path sections
 * that printk is not appropriate for. By scattering in various
 * printk like tracing in the code, a developer can quickly see
 * where problems are occurring.
 *
 * This is intended as a debugging tool for the developer only.
 * Please refrain from leaving trace_puts scattered around in
 * your code. (Extra memory is used for special buffers that are
 * allocated when trace_puts() is used)
 *
 * The macro is a statement expression whose last statement is the
 * conditional expression below, so the documented return value is
 * available to callers; using it as a plain statement still works.
 * A non-constant @str is evaluated exactly once.
 *
 * Returns: 0 if nothing was written, positive # if string was.
 *  (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
 */
#define trace_puts(str) ({						\
	static const char *trace_printk_fmt __used			\
		__attribute__((section("__trace_printk_fmt"))) =	\
		__builtin_constant_p(str) ? str : NULL;			\
	const char *__trace_puts_str = (str);				\
									\
	__builtin_constant_p(str) ?					\
		__trace_bputs(_THIS_IP_, trace_printk_fmt) :		\
		__trace_puts(_THIS_IP_, __trace_puts_str,		\
			     strlen(__trace_puts_str));			\
})
|
2013-05-02 23:26:13 +08:00
|
|
|
/* Back ends of trace_puts(): constant strings go to __trace_bputs(), others to __trace_puts(). */
extern int __trace_bputs(unsigned long ip, const char *str);
extern int __trace_puts(unsigned long ip, const char *str, int size);

/* NOTE(review): @skip presumably counts stack frames to omit - confirm at the definition. */
extern void trace_dump_stack(int skip);
|
2009-12-11 22:48:22 +08:00
|
|
|
|
2009-03-13 01:24:49 +08:00
|
|
|
/*
 * va_list flavour of trace_printk().
 *
 * The double __builtin_constant_p is because gcc will give us an error
 * if we try to allocate the static variable to fmt if it is not a
 * constant. Even with the outer if statement.
 *
 * A constant @fmt is stored in the "__trace_printk_fmt" section and passed
 * to __ftrace_vbprintk(); any other @fmt goes to __ftrace_vprintk().
 */
#define ftrace_vprintk(fmt, vargs)					\
do {									\
	if (__builtin_constant_p(fmt)) {				\
		static const char *trace_printk_fmt __used		\
		  __attribute__((section("__trace_printk_fmt"))) =	\
			__builtin_constant_p(fmt) ? fmt : NULL;		\
									\
		__ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs);	\
	} else								\
		__ftrace_vprintk(_THIS_IP_, fmt, vargs);		\
} while (0)
|
|
|
|
|
2015-07-18 07:23:42 +08:00
|
|
|
/* Back end of ftrace_vprintk() for a compile-time-constant format. */
extern __printf(2, 0) int
__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);

/* Back end of ftrace_vprintk() for a run-time format. */
extern __printf(2, 0) int
__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);

extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
|
2009-03-05 17:28:45 +08:00
|
|
|
#else
|
|
|
|
/* !CONFIG_TRACING: the tracing controls compile to nothing. */
static inline void tracing_start(void) { }
static inline void tracing_stop(void) { }
static inline void trace_dump_stack(int skip) { }

static inline void tracing_on(void) { }
static inline void tracing_off(void) { }
/* Without CONFIG_TRACING, tracing is reported as off. */
static inline int tracing_is_on(void) { return 0; }
|
tracing: Add internal tracing_snapshot() functions
The new snapshot feature is quite handy. It's a way for the user
to take advantage of the spare buffer that, until then, only
the latency tracers used to "snapshot" the buffer when it hit
a max latency. Now users can trigger a "snapshot" manually when
some condition is hit in a program. But a snapshot currently can
not be triggered by a condition inside the kernel.
With the addition of tracing_snapshot() and tracing_snapshot_alloc(),
snapshots can now be taking when a condition is hit, and the
developer wants to snapshot the case without stopping the trace.
Note, any snapshot will overwrite the old one, so take care
in how this is done.
These new functions are to be used like tracing_on(), tracing_off()
and trace_printk() are. That is, they should never be called
in the mainline Linux kernel. They are solely for the purpose
of debugging.
The tracing_snapshot() will not allocate a buffer, but it is
safe to be called from any context (except NMIs). But if a
snapshot buffer isn't allocated when it is called, it will write
to the live buffer, complaining about the lack of a snapshot
buffer, and then stop tracing (giving you the "permanent snapshot").
tracing_snapshot_alloc() will allocate the snapshot buffer if
it was not already allocated and then take the snapshot. This routine
*may sleep*, and must be called from context that can sleep.
The allocation is done with GFP_KERNEL and not atomic.
If you need a snapshot in an atomic context, say in early boot,
then it is best to call the tracing_snapshot_alloc() before then,
where it will allocate the buffer, and then you can use the
tracing_snapshot() anywhere you want and still get snapshots.
Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-03-07 10:45:37 +08:00
|
|
|
/* !CONFIG_TRACING: snapshot requests are ignored. */
static inline void tracing_snapshot(void) { }
static inline void tracing_snapshot_alloc(void) { }
|
2012-03-21 00:28:29 +08:00
|
|
|
|
2012-10-25 21:41:51 +08:00
|
|
|
/*
 * !CONFIG_TRACING stub.  The __printf(1, 2) attribute keeps the format
 * checked at compile time; nothing is recorded and 0 is returned.
 */
static inline __printf(1, 2)
int trace_printk(const char *fmt, ...)
{
	return 0;
}
|
2015-07-18 07:23:42 +08:00
|
|
|
/* !CONFIG_TRACING stub: nothing is recorded and 0 is returned. */
static __printf(1, 0) inline int
ftrace_vprintk(const char *fmt, va_list ap)
{
	return 0;
}
|
2010-04-19 01:08:41 +08:00
|
|
|
/* !CONFIG_TRACING stub: the dump request is ignored. */
static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
|
2009-03-07 00:21:49 +08:00
|
|
|
#endif /* CONFIG_TRACING */
|
2009-03-05 17:28:45 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2008-04-30 15:54:55 +08:00
|
|
|
* min()/max()/clamp() macros that also do
|
2005-04-17 06:20:36 +08:00
|
|
|
* strict type-checking.. See the
|
|
|
|
* "unnecessary" pointer comparison.
|
|
|
|
*/
|
2016-10-08 08:02:42 +08:00
|
|
|
/*
 * __min - implementation of min()
 * @t1, @t2:     types of the two operands
 * @min1, @min2: names for the two temporaries
 * @x, @y:       operands; each is evaluated exactly once
 *
 * The otherwise pointless "&min1 == &min2" comparison is the type check:
 * comparing pointers to different types draws a compiler diagnostic.
 */
#define __min(t1, t2, min1, min2, x, y) ({		\
	t1 min1 = (x);					\
	t2 min2 = (y);					\
	(void) (&min1 == &min2);			\
	min1 < min2 ? min1 : min2; })
/* min - smaller of x and y; temporaries get unique names via __UNIQUE_ID(). */
#define min(x, y)					\
	__min(typeof(x), typeof(y),			\
	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),	\
	      x, y)
|
|
|
|
|
|
|
|
/*
 * __max - implementation of max()
 * @t1, @t2:     types of the two operands
 * @max1, @max2: names for the two temporaries
 * @x, @y:       operands; each is evaluated exactly once
 *
 * The otherwise pointless "&max1 == &max2" comparison is the type check:
 * comparing pointers to different types draws a compiler diagnostic.
 */
#define __max(t1, t2, max1, max2, x, y) ({		\
	t1 max1 = (x);					\
	t2 max2 = (y);					\
	(void) (&max1 == &max2);			\
	max1 > max2 ? max1 : max2; })
/* max - larger of x and y; temporaries get unique names via __UNIQUE_ID(). */
#define max(x, y)					\
	__max(typeof(x), typeof(y),			\
	      __UNIQUE_ID(max1_), __UNIQUE_ID(max2_),	\
	      x, y)
|
2008-04-30 15:54:55 +08:00
|
|
|
|
include/linux/kernel.h: rewrite min3, max3 and clamp using min and max
It appears that gcc is better at optimising a double call to min and max
rather than open coded min3 and max3. This can be observed here:
$ cat min-max.c
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define min3(x, y, z) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
typeof(z) _min3 = (z); \
(void) (&_min1 == &_min2); \
(void) (&_min1 == &_min3); \
_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
(_min2 < _min3 ? _min2 : _min3); })
int fmin3(int x, int y, int z) { return min3(x, y, z); }
int fmin2(int x, int y, int z) { return min(min(x, y), z); }
$ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
.file "min-max.c"
.text
.p2align 4,,15
.globl fmin3
.type fmin3, @function
fmin3:
.LFB0:
.cfi_startproc
cmpl %esi, %edi
jl .L5
cmpl %esi, %edx
movl %esi, %eax
cmovle %edx, %eax
ret
.p2align 4,,10
.p2align 3
.L5:
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE0:
.size fmin3, .-fmin3
.p2align 4,,15
.globl fmin2
.type fmin2, @function
fmin2:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovle %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE1:
.size fmin2, .-fmin2
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
fmin3 function, which uses open-coded min3 macro, is compiled into total
of ten instructions including a conditional branch, whereas fmin2
function, which uses two calls to min2 macro, is compiled into six
instructions with no branches.
Similarly, open-coded clamp produces the same code as clamp using min and
max macros, but the latter is much shorter:
$ cat clamp.c
#define clamp(val, min, max) ({ \
typeof(val) __val = (val); \
typeof(min) __min = (min); \
typeof(max) __max = (max); \
(void) (&__val == &__min); \
(void) (&__val == &__max); \
__val = __val < __min ? __min: __val; \
__val > __max ? __max: __val; })
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
int fclamp(int v, int min, int max) { return clamp(v, min, max); }
int fclampmm(int v, int min, int max) { return min(max(v, min), max); }
$ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
.file "clamp.c"
.text
.p2align 4,,15
.globl fclamp
.type fclamp, @function
fclamp:
.LFB0:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovge %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE0:
.size fclamp, .-fclamp
.p2align 4,,15
.globl fclampmm
.type fclampmm, @function
fclampmm:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
cmovge %esi, %edi
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE1:
.size fclampmm, .-fclampmm
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
Copyright (C) 2011 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
-rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after
48 bytes reduction. The do_fault_around was a few instruction shorter
and as far as I can tell saved 12 bytes on the stack, i.e.:
$ grep -e rsp -e pop -e push do_fault_around.*
do_fault_around.before.s:push %rbp
do_fault_around.before.s:mov %rsp,%rbp
do_fault_around.before.s:push %r13
do_fault_around.before.s:push %r12
do_fault_around.before.s:push %rbx
do_fault_around.before.s:sub $0x38,%rsp
do_fault_around.before.s:add $0x38,%rsp
do_fault_around.before.s:pop %rbx
do_fault_around.before.s:pop %r12
do_fault_around.before.s:pop %r13
do_fault_around.before.s:pop %rbp
do_fault_around.after.s:push %rbp
do_fault_around.after.s:mov %rsp,%rbp
do_fault_around.after.s:push %r12
do_fault_around.after.s:push %rbx
do_fault_around.after.s:sub $0x30,%rsp
do_fault_around.after.s:add $0x30,%rsp
do_fault_around.after.s:pop %rbx
do_fault_around.after.s:pop %r12
do_fault_around.after.s:pop %rbp
or here side-by-side:
Before After
push %rbp push %rbp
mov %rsp,%rbp mov %rsp,%rbp
push %r13
push %r12 push %r12
push %rbx push %rbx
sub $0x38,%rsp sub $0x30,%rsp
add $0x38,%rsp add $0x30,%rsp
pop %rbx pop %rbx
pop %r12 pop %r12
pop %r13
pop %rbp pop %rbp
There are also fewer branches:
$ grep ^j do_fault_around.*
do_fault_around.before.s:jae ffffffff812079b7
do_fault_around.before.s:jmp ffffffff812079c5
do_fault_around.before.s:jmp ffffffff81207a14
do_fault_around.before.s:ja ffffffff812079f9
do_fault_around.before.s:jb ffffffff81207a10
do_fault_around.before.s:jmp ffffffff81207a63
do_fault_around.before.s:jne ffffffff812079df
do_fault_around.after.s:jmp ffffffff812079fd
do_fault_around.after.s:ja ffffffff812079e2
do_fault_around.after.s:jb ffffffff812079f9
do_fault_around.after.s:jmp ffffffff81207a4c
do_fault_around.after.s:jne ffffffff812079c8
And here's with allyesconfig on a different machine:
$ uname -a; gcc --version; ls -l vmlinux.*
Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
gcc (GCC) 4.8.3
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 437027411 Jun 20 16:04 vmlinux.before
-rwx------ 1 mpn eng 437026881 Jun 20 15:30 vmlinux.after
530 bytes reduction.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: David Rientjes <rientjes@google.com>
Cc: "Rustad, Mark D" <mark.d.rustad@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-10 06:30:13 +08:00
|
|
|
/**
 * min3 - return minimum of three values
 * @x: first value
 * @y: second value
 * @z: third value
 *
 * Built from two min() calls: the result of min(@x, @y) is cast to
 * typeof(@x) and then compared with @z by the outer min().
 */
#define min3(x, y, z) min((typeof(x))min(x, y), z)
|
|
|
|
/**
 * max3 - return maximum of three values
 * @x: first value
 * @y: second value
 * @z: third value
 *
 * Built from two max() calls: the result of max(@x, @y) is cast to
 * typeof(@x) and then compared with @z by the outer max().
 */
#define max3(x, y, z) max((typeof(x))max(x, y), z)
|
2010-10-27 05:22:21 +08:00
|
|
|
|
2010-09-11 02:07:38 +08:00
|
|
|
/**
 * min_not_zero - return the minimum that is _not_ zero, unless both are zero
 * @x: value1
 * @y: value2
 *
 * Each argument is evaluated exactly once into a local temporary.  If one of
 * the values is zero the other one is returned (so the result is zero only
 * when both are zero); otherwise the result is min(@x, @y).
 */
#define min_not_zero(x, y) ({			\
	typeof(x) __x = (x);			\
	typeof(y) __y = (y);			\
	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
|
|
|
|
|
2008-04-30 15:54:55 +08:00
|
|
|
/**
|
|
|
|
* clamp - return a value clamped to a given range with strict typechecking
|
|
|
|
* @val: current value
|
include/linux/kernel.h: rewrite min3, max3 and clamp using min and max
It appears that gcc is better at optimising a double call to min and max
rather than open coded min3 and max3. This can be observed here:
$ cat min-max.c
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define min3(x, y, z) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
typeof(z) _min3 = (z); \
(void) (&_min1 == &_min2); \
(void) (&_min1 == &_min3); \
_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
(_min2 < _min3 ? _min2 : _min3); })
int fmin3(int x, int y, int z) { return min3(x, y, z); }
int fmin2(int x, int y, int z) { return min(min(x, y), z); }
$ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
.file "min-max.c"
.text
.p2align 4,,15
.globl fmin3
.type fmin3, @function
fmin3:
.LFB0:
.cfi_startproc
cmpl %esi, %edi
jl .L5
cmpl %esi, %edx
movl %esi, %eax
cmovle %edx, %eax
ret
.p2align 4,,10
.p2align 3
.L5:
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE0:
.size fmin3, .-fmin3
.p2align 4,,15
.globl fmin2
.type fmin2, @function
fmin2:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovle %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE1:
.size fmin2, .-fmin2
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
The fmin3 function, which uses the open-coded min3 macro, is compiled into a
total of ten instructions including a conditional branch, whereas the fmin2
function, which uses two calls to the min macro, is compiled into six
instructions with no branches.
Similarly, open-coded clamp produces the same code as clamp using min and
max macros, but the latter is much shorter:
$ cat clamp.c
#define clamp(val, min, max) ({ \
typeof(val) __val = (val); \
typeof(min) __min = (min); \
typeof(max) __max = (max); \
(void) (&__val == &__min); \
(void) (&__val == &__max); \
__val = __val < __min ? __min: __val; \
__val > __max ? __max: __val; })
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
int fclamp(int v, int min, int max) { return clamp(v, min, max); }
int fclampmm(int v, int min, int max) { return min(max(v, min), max); }
$ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
.file "clamp.c"
.text
.p2align 4,,15
.globl fclamp
.type fclamp, @function
fclamp:
.LFB0:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovge %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE0:
.size fclamp, .-fclamp
.p2align 4,,15
.globl fclampmm
.type fclampmm, @function
fclampmm:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
cmovge %esi, %edi
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE1:
.size fclampmm, .-fclampmm
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
Copyright (C) 2011 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
-rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after
48 bytes reduction. The do_fault_around function was a few instructions
shorter and, as far as I can tell, saved 12 bytes on the stack, i.e.:
$ grep -e rsp -e pop -e push do_fault_around.*
do_fault_around.before.s:push %rbp
do_fault_around.before.s:mov %rsp,%rbp
do_fault_around.before.s:push %r13
do_fault_around.before.s:push %r12
do_fault_around.before.s:push %rbx
do_fault_around.before.s:sub $0x38,%rsp
do_fault_around.before.s:add $0x38,%rsp
do_fault_around.before.s:pop %rbx
do_fault_around.before.s:pop %r12
do_fault_around.before.s:pop %r13
do_fault_around.before.s:pop %rbp
do_fault_around.after.s:push %rbp
do_fault_around.after.s:mov %rsp,%rbp
do_fault_around.after.s:push %r12
do_fault_around.after.s:push %rbx
do_fault_around.after.s:sub $0x30,%rsp
do_fault_around.after.s:add $0x30,%rsp
do_fault_around.after.s:pop %rbx
do_fault_around.after.s:pop %r12
do_fault_around.after.s:pop %rbp
or here side-by-side:
Before After
push %rbp push %rbp
mov %rsp,%rbp mov %rsp,%rbp
push %r13
push %r12 push %r12
push %rbx push %rbx
sub $0x38,%rsp sub $0x30,%rsp
add $0x38,%rsp add $0x30,%rsp
pop %rbx pop %rbx
pop %r12 pop %r12
pop %r13
pop %rbp pop %rbp
There are also fewer branches:
$ grep ^j do_fault_around.*
do_fault_around.before.s:jae ffffffff812079b7
do_fault_around.before.s:jmp ffffffff812079c5
do_fault_around.before.s:jmp ffffffff81207a14
do_fault_around.before.s:ja ffffffff812079f9
do_fault_around.before.s:jb ffffffff81207a10
do_fault_around.before.s:jmp ffffffff81207a63
do_fault_around.before.s:jne ffffffff812079df
do_fault_around.after.s:jmp ffffffff812079fd
do_fault_around.after.s:ja ffffffff812079e2
do_fault_around.after.s:jb ffffffff812079f9
do_fault_around.after.s:jmp ffffffff81207a4c
do_fault_around.after.s:jne ffffffff812079c8
And here's with allyesconfig on a different machine:
$ uname -a; gcc --version; ls -l vmlinux.*
Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
gcc (GCC) 4.8.3
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 437027411 Jun 20 16:04 vmlinux.before
-rwx------ 1 mpn eng 437026881 Jun 20 15:30 vmlinux.after
530 bytes reduction.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: David Rientjes <rientjes@google.com>
Cc: "Rustad, Mark D" <mark.d.rustad@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-10 06:30:13 +08:00
|
|
|
* @lo: lowest allowable value
|
|
|
|
* @hi: highest allowable value
|
2008-04-30 15:54:55 +08:00
|
|
|
*
|
2014-10-10 06:30:15 +08:00
|
|
|
* This macro does strict typechecking of lo/hi to make sure they are of the
|
2008-04-30 15:54:55 +08:00
|
|
|
* same type as val. See the unnecessary pointer comparisons.
|
|
|
|
*/
|
include/linux/kernel.h: rewrite min3, max3 and clamp using min and max
It appears that gcc is better at optimising a double call to min and max
rather than open coded min3 and max3. This can be observed here:
$ cat min-max.c
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define min3(x, y, z) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
typeof(z) _min3 = (z); \
(void) (&_min1 == &_min2); \
(void) (&_min1 == &_min3); \
_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
(_min2 < _min3 ? _min2 : _min3); })
int fmin3(int x, int y, int z) { return min3(x, y, z); }
int fmin2(int x, int y, int z) { return min(min(x, y), z); }
$ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
.file "min-max.c"
.text
.p2align 4,,15
.globl fmin3
.type fmin3, @function
fmin3:
.LFB0:
.cfi_startproc
cmpl %esi, %edi
jl .L5
cmpl %esi, %edx
movl %esi, %eax
cmovle %edx, %eax
ret
.p2align 4,,10
.p2align 3
.L5:
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE0:
.size fmin3, .-fmin3
.p2align 4,,15
.globl fmin2
.type fmin2, @function
fmin2:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovle %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE1:
.size fmin2, .-fmin2
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
The fmin3 function, which uses the open-coded min3 macro, is compiled into a
total of ten instructions including a conditional branch, whereas the fmin2
function, which uses two calls to the min macro, is compiled into six
instructions with no branches.
Similarly, open-coded clamp produces the same code as clamp using min and
max macros, but the latter is much shorter:
$ cat clamp.c
#define clamp(val, min, max) ({ \
typeof(val) __val = (val); \
typeof(min) __min = (min); \
typeof(max) __max = (max); \
(void) (&__val == &__min); \
(void) (&__val == &__max); \
__val = __val < __min ? __min: __val; \
__val > __max ? __max: __val; })
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
int fclamp(int v, int min, int max) { return clamp(v, min, max); }
int fclampmm(int v, int min, int max) { return min(max(v, min), max); }
$ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
.file "clamp.c"
.text
.p2align 4,,15
.globl fclamp
.type fclamp, @function
fclamp:
.LFB0:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovge %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE0:
.size fclamp, .-fclamp
.p2align 4,,15
.globl fclampmm
.type fclampmm, @function
fclampmm:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
cmovge %esi, %edi
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE1:
.size fclampmm, .-fclampmm
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
Copyright (C) 2011 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
-rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after
48 bytes reduction. The do_fault_around function was a few instructions
shorter and, as far as I can tell, saved 12 bytes on the stack, i.e.:
$ grep -e rsp -e pop -e push do_fault_around.*
do_fault_around.before.s:push %rbp
do_fault_around.before.s:mov %rsp,%rbp
do_fault_around.before.s:push %r13
do_fault_around.before.s:push %r12
do_fault_around.before.s:push %rbx
do_fault_around.before.s:sub $0x38,%rsp
do_fault_around.before.s:add $0x38,%rsp
do_fault_around.before.s:pop %rbx
do_fault_around.before.s:pop %r12
do_fault_around.before.s:pop %r13
do_fault_around.before.s:pop %rbp
do_fault_around.after.s:push %rbp
do_fault_around.after.s:mov %rsp,%rbp
do_fault_around.after.s:push %r12
do_fault_around.after.s:push %rbx
do_fault_around.after.s:sub $0x30,%rsp
do_fault_around.after.s:add $0x30,%rsp
do_fault_around.after.s:pop %rbx
do_fault_around.after.s:pop %r12
do_fault_around.after.s:pop %rbp
or here side-by-side:
Before After
push %rbp push %rbp
mov %rsp,%rbp mov %rsp,%rbp
push %r13
push %r12 push %r12
push %rbx push %rbx
sub $0x38,%rsp sub $0x30,%rsp
add $0x38,%rsp add $0x30,%rsp
pop %rbx pop %rbx
pop %r12 pop %r12
pop %r13
pop %rbp pop %rbp
There are also fewer branches:
$ grep ^j do_fault_around.*
do_fault_around.before.s:jae ffffffff812079b7
do_fault_around.before.s:jmp ffffffff812079c5
do_fault_around.before.s:jmp ffffffff81207a14
do_fault_around.before.s:ja ffffffff812079f9
do_fault_around.before.s:jb ffffffff81207a10
do_fault_around.before.s:jmp ffffffff81207a63
do_fault_around.before.s:jne ffffffff812079df
do_fault_around.after.s:jmp ffffffff812079fd
do_fault_around.after.s:ja ffffffff812079e2
do_fault_around.after.s:jb ffffffff812079f9
do_fault_around.after.s:jmp ffffffff81207a4c
do_fault_around.after.s:jne ffffffff812079c8
And here's with allyesconfig on a different machine:
$ uname -a; gcc --version; ls -l vmlinux.*
Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
gcc (GCC) 4.8.3
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-rwx------ 1 mpn eng 437027411 Jun 20 16:04 vmlinux.before
-rwx------ 1 mpn eng 437026881 Jun 20 15:30 vmlinux.after
530 bytes reduction.
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: David Rientjes <rientjes@google.com>
Cc: "Rustad, Mark D" <mark.d.rustad@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-10 06:30:13 +08:00
|
|
|
/*
 * max() first raises val to at least lo; the result is cast back to
 * typeof(val) and then capped at hi by min().
 */
#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ..and if you can't take the strict
|
|
|
|
* types, you can specify one yourself.
|
|
|
|
*
|
2008-04-30 15:54:55 +08:00
|
|
|
* Or not use min/max/clamp at all, of course.
|
|
|
|
*/
|
2016-10-08 08:02:42 +08:00
|
|
|
/**
 * min_t - return minimum of two values, using the specified type
 * @type: data type to use for both operands
 * @x: first value
 * @y: second value
 *
 * Passes @type to __min() for both operands instead of deriving the types
 * from the arguments.  The two __UNIQUE_ID() arguments are presumably the
 * names of the temporaries declared by __min() (defined elsewhere in this
 * file - confirm there).
 */
#define min_t(type, x, y)				\
	__min(type, type,				\
	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),	\
	      x, y)
|
|
|
|
|
|
|
|
/**
 * max_t - return maximum of two values, using the specified type
 * @type: data type to use for both operands
 * @x: first value
 * @y: second value
 *
 * Passes @type to __max() for both operands instead of deriving the types
 * from the arguments.  The two __UNIQUE_ID() arguments are presumably the
 * names of the temporaries declared by __max() (defined elsewhere in this
 * file - confirm there); they use the max1_/max2_ prefixes so the generated
 * identifiers are named after this macro rather than after min_t().
 */
#define max_t(type, x, y)				\
	__max(type, type,				\
	      __UNIQUE_ID(max1_), __UNIQUE_ID(max2_),	\
	      x, y)
|
2008-04-30 15:54:55 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* clamp_t - return a value clamped to a given range using a given type
|
|
|
|
* @type: the type of variable to use
|
|
|
|
* @val: current value
|
2014-10-10 06:30:15 +08:00
|
|
|
* @lo: minimum allowable value
|
|
|
|
* @hi: maximum allowable value
|
2008-04-30 15:54:55 +08:00
|
|
|
*
|
|
|
|
* This macro does no typechecking and uses temporary variables of type
|
|
|
|
* 'type' to make all the comparisons.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2014-10-10 06:30:15 +08:00
|
|
|
/* max_t() applies the lower bound, min_t() the upper one; both are given 'type'. */
#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-30 15:54:55 +08:00
|
|
|
/**
|
|
|
|
* clamp_val - return a value clamped to a given range using val's type
|
|
|
|
* @val: current value
|
2014-10-10 06:30:15 +08:00
|
|
|
* @lo: minimum allowable value
|
|
|
|
* @hi: maximum allowable value
|
2008-04-30 15:54:55 +08:00
|
|
|
*
|
|
|
|
* This macro does no typechecking and uses temporary variables of whatever
|
|
|
|
* type the input argument 'val' is. This is useful when val is an unsigned
|
|
|
|
* type and lo and hi are literals that will otherwise be assigned a signed
|
|
|
|
* integer type.
|
|
|
|
*/
|
2014-10-10 06:30:15 +08:00
|
|
|
/* clamp_t() with the comparison type taken from val itself via typeof(). */
#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-01-08 10:09:12 +08:00
|
|
|
|
|
|
|
/**
 * swap - swap value of @a and @b
 * @a: first lvalue
 * @b: second lvalue
 *
 * The temporary takes the type of @a.  Both arguments appear more than once
 * in the expansion, so they must be side-effect-free lvalues.
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
|
2009-01-08 10:09:12 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* container_of - cast a member of a structure out to the containing structure
|
|
|
|
* @ptr: the pointer to the member.
|
|
|
|
* @type: the type of the container struct this is embedded in.
|
|
|
|
* @member: the name of the member within the struct.
|
|
|
|
*
|
|
|
|
*/
|
kernel.h: handle pointers to arrays better in container_of()
If the first parameter of container_of() is a pointer to a
non-const-qualified array type (and the third parameter names a
non-const-qualified array member), the local variable __mptr will be
defined with a const-qualified array type. In ISO C, these types are
incompatible. They work as expected in GNU C, but some versions will
issue warnings. For example, GCC 4.9 produces the warning
"initialization from incompatible pointer type".
Here is an example of where the problem occurs:
-------------------------------------------------------
#include <linux/kernel.h>
#include <linux/module.h>
MODULE_LICENSE("GPL");
struct st {
int a;
char b[16];
};
static int __init example_init(void) {
struct st t = { .a = 101, .b = "hello" };
char (*p)[16] = &t.b;
struct st *x = container_of(p, struct st, b);
printk(KERN_DEBUG "%p %p\n", (void *)&t, (void *)x);
return 0;
}
static void __exit example_exit(void) {
}
module_init(example_init);
module_exit(example_exit);
-------------------------------------------------------
Building the module with gcc-4.9 results in these warnings (where '{m}'
is the module source and '{k}' is the kernel source):
-------------------------------------------------------
In file included from {m}/example.c:1:0:
{m}/example.c: In function `example_init':
{k}/include/linux/kernel.h:854:48: warning: initialization from incompatible pointer type
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
^
{m}/example.c:14:17: note: in expansion of macro `container_of'
struct st *x = container_of(p, struct st, b);
^
{k}/include/linux/kernel.h:854:48: warning: (near initialization for `x')
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
^
{m}/example.c:14:17: note: in expansion of macro `container_of'
struct st *x = container_of(p, struct st, b);
^
-------------------------------------------------------
Replace the type checking performed by the macro to avoid these
warnings. Make sure `*(ptr)` either has type compatible with the
member, or has type compatible with `void`, ignoring qualifiers. Raise
compiler errors if this is not true. This is stronger than the previous
behaviour, which only resulted in compiler warnings for a type mismatch.
[arnd@arndb.de: fix new warnings for container_of()]
Link: http://lkml.kernel.org/r/20170620200940.90557-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/20170525120316.24473-7-abbotti@mev.co.uk
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-13 05:33:04 +08:00
|
|
|
#define container_of(ptr, type, member) ({				\
	/* void * so the member offset can be subtracted in bytes (GNU C) */ \
	void *__mptr = (void *)(ptr);					\
	/* build-time check: *ptr must have the member's type, or be void */ \
	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
			 !__same_type(*(ptr), void),			\
			 "pointer type mismatch in container_of()");	\
	((type *)(__mptr - offsetof(type, member))); })
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-07-26 08:13:03 +08:00
|
|
|
/*
 * Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD.
 *
 * NOTE(review): the macro carries no value; presumably it exists only so
 * that this header depends on the config option and every file including it
 * is rebuilt when the option changes - confirm against the build system.
 */
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
#endif
|
2011-07-26 08:13:02 +08:00
|
|
|
|
2014-03-24 09:30:34 +08:00
|
|
|
/*
 * Permissions on a sysfs file: you didn't miss the 0 prefix did you?
 *
 * Evaluates to @perms with a series of BUILD_BUG_ON_ZERO() terms added to it.
 * Each term is expected to contribute 0 and to break the build when its
 * condition is true (BUILD_BUG_ON_ZERO() is defined elsewhere - confirm):
 * the value must lie in 0..0777, read and write bits must not be wider for
 * a less privileged class than for a more privileged one, and "other" must
 * never be writable.
 */
#define VERIFY_OCTAL_PERMISSIONS(perms)						\
	(BUILD_BUG_ON_ZERO((perms) < 0) +					\
	 BUILD_BUG_ON_ZERO((perms) > 0777) +					\
	 /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */		\
	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) +	\
	 BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) +		\
	 /* USER_WRITABLE >= GROUP_WRITABLE */					\
	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) +	\
	 /* OTHER_WRITABLE? Generally considered a bad idea. */		\
	 BUILD_BUG_ON_ZERO((perms) & 2) +					\
	 (perms))
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|