From 0f8ad5f2e93425812c393c91ceb5af3d95e79b10 Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Fri, 3 Jul 2020 15:40:29 +0200
Subject: [PATCH 01/67] kcsan: Add support for atomic builtins

Some architectures (currently e.g. s390 partially) implement atomics using
the compiler's atomic builtins (__atomic_*, __sync_*). To support enabling
KCSAN on such architectures in future, or support experimental use of these
builtins, implement support for them.

We should also avoid breaking KCSAN kernels due to use (accidental or
otherwise) of atomic builtins in drivers, as has happened in the past:
https://lkml.kernel.org/r/5231d2c0-41d9-6721-e15f-a7eedf3ce69e@infradead.org

The instrumentation is subtly different from regular reads/writes: TSAN
instrumentation replaces the use of atomic builtins with a call into the
runtime, and the runtime's job is to also execute the desired atomic
operation. We rely on the __atomic_* compiler builtins, available with all
KCSAN-supported compilers, to implement each TSAN atomic instrumentation
function.

Signed-off-by: Marco Elver
Signed-off-by: Paul E. McKenney
---
 kernel/kcsan/core.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 9147ff6a12e5..682d9fd76733 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -879,3 +879,113 @@ void __tsan_init(void)
 {
 }
 EXPORT_SYMBOL(__tsan_init);
+
+/*
+ * Instrumentation for atomic builtins (__atomic_*, __sync_*).
+ *
+ * Normal kernel code _should not_ be using them directly, but some
+ * architectures may implement some or all atomics using the compilers'
+ * builtins.
+ *
+ * Note: If an architecture decides to fully implement atomics using the
+ * builtins, because they are implicitly instrumented by KCSAN (and KASAN,
+ * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via
+ * atomic-instrumented) is no longer necessary.
+ *
+ * TSAN instrumentation replaces atomic accesses with calls to any of the below
+ * functions, whose job is to also execute the operation itself.
+ */
+
+#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
+	u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
+	u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
+	{ \
+		check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+		return __atomic_load_n(ptr, memorder); \
+	} \
+	EXPORT_SYMBOL(__tsan_atomic##bits##_load); \
+	void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
+	void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
+	{ \
+		check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+		__atomic_store_n(ptr, v, memorder); \
+	} \
+	EXPORT_SYMBOL(__tsan_atomic##bits##_store)
+
+#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \
+	u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
+	u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
+	{ \
+		check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+		return __atomic_##op##suffix(ptr, v, memorder); \
+	} \
+	EXPORT_SYMBOL(__tsan_atomic##bits##_##op)
+
+/*
+ * Note: CAS operations are always classified as write, even in case they
+ * fail. We cannot perform check_access() after a write, as it might lead to
+ * false positives, in cases such as:
+ *
+ *	T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...)
+ *
+ *	T1: if (__atomic_load_n(&p->flag, ...)) {
+ *		modify *p;
+ *		p->flag = 0;
+ *	    }
+ *
+ * The only downside is that, if there are 3 threads, with one CAS that
+ * succeeds, another CAS that fails, and an unmarked racing operation, we may
+ * point at the wrong CAS as the source of the race. However, if we assume that
+ * all CAS can succeed in some other execution, the data race is still valid.
+ */
+#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \
+	int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+							       u##bits val, int mo, int fail_mo); \
+	int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+							       u##bits val, int mo, int fail_mo) \
+	{ \
+		check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+		return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
+	} \
+	EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength)
+
+#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \
+	u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+							   int mo, int fail_mo); \
+	u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+							   int mo, int fail_mo) \
+	{ \
+		check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+		__atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
+		return exp; \
+	} \
+	EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val)
+
+#define DEFINE_TSAN_ATOMIC_OPS(bits) \
+	DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \
+	DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \
+	DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \
+	DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \
+	DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \
+	DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits)
+
+DEFINE_TSAN_ATOMIC_OPS(8);
+DEFINE_TSAN_ATOMIC_OPS(16);
+DEFINE_TSAN_ATOMIC_OPS(32);
+DEFINE_TSAN_ATOMIC_OPS(64);
+
+void __tsan_atomic_thread_fence(int memorder);
+void __tsan_atomic_thread_fence(int memorder)
+{
+	__atomic_thread_fence(memorder);
+}
+EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+
+void __tsan_atomic_signal_fence(int memorder);
+void __tsan_atomic_signal_fence(int memorder) { }
+EXPORT_SYMBOL(__tsan_atomic_signal_fence);

From 883957b1c4ac5554ce515e882b7b2b20cbadfdd1 Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Fri, 3 Jul 2020 15:40:30 +0200
Subject: [PATCH 02/67] objtool: Add atomic builtin TSAN instrumentation to uaccess whitelist

Adds the new TSAN functions that may be emitted for atomic builtins to
objtool's uaccess whitelist.

Signed-off-by: Marco Elver
Signed-off-by: Paul E.
McKenney Cc: Josh Poimboeuf Cc: Peter Zijlstra --- tools/objtool/check.c | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e034a8f24f46..7546a9d4e3cf 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -528,6 +528,56 @@ static const char *uaccess_safe_builtin[] = { "__tsan_write4", "__tsan_write8", "__tsan_write16", + "__tsan_atomic8_load", + "__tsan_atomic16_load", + "__tsan_atomic32_load", + "__tsan_atomic64_load", + "__tsan_atomic8_store", + "__tsan_atomic16_store", + "__tsan_atomic32_store", + "__tsan_atomic64_store", + "__tsan_atomic8_exchange", + "__tsan_atomic16_exchange", + "__tsan_atomic32_exchange", + "__tsan_atomic64_exchange", + "__tsan_atomic8_fetch_add", + "__tsan_atomic16_fetch_add", + "__tsan_atomic32_fetch_add", + "__tsan_atomic64_fetch_add", + "__tsan_atomic8_fetch_sub", + "__tsan_atomic16_fetch_sub", + "__tsan_atomic32_fetch_sub", + "__tsan_atomic64_fetch_sub", + "__tsan_atomic8_fetch_and", + "__tsan_atomic16_fetch_and", + "__tsan_atomic32_fetch_and", + "__tsan_atomic64_fetch_and", + "__tsan_atomic8_fetch_or", + "__tsan_atomic16_fetch_or", + "__tsan_atomic32_fetch_or", + "__tsan_atomic64_fetch_or", + "__tsan_atomic8_fetch_xor", + "__tsan_atomic16_fetch_xor", + "__tsan_atomic32_fetch_xor", + "__tsan_atomic64_fetch_xor", + "__tsan_atomic8_fetch_nand", + "__tsan_atomic16_fetch_nand", + "__tsan_atomic32_fetch_nand", + "__tsan_atomic64_fetch_nand", + "__tsan_atomic8_compare_exchange_strong", + "__tsan_atomic16_compare_exchange_strong", + "__tsan_atomic32_compare_exchange_strong", + "__tsan_atomic64_compare_exchange_strong", + "__tsan_atomic8_compare_exchange_weak", + "__tsan_atomic16_compare_exchange_weak", + "__tsan_atomic32_compare_exchange_weak", + "__tsan_atomic64_compare_exchange_weak", + "__tsan_atomic8_compare_exchange_val", + "__tsan_atomic16_compare_exchange_val", + "__tsan_atomic32_compare_exchange_val", + "__tsan_atomic64_compare_exchange_val", + "__tsan_atomic_thread_fence", + "__tsan_atomic_signal_fence", /* KCOV */ "write_comp_data", "check_kcov_mode", From f9ea63193135473ed6b6ff06f016eb6248100041 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 3 Jul 2020 15:40:31 +0200 Subject: [PATCH 03/67] kcsan: Add atomic builtin test case Adds test case to kcsan-test module, to test atomic builtin instrumentation works. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index fed6fcb5768c..721180cbbab1 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -390,6 +390,15 @@ static noinline void test_kernel_seqlock_writer(void) write_sequnlock_irqrestore(&test_seqlock, flags); } +static noinline void test_kernel_atomic_builtins(void) +{ + /* + * Generate concurrent accesses, expecting no reports, ensuring KCSAN + * treats builtin atomics as actually atomic. + */ + __atomic_load_n(&test_var, __ATOMIC_RELAXED); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -852,6 +861,59 @@ static void test_seqlock_noreport(struct kunit *test) KUNIT_EXPECT_FALSE(test, match_never); } +/* + * Test atomic builtins work and required instrumentation functions exist. We + * also test that KCSAN understands they're atomic by racing with them via + * test_kernel_atomic_builtins(), and expect no reports. 
+ * + * The atomic builtins _SHOULD NOT_ be used in normal kernel code! + */ +static void test_atomic_builtins(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins); + do { + long tmp; + + kcsan_enable_current(); + + __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED); + KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED)); + + KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 20L, test_var); + + tmp = 20L; + KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L, + 0, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 20L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L, + 1, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 30L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, -2L, test_var); + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + kcsan_disable_current(); + + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + /* * Each test case is run with different numbers of threads. Until KUnit supports * passing arguments for each test case, we encode #threads in the test case @@ -891,6 +953,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), + KCSAN_KUNIT_CASE(test_atomic_builtins), {}, }; From 14e2ac8de0f91f12122a49f09897b0cd05256460 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:01 +0200 Subject: [PATCH 04/67] kcsan: Support compounded read-write instrumentation Add support for compounded read-write instrumentation if supported by the compiler. Adds the necessary instrumentation functions, and a new type which is used to generate a more descriptive report. Furthermore, such compounded memory access instrumentation is excluded from the "assume aligned writes up to word size are atomic" rule, because we cannot assume that the compiler emits code that is atomic for compound ops. LLVM/Clang added support for the feature in: https://github.com/llvm/llvm-project/commit/785d41a261d136b64ab6c15c5d35f2adc5ad53e3 The new instrumentation is emitted for sets of memory accesses in the same basic block to the same address with at least one read appearing before a write. These typically result from compound operations such as ++, --, +=, -=, |=, &=, etc. but also equivalent forms such as "var = var + 1". Where the compiler determines that it is equivalent to emit a call to a single __tsan_read_write instead of separate __tsan_read and __tsan_write, we can then benefit from improved performance and better reporting for such access patterns. 
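As an illustration, consider the following sketch (not code taken from this
patch; it assumes a 64-bit build where sizeof(long) == 8, so the matching
instrumentation call is __tsan_read_write8(), and it uses made-up names).
With a compiler that accepts the tsan-compound-read-before-write=1 parameter
enabled by this patch in scripts/Makefile.kcsan, the statement below may be
instrumented with a single __tsan_read_write8() call instead of a
__tsan_read8() followed by a __tsan_write8():

	/* Shared counter, updated concurrently without synchronization. */
	static long counter;

	void update_counter(void)
	{
		/*
		 * Compound read-modify-write: a read of "counter" followed by
		 * a write to the same address in the same basic block. The
		 * compiler may combine both accesses into one
		 * __tsan_read_write8() call, which KCSAN then reports as a
		 * "read-write" access.
		 */
		counter += 1;
	}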
The new reports now show that the ops are both reads and writes, for example: read-write to 0xffffffff90548a38 of 8 bytes by task 143 on cpu 3: test_kernel_rmw_array+0x45/0xa0 access_thread+0x71/0xb0 kthread+0x21e/0x240 ret_from_fork+0x22/0x30 read-write to 0xffffffff90548a38 of 8 bytes by task 144 on cpu 2: test_kernel_rmw_array+0x45/0xa0 access_thread+0x71/0xb0 kthread+0x21e/0x240 ret_from_fork+0x22/0x30 Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 45 ++++++++++++++++++++++++------------ kernel/kcsan/core.c | 23 ++++++++++++++---- kernel/kcsan/report.c | 4 ++++ scripts/Makefile.kcsan | 2 +- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index c5f6c1dcf7e3..cf14840609ce 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -7,19 +7,13 @@ #include #include -/* - * ACCESS TYPE MODIFIERS - * - * : normal read access; - * WRITE : write access; - * ATOMIC: access is atomic; - * ASSERT: access is not a regular access, but an assertion; - * SCOPED: access is a scoped access; - */ -#define KCSAN_ACCESS_WRITE 0x1 -#define KCSAN_ACCESS_ATOMIC 0x2 -#define KCSAN_ACCESS_ASSERT 0x4 -#define KCSAN_ACCESS_SCOPED 0x8 +/* Access types -- if KCSAN_ACCESS_WRITE is not set, the access is a read. */ +#define KCSAN_ACCESS_WRITE (1 << 0) /* Access is a write. */ +#define KCSAN_ACCESS_COMPOUND (1 << 1) /* Compounded read-write instrumentation. */ +#define KCSAN_ACCESS_ATOMIC (1 << 2) /* Access is atomic. */ +/* The following are special, and never due to compiler instrumentation. */ +#define KCSAN_ACCESS_ASSERT (1 << 3) /* Access is an assertion. */ +#define KCSAN_ACCESS_SCOPED (1 << 4) /* Access is a scoped access. */ /* * __kcsan_*: Always calls into the runtime when KCSAN is enabled. This may be used @@ -204,6 +198,15 @@ static inline void __kcsan_disable_current(void) { } #define __kcsan_check_write(ptr, size) \ __kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) +/** + * __kcsan_check_read_write - check regular read-write access for races + * + * @ptr: address of access + * @size: size of access + */ +#define __kcsan_check_read_write(ptr, size) \ + __kcsan_check_access(ptr, size, KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) + /** * kcsan_check_read - check regular read access for races * @@ -221,18 +224,30 @@ static inline void __kcsan_disable_current(void) { } #define kcsan_check_write(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) +/** + * kcsan_check_read_write - check regular read-write access for races + * + * @ptr: address of access + * @size: size of access + */ +#define kcsan_check_read_write(ptr, size) \ + kcsan_check_access(ptr, size, KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) + /* * Check for atomic accesses: if atomic accesses are not ignored, this simply * aliases to kcsan_check_access(), otherwise becomes a no-op. */ #ifdef CONFIG_KCSAN_IGNORE_ATOMICS -#define kcsan_check_atomic_read(...) do { } while (0) -#define kcsan_check_atomic_write(...) do { } while (0) +#define kcsan_check_atomic_read(...) do { } while (0) +#define kcsan_check_atomic_write(...) do { } while (0) +#define kcsan_check_atomic_read_write(...) 
do { } while (0) #else #define kcsan_check_atomic_read(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC) #define kcsan_check_atomic_write(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE) +#define kcsan_check_atomic_read_write(ptr, size) \ + kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND) #endif /** diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 682d9fd76733..4c8b40b14314 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -223,7 +223,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && - IS_ALIGNED((unsigned long)ptr, size)) + !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ if (ctx->atomic_next > 0) { @@ -793,7 +793,17 @@ EXPORT_SYMBOL(__kcsan_check_access); EXPORT_SYMBOL(__tsan_write##size); \ void __tsan_unaligned_write##size(void *ptr) \ __alias(__tsan_write##size); \ - EXPORT_SYMBOL(__tsan_unaligned_write##size) + EXPORT_SYMBOL(__tsan_unaligned_write##size); \ + void __tsan_read_write##size(void *ptr); \ + void __tsan_read_write##size(void *ptr) \ + { \ + check_access(ptr, size, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_read_write##size); \ + void __tsan_unaligned_read_write##size(void *ptr) \ + __alias(__tsan_read_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read_write##size) DEFINE_TSAN_READ_WRITE(1); DEFINE_TSAN_READ_WRITE(2); @@ -916,7 +926,8 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ return __atomic_##op##suffix(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_##op) @@ -944,7 +955,8 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) @@ -955,7 +967,8 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ return exp; \ } \ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9d07e175de0f..3e83a69239fa 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -228,6 +228,10 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: + return "read-write"; + case KCSAN_ACCESS_COMPOUND | 
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: return "read (scoped)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: diff --git a/scripts/Makefile.kcsan b/scripts/Makefile.kcsan index c50f27b3ac56..c37f9518d5d9 100644 --- a/scripts/Makefile.kcsan +++ b/scripts/Makefile.kcsan @@ -11,5 +11,5 @@ endif # of some options does not break KCSAN nor causes false positive reports. CFLAGS_KCSAN := -fsanitize=thread \ $(call cc-option,$(call cc-param,tsan-instrument-func-entry-exit=0) -fno-optimize-sibling-calls) \ - $(call cc-option,$(call cc-param,tsan-instrument-read-before-write=1)) \ + $(call cc-option,$(call cc-param,tsan-compound-read-before-write=1),$(call cc-option,$(call cc-param,tsan-instrument-read-before-write=1))) \ $(call cc-param,tsan-distinguish-volatile=1) From a81b37590ff2e2507940ec278910b1d315dc73b3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:02 +0200 Subject: [PATCH 05/67] objtool, kcsan: Add __tsan_read_write to uaccess whitelist Adds the new __tsan_read_write compound instrumentation to objtool's uaccess whitelist. Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- tools/objtool/check.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7546a9d4e3cf..5eee156f6f90 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -528,6 +528,11 @@ static const char *uaccess_safe_builtin[] = { "__tsan_write4", "__tsan_write8", "__tsan_write16", + "__tsan_read_write1", + "__tsan_read_write2", + "__tsan_read_write4", + "__tsan_read_write8", + "__tsan_read_write16", "__tsan_atomic8_load", "__tsan_atomic16_load", "__tsan_atomic32_load", From 106a307fd0a762e2d47e1cf99e6da43763887a18 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:03 +0200 Subject: [PATCH 06/67] kcsan: Skew delay to be longer for certain access types For compound instrumentation and assert accesses, skew the watchpoint delay to be longer if randomized. This is useful to improve race detection for such accesses. For compound accesses we should increase the delay as we've aggregated both read and write instrumentation. By giving up 1 call into the runtime, we're less likely to set up a watchpoint and thus less likely to detect a race. We can balance this by increasing the watchpoint delay. For assert accesses, we know these are of increased interest, and we wish to increase our chances of detecting races for such checks. Note that, kcsan_udelay_{task,interrupt} define the upper bound delays. When randomized, delays are uniformly distributed between [0, delay]. Skewing the delay does not break this promise as long as the defined upper bounds are still adhered to. The current skew results in delays uniformly distributed between [delay/2, delay]. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4c8b40b14314..95a364e22aab 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -283,11 +283,15 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(void) +static inline unsigned int get_delay(int type) { unsigned int delay = in_task() ? 
kcsan_udelay_task : kcsan_udelay_interrupt; + /* For certain access types, skew the random delay to be longer. */ + unsigned int skew_delay_order = + (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay) : + prandom_u32_max(delay >> skew_delay_order) : 0); } @@ -470,7 +474,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay()); + udelay(get_delay(type)); /* * Re-read value, and check if it is as expected; if not, we infer a From 9d1335cc1e97cc3da0d14f640dd716e614083e8b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:04 +0200 Subject: [PATCH 07/67] kcsan: Add missing CONFIG_KCSAN_IGNORE_ATOMICS checks Add missing CONFIG_KCSAN_IGNORE_ATOMICS checks for the builtin atomics instrumentation. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 95a364e22aab..99e5044bc942 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -914,14 +914,19 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_load_n(ptr, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + } \ __atomic_store_n(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_store) @@ -930,8 +935,11 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_##op##suffix(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_##op) @@ -959,8 +967,11 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) @@ -971,8 +982,11 @@ EXPORT_SYMBOL(__tsan_init); u##bits 
__tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ return exp; \ } \ From bec4a2474890a6884eb890c778ea02bccaaae6eb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:05 +0200 Subject: [PATCH 08/67] kcsan: Test support for compound instrumentation Changes kcsan-test module to support checking reports that include compound instrumentation. Since we should not fail the test if this support is unavailable, we have to add a config variable that the test can use to decide what to check for. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 65 ++++++++++++++++++++++++++++++--------- lib/Kconfig.kcsan | 5 +++ 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index 721180cbbab1..ebe7fd245104 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -27,6 +27,12 @@ #include #include +#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE +#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) +#else +#define __KCSAN_ACCESS_RW(alt) (alt) +#endif + /* Points to current test-case memory access "kernels". */ static void (*access_kernels[2])(void); @@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r) /* Access 1 & 2 */ for (i = 0; i < 2; ++i) { + const int ty = r->access[i].type; const char *const access_type = - (r->access[i].type & KCSAN_ACCESS_ASSERT) ? - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "assert no accesses" : - "assert no writes") : - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "write" : - "read"); + (ty & KCSAN_ACCESS_ASSERT) ? + ((ty & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((ty & KCSAN_ACCESS_WRITE) ? + ((ty & KCSAN_ACCESS_COMPOUND) ? + "read-write" : + "write") : + "read"); const char *const access_type_aux = - (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? - " (marked)" : - ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? - " (scoped)" : - ""); + (ty & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : ""); if (i == 1) { /* Access 2 */ @@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void) WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); } +static noinline void test_kernel_atomic_rmw(void) +{ + /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */ + __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED); +} + __no_kcsan static noinline void test_kernel_write_uninstrumented(void) { test_var++; } @@ -439,8 +452,8 @@ static void test_concurrent_races(struct kunit *test) const struct expect_report expect = { .access = { /* NULL will match any address. 
*/ - { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, - { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; static const struct expect_report never = { @@ -629,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect); } +/* Test that atomic RMWs generate correct report. */ +__no_kcsan +static void test_read_plain_atomic_rmw(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_atomic_rmw, &test_var, sizeof(test_var), + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_atomic_rmw); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + /* Zero-sized accesses should never cause data race reports. */ __no_kcsan static void test_zero_size_access(struct kunit *test) @@ -942,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_write_write_struct_part), KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw), KCSAN_KUNIT_CASE(test_zero_size_access), KCSAN_KUNIT_CASE(test_data_race), KCSAN_KUNIT_CASE(test_assert_exclusive_writer), diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 3d282d51849b..f271ff5fbb5a 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -40,6 +40,11 @@ menuconfig KCSAN if KCSAN +# Compiler capabilities that should not fail the test if they are unavailable. +config CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE + def_bool (CC_IS_CLANG && $(cc-option,-fsanitize=thread -mllvm -tsan-compound-read-before-write=1)) || \ + (CC_IS_GCC && $(cc-option,-fsanitize=thread --param tsan-compound-read-before-write=1)) + config KCSAN_VERBOSE bool "Show verbose reports with more information about system state" depends on PROVE_LOCKING From 00047c2e6d7c576c1a847f7db07ef0fc58085f22 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:06 +0200 Subject: [PATCH 09/67] instrumented.h: Introduce read-write instrumentation hooks Introduce read-write instrumentation hooks, to more precisely denote an operation's behaviour. KCSAN is able to distinguish compound instrumentation, and with the new instrumentation we then benefit from improved reporting. More importantly, read-write compound operations should not implicitly be treated as atomic, if they aren't actually atomic. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/instrumented.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 43e6ea591975..42faebbaa202 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -42,6 +42,21 @@ static __always_inline void instrument_write(const volatile void *v, size_t size kcsan_check_write(v, size); } +/** + * instrument_read_write - instrument regular read-write access + * + * Instrument a regular write access. The instrumentation should be inserted + * before the actual write happens. 
+ * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_read_write(const volatile void *v, size_t size) +{ + kasan_check_write(v, size); + kcsan_check_read_write(v, size); +} + /** * instrument_atomic_read - instrument atomic read access * @@ -72,6 +87,21 @@ static __always_inline void instrument_atomic_write(const volatile void *v, size kcsan_check_atomic_write(v, size); } +/** + * instrument_atomic_read_write - instrument atomic read-write access + * + * Instrument an atomic read-write access. The instrumentation should be + * inserted before the actual write happens. + * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) +{ + kasan_check_write(v, size); + kcsan_check_atomic_read_write(v, size); +} + /** * instrument_copy_to_user - instrument reads of copy_to_user * From b159eeccb75a7916278d95e2ff5540e670682748 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:07 +0200 Subject: [PATCH 10/67] asm-generic/bitops: Use instrument_read_write() where appropriate Use the new instrument_read_write() where appropriate. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/asm-generic/bitops/instrumented-atomic.h | 6 +++--- include/asm-generic/bitops/instrumented-lock.h | 2 +- include/asm-generic/bitops/instrumented-non-atomic.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/bitops/instrumented-atomic.h b/include/asm-generic/bitops/instrumented-atomic.h index fb2cb33a4013..81915dcd4b4e 100644 --- a/include/asm-generic/bitops/instrumented-atomic.h +++ b/include/asm-generic/bitops/instrumented-atomic.h @@ -67,7 +67,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) */ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit(nr, addr); } @@ -80,7 +80,7 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_clear_bit(nr, addr); } @@ -93,7 +93,7 @@ static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_change_bit(nr, addr); } diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h index b9bec468ae03..75ef606f7145 100644 --- a/include/asm-generic/bitops/instrumented-lock.h +++ b/include/asm-generic/bitops/instrumented-lock.h @@ -52,7 +52,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) */ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) { - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit_lock(nr, addr); } diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h 
index 20f788a25ef9..f86234c7c10c 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -68,7 +68,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) { - instrument_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch___test_and_set_bit(nr, addr); } @@ -82,7 +82,7 @@ static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) { - instrument_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch___test_and_clear_bit(nr, addr); } @@ -96,7 +96,7 @@ static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) { - instrument_write(addr + BIT_WORD(nr), sizeof(long)); + instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch___test_and_change_bit(nr, addr); } From 3570a1bcf45e9a7ddf9ba0e8d6d57cc67675cfef Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:08 +0200 Subject: [PATCH 11/67] locking/atomics: Use read-write instrumentation for atomic RMWs Use instrument_atomic_read_write() for atomic RMW ops. Cc: Will Deacon Cc: Boqun Feng Cc: Arnd Bergmann Cc: Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/asm-generic/atomic-instrumented.h | 330 +++++++++++----------- scripts/atomic/gen-atomic-instrumented.sh | 21 +- 2 files changed, 180 insertions(+), 171 deletions(-) diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h index 379986e40159..cd223b68b69d 100644 --- a/include/asm-generic/atomic-instrumented.h +++ b/include/asm-generic/atomic-instrumented.h @@ -60,7 +60,7 @@ atomic_set_release(atomic_t *v, int i) static __always_inline void atomic_add(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_add(i, v); } #define atomic_add atomic_add @@ -69,7 +69,7 @@ atomic_add(int i, atomic_t *v) static __always_inline int atomic_add_return(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return(i, v); } #define atomic_add_return atomic_add_return @@ -79,7 +79,7 @@ atomic_add_return(int i, atomic_t *v) static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_acquire(i, v); } #define atomic_add_return_acquire atomic_add_return_acquire @@ -89,7 +89,7 @@ atomic_add_return_acquire(int i, atomic_t *v) static __always_inline int atomic_add_return_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_release(i, v); } #define atomic_add_return_release atomic_add_return_release @@ -99,7 +99,7 @@ atomic_add_return_release(int i, atomic_t *v) static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_relaxed(i, v); } #define atomic_add_return_relaxed atomic_add_return_relaxed @@ 
-109,7 +109,7 @@ atomic_add_return_relaxed(int i, atomic_t *v) static __always_inline int atomic_fetch_add(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add(i, v); } #define atomic_fetch_add atomic_fetch_add @@ -119,7 +119,7 @@ atomic_fetch_add(int i, atomic_t *v) static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_acquire(i, v); } #define atomic_fetch_add_acquire atomic_fetch_add_acquire @@ -129,7 +129,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_release(i, v); } #define atomic_fetch_add_release atomic_fetch_add_release @@ -139,7 +139,7 @@ atomic_fetch_add_release(int i, atomic_t *v) static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_relaxed(i, v); } #define atomic_fetch_add_relaxed atomic_fetch_add_relaxed @@ -148,7 +148,7 @@ atomic_fetch_add_relaxed(int i, atomic_t *v) static __always_inline void atomic_sub(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_sub(i, v); } #define atomic_sub atomic_sub @@ -157,7 +157,7 @@ atomic_sub(int i, atomic_t *v) static __always_inline int atomic_sub_return(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return(i, v); } #define atomic_sub_return atomic_sub_return @@ -167,7 +167,7 @@ atomic_sub_return(int i, atomic_t *v) static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_acquire(i, v); } #define atomic_sub_return_acquire atomic_sub_return_acquire @@ -177,7 +177,7 @@ atomic_sub_return_acquire(int i, atomic_t *v) static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_release(i, v); } #define atomic_sub_return_release atomic_sub_return_release @@ -187,7 +187,7 @@ atomic_sub_return_release(int i, atomic_t *v) static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_relaxed(i, v); } #define atomic_sub_return_relaxed atomic_sub_return_relaxed @@ -197,7 +197,7 @@ atomic_sub_return_relaxed(int i, atomic_t *v) static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub(i, v); } #define atomic_fetch_sub atomic_fetch_sub @@ -207,7 +207,7 @@ atomic_fetch_sub(int i, atomic_t *v) static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_acquire(i, v); } #define atomic_fetch_sub_acquire atomic_fetch_sub_acquire @@ -217,7 +217,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v) static 
__always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_release(i, v); } #define atomic_fetch_sub_release atomic_fetch_sub_release @@ -227,7 +227,7 @@ atomic_fetch_sub_release(int i, atomic_t *v) static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_relaxed(i, v); } #define atomic_fetch_sub_relaxed atomic_fetch_sub_relaxed @@ -237,7 +237,7 @@ atomic_fetch_sub_relaxed(int i, atomic_t *v) static __always_inline void atomic_inc(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_inc(v); } #define atomic_inc atomic_inc @@ -247,7 +247,7 @@ atomic_inc(atomic_t *v) static __always_inline int atomic_inc_return(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return(v); } #define atomic_inc_return atomic_inc_return @@ -257,7 +257,7 @@ atomic_inc_return(atomic_t *v) static __always_inline int atomic_inc_return_acquire(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_acquire(v); } #define atomic_inc_return_acquire atomic_inc_return_acquire @@ -267,7 +267,7 @@ atomic_inc_return_acquire(atomic_t *v) static __always_inline int atomic_inc_return_release(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_release(v); } #define atomic_inc_return_release atomic_inc_return_release @@ -277,7 +277,7 @@ atomic_inc_return_release(atomic_t *v) static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_relaxed(v); } #define atomic_inc_return_relaxed atomic_inc_return_relaxed @@ -287,7 +287,7 @@ atomic_inc_return_relaxed(atomic_t *v) static __always_inline int atomic_fetch_inc(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc(v); } #define atomic_fetch_inc atomic_fetch_inc @@ -297,7 +297,7 @@ atomic_fetch_inc(atomic_t *v) static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_acquire(v); } #define atomic_fetch_inc_acquire atomic_fetch_inc_acquire @@ -307,7 +307,7 @@ atomic_fetch_inc_acquire(atomic_t *v) static __always_inline int atomic_fetch_inc_release(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_release(v); } #define atomic_fetch_inc_release atomic_fetch_inc_release @@ -317,7 +317,7 @@ atomic_fetch_inc_release(atomic_t *v) static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_relaxed(v); } #define atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed @@ -327,7 +327,7 @@ atomic_fetch_inc_relaxed(atomic_t *v) static __always_inline void atomic_dec(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_dec(v); } #define atomic_dec atomic_dec @@ 
-337,7 +337,7 @@ atomic_dec(atomic_t *v) static __always_inline int atomic_dec_return(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return(v); } #define atomic_dec_return atomic_dec_return @@ -347,7 +347,7 @@ atomic_dec_return(atomic_t *v) static __always_inline int atomic_dec_return_acquire(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_acquire(v); } #define atomic_dec_return_acquire atomic_dec_return_acquire @@ -357,7 +357,7 @@ atomic_dec_return_acquire(atomic_t *v) static __always_inline int atomic_dec_return_release(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_release(v); } #define atomic_dec_return_release atomic_dec_return_release @@ -367,7 +367,7 @@ atomic_dec_return_release(atomic_t *v) static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_relaxed(v); } #define atomic_dec_return_relaxed atomic_dec_return_relaxed @@ -377,7 +377,7 @@ atomic_dec_return_relaxed(atomic_t *v) static __always_inline int atomic_fetch_dec(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec(v); } #define atomic_fetch_dec atomic_fetch_dec @@ -387,7 +387,7 @@ atomic_fetch_dec(atomic_t *v) static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_acquire(v); } #define atomic_fetch_dec_acquire atomic_fetch_dec_acquire @@ -397,7 +397,7 @@ atomic_fetch_dec_acquire(atomic_t *v) static __always_inline int atomic_fetch_dec_release(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_release(v); } #define atomic_fetch_dec_release atomic_fetch_dec_release @@ -407,7 +407,7 @@ atomic_fetch_dec_release(atomic_t *v) static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_relaxed(v); } #define atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed @@ -416,7 +416,7 @@ atomic_fetch_dec_relaxed(atomic_t *v) static __always_inline void atomic_and(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_and(i, v); } #define atomic_and atomic_and @@ -425,7 +425,7 @@ atomic_and(int i, atomic_t *v) static __always_inline int atomic_fetch_and(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and(i, v); } #define atomic_fetch_and atomic_fetch_and @@ -435,7 +435,7 @@ atomic_fetch_and(int i, atomic_t *v) static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_acquire(i, v); } #define atomic_fetch_and_acquire atomic_fetch_and_acquire @@ -445,7 +445,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_fetch_and_release(i, v); } #define atomic_fetch_and_release atomic_fetch_and_release @@ -455,7 +455,7 @@ atomic_fetch_and_release(int i, atomic_t *v) static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_relaxed(i, v); } #define atomic_fetch_and_relaxed atomic_fetch_and_relaxed @@ -465,7 +465,7 @@ atomic_fetch_and_relaxed(int i, atomic_t *v) static __always_inline void atomic_andnot(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_andnot(i, v); } #define atomic_andnot atomic_andnot @@ -475,7 +475,7 @@ atomic_andnot(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot(i, v); } #define atomic_fetch_andnot atomic_fetch_andnot @@ -485,7 +485,7 @@ atomic_fetch_andnot(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_acquire(i, v); } #define atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire @@ -495,7 +495,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_release(i, v); } #define atomic_fetch_andnot_release atomic_fetch_andnot_release @@ -505,7 +505,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_relaxed(i, v); } #define atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed @@ -514,7 +514,7 @@ atomic_fetch_andnot_relaxed(int i, atomic_t *v) static __always_inline void atomic_or(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_or(i, v); } #define atomic_or atomic_or @@ -523,7 +523,7 @@ atomic_or(int i, atomic_t *v) static __always_inline int atomic_fetch_or(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or(i, v); } #define atomic_fetch_or atomic_fetch_or @@ -533,7 +533,7 @@ atomic_fetch_or(int i, atomic_t *v) static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_acquire(i, v); } #define atomic_fetch_or_acquire atomic_fetch_or_acquire @@ -543,7 +543,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_release(i, v); } #define atomic_fetch_or_release atomic_fetch_or_release @@ -553,7 +553,7 @@ atomic_fetch_or_release(int i, atomic_t *v) static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_relaxed(i, v); } #define atomic_fetch_or_relaxed 
atomic_fetch_or_relaxed @@ -562,7 +562,7 @@ atomic_fetch_or_relaxed(int i, atomic_t *v) static __always_inline void atomic_xor(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_xor(i, v); } #define atomic_xor atomic_xor @@ -571,7 +571,7 @@ atomic_xor(int i, atomic_t *v) static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor(i, v); } #define atomic_fetch_xor atomic_fetch_xor @@ -581,7 +581,7 @@ atomic_fetch_xor(int i, atomic_t *v) static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_acquire(i, v); } #define atomic_fetch_xor_acquire atomic_fetch_xor_acquire @@ -591,7 +591,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_release(i, v); } #define atomic_fetch_xor_release atomic_fetch_xor_release @@ -601,7 +601,7 @@ atomic_fetch_xor_release(int i, atomic_t *v) static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_relaxed(i, v); } #define atomic_fetch_xor_relaxed atomic_fetch_xor_relaxed @@ -611,7 +611,7 @@ atomic_fetch_xor_relaxed(int i, atomic_t *v) static __always_inline int atomic_xchg(atomic_t *v, int i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg(v, i); } #define atomic_xchg atomic_xchg @@ -621,7 +621,7 @@ atomic_xchg(atomic_t *v, int i) static __always_inline int atomic_xchg_acquire(atomic_t *v, int i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_acquire(v, i); } #define atomic_xchg_acquire atomic_xchg_acquire @@ -631,7 +631,7 @@ atomic_xchg_acquire(atomic_t *v, int i) static __always_inline int atomic_xchg_release(atomic_t *v, int i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_release(v, i); } #define atomic_xchg_release atomic_xchg_release @@ -641,7 +641,7 @@ atomic_xchg_release(atomic_t *v, int i) static __always_inline int atomic_xchg_relaxed(atomic_t *v, int i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_relaxed(v, i); } #define atomic_xchg_relaxed atomic_xchg_relaxed @@ -651,7 +651,7 @@ atomic_xchg_relaxed(atomic_t *v, int i) static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg(v, old, new); } #define atomic_cmpxchg atomic_cmpxchg @@ -661,7 +661,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new) static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_acquire(v, old, new); } #define atomic_cmpxchg_acquire atomic_cmpxchg_acquire @@ -671,7 +671,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, 
int new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_release(v, old, new); } #define atomic_cmpxchg_release atomic_cmpxchg_release @@ -681,7 +681,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new) static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_relaxed(v, old, new); } #define atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed @@ -691,8 +691,8 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg(v, old, new); } #define atomic_try_cmpxchg atomic_try_cmpxchg @@ -702,8 +702,8 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_acquire(v, old, new); } #define atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire @@ -713,8 +713,8 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_release(v, old, new); } #define atomic_try_cmpxchg_release atomic_try_cmpxchg_release @@ -724,8 +724,8 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_relaxed(v, old, new); } #define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed @@ -735,7 +735,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_and_test(i, v); } #define atomic_sub_and_test atomic_sub_and_test @@ -745,7 +745,7 @@ atomic_sub_and_test(int i, atomic_t *v) static __always_inline bool atomic_dec_and_test(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_and_test(v); } #define atomic_dec_and_test atomic_dec_and_test @@ -755,7 +755,7 @@ atomic_dec_and_test(atomic_t *v) static __always_inline bool atomic_inc_and_test(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_and_test(v); } #define atomic_inc_and_test atomic_inc_and_test @@ -765,7 +765,7 @@ atomic_inc_and_test(atomic_t *v) static __always_inline bool atomic_add_negative(int i, atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_add_negative(i, v); } #define atomic_add_negative atomic_add_negative @@ -775,7 +775,7 @@ atomic_add_negative(int i, atomic_t *v) static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_unless(v, a, u); } #define atomic_fetch_add_unless atomic_fetch_add_unless @@ -785,7 +785,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_unless(v, a, u); } #define atomic_add_unless atomic_add_unless @@ -795,7 +795,7 @@ atomic_add_unless(atomic_t *v, int a, int u) static __always_inline bool atomic_inc_not_zero(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_not_zero(v); } #define atomic_inc_not_zero atomic_inc_not_zero @@ -805,7 +805,7 @@ atomic_inc_not_zero(atomic_t *v) static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_unless_negative(v); } #define atomic_inc_unless_negative atomic_inc_unless_negative @@ -815,7 +815,7 @@ atomic_inc_unless_negative(atomic_t *v) static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_unless_positive(v); } #define atomic_dec_unless_positive atomic_dec_unless_positive @@ -825,7 +825,7 @@ atomic_dec_unless_positive(atomic_t *v) static __always_inline int atomic_dec_if_positive(atomic_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_if_positive(v); } #define atomic_dec_if_positive atomic_dec_if_positive @@ -870,7 +870,7 @@ atomic64_set_release(atomic64_t *v, s64 i) static __always_inline void atomic64_add(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_add(i, v); } #define atomic64_add atomic64_add @@ -879,7 +879,7 @@ atomic64_add(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return(i, v); } #define atomic64_add_return atomic64_add_return @@ -889,7 +889,7 @@ atomic64_add_return(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_acquire(i, v); } #define atomic64_add_return_acquire atomic64_add_return_acquire @@ -899,7 +899,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_release(i, v); } #define atomic64_add_return_release atomic64_add_return_release @@ -909,7 +909,7 @@ atomic64_add_return_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_relaxed(i, v); 
} #define atomic64_add_return_relaxed atomic64_add_return_relaxed @@ -919,7 +919,7 @@ atomic64_add_return_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add(i, v); } #define atomic64_fetch_add atomic64_fetch_add @@ -929,7 +929,7 @@ atomic64_fetch_add(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_acquire(i, v); } #define atomic64_fetch_add_acquire atomic64_fetch_add_acquire @@ -939,7 +939,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_release(i, v); } #define atomic64_fetch_add_release atomic64_fetch_add_release @@ -949,7 +949,7 @@ atomic64_fetch_add_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_relaxed(i, v); } #define atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed @@ -958,7 +958,7 @@ atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_sub(i, v); } #define atomic64_sub atomic64_sub @@ -967,7 +967,7 @@ atomic64_sub(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return(i, v); } #define atomic64_sub_return atomic64_sub_return @@ -977,7 +977,7 @@ atomic64_sub_return(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_acquire(i, v); } #define atomic64_sub_return_acquire atomic64_sub_return_acquire @@ -987,7 +987,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_release(i, v); } #define atomic64_sub_return_release atomic64_sub_return_release @@ -997,7 +997,7 @@ atomic64_sub_return_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_relaxed(i, v); } #define atomic64_sub_return_relaxed atomic64_sub_return_relaxed @@ -1007,7 +1007,7 @@ atomic64_sub_return_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub(i, v); } #define atomic64_fetch_sub atomic64_fetch_sub @@ -1017,7 +1017,7 @@ atomic64_fetch_sub(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + 
instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_acquire(i, v); } #define atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire @@ -1027,7 +1027,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_release(i, v); } #define atomic64_fetch_sub_release atomic64_fetch_sub_release @@ -1037,7 +1037,7 @@ atomic64_fetch_sub_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_relaxed(i, v); } #define atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed @@ -1047,7 +1047,7 @@ atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) static __always_inline void atomic64_inc(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_inc(v); } #define atomic64_inc atomic64_inc @@ -1057,7 +1057,7 @@ atomic64_inc(atomic64_t *v) static __always_inline s64 atomic64_inc_return(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return(v); } #define atomic64_inc_return atomic64_inc_return @@ -1067,7 +1067,7 @@ atomic64_inc_return(atomic64_t *v) static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_acquire(v); } #define atomic64_inc_return_acquire atomic64_inc_return_acquire @@ -1077,7 +1077,7 @@ atomic64_inc_return_acquire(atomic64_t *v) static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_release(v); } #define atomic64_inc_return_release atomic64_inc_return_release @@ -1087,7 +1087,7 @@ atomic64_inc_return_release(atomic64_t *v) static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_relaxed(v); } #define atomic64_inc_return_relaxed atomic64_inc_return_relaxed @@ -1097,7 +1097,7 @@ atomic64_inc_return_relaxed(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc(v); } #define atomic64_fetch_inc atomic64_fetch_inc @@ -1107,7 +1107,7 @@ atomic64_fetch_inc(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_acquire(v); } #define atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire @@ -1117,7 +1117,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_release(v); } #define atomic64_fetch_inc_release atomic64_fetch_inc_release @@ -1127,7 +1127,7 @@ atomic64_fetch_inc_release(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + 
instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_relaxed(v); } #define atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed @@ -1137,7 +1137,7 @@ atomic64_fetch_inc_relaxed(atomic64_t *v) static __always_inline void atomic64_dec(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_dec(v); } #define atomic64_dec atomic64_dec @@ -1147,7 +1147,7 @@ atomic64_dec(atomic64_t *v) static __always_inline s64 atomic64_dec_return(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return(v); } #define atomic64_dec_return atomic64_dec_return @@ -1157,7 +1157,7 @@ atomic64_dec_return(atomic64_t *v) static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_acquire(v); } #define atomic64_dec_return_acquire atomic64_dec_return_acquire @@ -1167,7 +1167,7 @@ atomic64_dec_return_acquire(atomic64_t *v) static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_release(v); } #define atomic64_dec_return_release atomic64_dec_return_release @@ -1177,7 +1177,7 @@ atomic64_dec_return_release(atomic64_t *v) static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_relaxed(v); } #define atomic64_dec_return_relaxed atomic64_dec_return_relaxed @@ -1187,7 +1187,7 @@ atomic64_dec_return_relaxed(atomic64_t *v) static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec(v); } #define atomic64_fetch_dec atomic64_fetch_dec @@ -1197,7 +1197,7 @@ atomic64_fetch_dec(atomic64_t *v) static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_acquire(v); } #define atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire @@ -1207,7 +1207,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_release(v); } #define atomic64_fetch_dec_release atomic64_fetch_dec_release @@ -1217,7 +1217,7 @@ atomic64_fetch_dec_release(atomic64_t *v) static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_relaxed(v); } #define atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed @@ -1226,7 +1226,7 @@ atomic64_fetch_dec_relaxed(atomic64_t *v) static __always_inline void atomic64_and(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_and(i, v); } #define atomic64_and atomic64_and @@ -1235,7 +1235,7 @@ atomic64_and(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and(i, v); } #define 
atomic64_fetch_and atomic64_fetch_and @@ -1245,7 +1245,7 @@ atomic64_fetch_and(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_acquire(i, v); } #define atomic64_fetch_and_acquire atomic64_fetch_and_acquire @@ -1255,7 +1255,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_release(i, v); } #define atomic64_fetch_and_release atomic64_fetch_and_release @@ -1265,7 +1265,7 @@ atomic64_fetch_and_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_relaxed(i, v); } #define atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed @@ -1275,7 +1275,7 @@ atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_andnot(i, v); } #define atomic64_andnot atomic64_andnot @@ -1285,7 +1285,7 @@ atomic64_andnot(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot(i, v); } #define atomic64_fetch_andnot atomic64_fetch_andnot @@ -1295,7 +1295,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_acquire(i, v); } #define atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire @@ -1305,7 +1305,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_release(i, v); } #define atomic64_fetch_andnot_release atomic64_fetch_andnot_release @@ -1315,7 +1315,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_relaxed(i, v); } #define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed @@ -1324,7 +1324,7 @@ atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) static __always_inline void atomic64_or(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_or(i, v); } #define atomic64_or atomic64_or @@ -1333,7 +1333,7 @@ atomic64_or(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or(i, v); } #define atomic64_fetch_or atomic64_fetch_or @@ -1343,7 +1343,7 @@ atomic64_fetch_or(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + 
instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_acquire(i, v); } #define atomic64_fetch_or_acquire atomic64_fetch_or_acquire @@ -1353,7 +1353,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_release(i, v); } #define atomic64_fetch_or_release atomic64_fetch_or_release @@ -1363,7 +1363,7 @@ atomic64_fetch_or_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_relaxed(i, v); } #define atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed @@ -1372,7 +1372,7 @@ atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_xor(i, v); } #define atomic64_xor atomic64_xor @@ -1381,7 +1381,7 @@ atomic64_xor(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor(i, v); } #define atomic64_fetch_xor atomic64_fetch_xor @@ -1391,7 +1391,7 @@ atomic64_fetch_xor(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_acquire(i, v); } #define atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire @@ -1401,7 +1401,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_release(i, v); } #define atomic64_fetch_xor_release atomic64_fetch_xor_release @@ -1411,7 +1411,7 @@ atomic64_fetch_xor_release(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_relaxed(i, v); } #define atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed @@ -1421,7 +1421,7 @@ atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg(v, i); } #define atomic64_xchg atomic64_xchg @@ -1431,7 +1431,7 @@ atomic64_xchg(atomic64_t *v, s64 i) static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_acquire(v, i); } #define atomic64_xchg_acquire atomic64_xchg_acquire @@ -1441,7 +1441,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i) static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_release(v, i); } #define atomic64_xchg_release atomic64_xchg_release @@ -1451,7 +1451,7 @@ atomic64_xchg_release(atomic64_t *v, s64 i) static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 i) { - 
instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_relaxed(v, i); } #define atomic64_xchg_relaxed atomic64_xchg_relaxed @@ -1461,7 +1461,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 i) static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg(v, old, new); } #define atomic64_cmpxchg atomic64_cmpxchg @@ -1471,7 +1471,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_acquire(v, old, new); } #define atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire @@ -1481,7 +1481,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_release(v, old, new); } #define atomic64_cmpxchg_release atomic64_cmpxchg_release @@ -1491,7 +1491,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_relaxed(v, old, new); } #define atomic64_cmpxchg_relaxed atomic64_cmpxchg_relaxed @@ -1501,8 +1501,8 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg(v, old, new); } #define atomic64_try_cmpxchg atomic64_try_cmpxchg @@ -1512,8 +1512,8 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_acquire(v, old, new); } #define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire @@ -1523,8 +1523,8 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_release(v, old, new); } #define atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release @@ -1534,8 +1534,8 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { - instrument_atomic_write(v, sizeof(*v)); - instrument_atomic_write(old, sizeof(*old)); + instrument_atomic_read_write(v, sizeof(*v)); + instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); } #define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed @@ -1545,7 +1545,7 @@ 
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_and_test(i, v); } #define atomic64_sub_and_test atomic64_sub_and_test @@ -1555,7 +1555,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_and_test(v); } #define atomic64_dec_and_test atomic64_dec_and_test @@ -1565,7 +1565,7 @@ atomic64_dec_and_test(atomic64_t *v) static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_and_test(v); } #define atomic64_inc_and_test atomic64_inc_and_test @@ -1575,7 +1575,7 @@ atomic64_inc_and_test(atomic64_t *v) static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_negative(i, v); } #define atomic64_add_negative atomic64_add_negative @@ -1585,7 +1585,7 @@ atomic64_add_negative(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_unless(v, a, u); } #define atomic64_fetch_add_unless atomic64_fetch_add_unless @@ -1595,7 +1595,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_unless(v, a, u); } #define atomic64_add_unless atomic64_add_unless @@ -1605,7 +1605,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_not_zero(v); } #define atomic64_inc_not_zero atomic64_inc_not_zero @@ -1615,7 +1615,7 @@ atomic64_inc_not_zero(atomic64_t *v) static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_unless_negative(v); } #define atomic64_inc_unless_negative atomic64_inc_unless_negative @@ -1625,7 +1625,7 @@ atomic64_inc_unless_negative(atomic64_t *v) static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_unless_positive(v); } #define atomic64_dec_unless_positive atomic64_dec_unless_positive @@ -1635,7 +1635,7 @@ atomic64_dec_unless_positive(atomic64_t *v) static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { - instrument_atomic_write(v, sizeof(*v)); + instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_if_positive(v); } #define atomic64_dec_if_positive atomic64_dec_if_positive @@ -1786,4 +1786,4 @@ atomic64_dec_if_positive(atomic64_t *v) }) #endif /* _ASM_GENERIC_ATOMIC_INSTRUMENTED_H */ -// 89bf97f3a7509b740845e51ddf31055b48a81f40 +// 9d5e6a315fb1335d02f0ccd3655a91c3dafcc63e diff --git a/scripts/atomic/gen-atomic-instrumented.sh 
b/scripts/atomic/gen-atomic-instrumented.sh index 6afadf73da17..2b7fec7e6abc 100755 --- a/scripts/atomic/gen-atomic-instrumented.sh +++ b/scripts/atomic/gen-atomic-instrumented.sh @@ -5,9 +5,10 @@ ATOMICDIR=$(dirname $0) . ${ATOMICDIR}/atomic-tbl.sh -#gen_param_check(arg) +#gen_param_check(meta, arg) gen_param_check() { + local meta="$1"; shift local arg="$1"; shift local type="${arg%%:*}" local name="$(gen_param_name "${arg}")" @@ -17,17 +18,25 @@ gen_param_check() i) return;; esac - # We don't write to constant parameters - [ ${type#c} != ${type} ] && rw="read" + if [ ${type#c} != ${type} ]; then + # We don't write to constant parameters. + rw="read" + elif [ "${meta}" != "s" ]; then + # An atomic RMW: if this parameter is not a constant, and this atomic is + # not just a 's'tore, this parameter is both read from and written to. + rw="read_write" + fi printf "\tinstrument_atomic_${rw}(${name}, sizeof(*${name}));\n" } -#gen_param_check(arg...) +#gen_params_checks(meta, arg...) gen_params_checks() { + local meta="$1"; shift + while [ "$#" -gt 0 ]; do - gen_param_check "$1" + gen_param_check "$meta" "$1" shift; done } @@ -77,7 +86,7 @@ gen_proto_order_variant() local ret="$(gen_ret_type "${meta}" "${int}")" local params="$(gen_params "${int}" "${atomic}" "$@")" - local checks="$(gen_params_checks "$@")" + local checks="$(gen_params_checks "${meta}" "$@")" local args="$(gen_args "$@")" local retstmt="$(gen_ret_stmt "${meta}")" From 69b2c81bc894606670204f0ae08f406dbcce836d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:19 +0200 Subject: [PATCH 12/67] kcsan: Simplify debugfs counter to name mapping Simplify counter ID to name mapping by using an array with designated inits. This way, we can turn a run-time BUG() into a compile-time static assertion failure if a counter name is missing. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 023e49c58d55..3a9566addeff 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -19,6 +19,18 @@ * Statistics counters. */ static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", + [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", + [KCSAN_COUNTER_DATA_RACES] = "data_races", + [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures", + [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity", + [KCSAN_COUNTER_REPORT_RACES] = "report_races", + [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin", + [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses", + [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives", +}; +static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT); /* * Addresses for filtering functions from reporting. 
This list can be used as a @@ -39,24 +51,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -static const char *counter_to_name(enum kcsan_counter_id id) -{ - switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: return "data_races"; - case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; - case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; - case KCSAN_COUNTER_COUNT: - BUG(); - } - return NULL; -} - void kcsan_counter_inc(enum kcsan_counter_id id) { atomic_long_inc(&counters[id]); @@ -271,8 +265,7 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_to_name(i), - atomic_long_read(&counters[i])); + seq_printf(file, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); From a4e74fa5f0d3e2a11f2d0deb522681d219a81426 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:20 +0200 Subject: [PATCH 13/67] kcsan: Simplify constant string handling Simplify checking prefixes and length calculation of constant strings. For the former, the kernel provides str_has_prefix(), and the latter we should just use strlen("..") because GCC and Clang have optimizations that optimize these into constants. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 8 ++++---- kernel/kcsan/report.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 3a9566addeff..116bdd8f050c 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -300,16 +300,16 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o WRITE_ONCE(kcsan_enabled, true); } else if (!strcmp(arg, "off")) { WRITE_ONCE(kcsan_enabled, false); - } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + } else if (str_has_prefix(arg, "microbench=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { + } else if (str_has_prefix(arg, "test=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("test=")], 0, &iters)) return -EINVAL; test_thread(iters); } else if (!strcmp(arg, "whitelist")) { diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 3e83a69239fa..bf1d59449805 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -279,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries cur = strnstr(buf, "kcsan_", len); if (cur) { - cur += sizeof("kcsan_") - 1; - if (strncmp(cur, "test", sizeof("test") - 1)) + cur += strlen("kcsan_"); + if (!str_has_prefix(cur, "test")) continue; /* KCSAN runtime function. */ /* KCSAN related test. 
*/ } From 4700ccdf18fa3002d66769b69cf715cc58beea37 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:21 +0200 Subject: [PATCH 14/67] kcsan: Remove debugfs test command Remove the debugfs test command, as it is no longer needed now that we have the KUnit+Torture based kcsan-test module. This is to avoid confusion around how KCSAN should be tested, as only the kcsan-test module is maintained. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 66 ------------------------------------------ 1 file changed, 66 deletions(-) diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 116bdd8f050c..de1da1b01aa4 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -98,66 +98,6 @@ static noinline void microbenchmark(unsigned long iters) current->kcsan_ctx = ctx_save; } -/* - * Simple test to create conflicting accesses. Write 'test=' to KCSAN's - * debugfs file from multiple tasks to generate real conflicts and show reports. - */ -static long test_dummy; -static long test_flags; -static long test_scoped; -static noinline void test_thread(unsigned long iters) -{ - const long CHANGE_BITS = 0xff00ff00ff00ff00L; - const struct kcsan_ctx ctx_save = current->kcsan_ctx; - cycles_t cycles; - - /* We may have been called from an atomic region; reset context. */ - memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); - - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", - &test_dummy, &test_flags, &test_scoped); - - cycles = get_cycles(); - while (iters--) { - /* These all should generate reports. */ - __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - ASSERT_EXCLUSIVE_WRITER(test_dummy); - ASSERT_EXCLUSIVE_ACCESS(test_dummy); - - ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - /* not actually instrumented */ - WRITE_ONCE(test_dummy, iters); /* to observe value-change */ - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); - - test_flags ^= CHANGE_BITS; /* generate value-change */ - __kcsan_check_write(&test_flags, sizeof(test_flags)); - - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - { - /* Should generate reports anywhere in this block. */ - ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); - ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); - BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); - /* Unrelated accesses. 
*/ - __kcsan_check_access(&cycles, sizeof(cycles), 0); - __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); - } - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - } - cycles = get_cycles() - cycles; - - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); - - /* restore context */ - current->kcsan_ctx = ctx_save; -} - static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -306,12 +246,6 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (str_has_prefix(arg, "test=")) { - unsigned long iters; - - if (kstrtoul(&arg[strlen("test=")], 0, &iters)) - return -EINVAL; - test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { From 2778793072c31e3eb33842f3bd7da82dfc7efc6b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:22 +0200 Subject: [PATCH 15/67] kcsan: Show message if enabled early Show a message in the kernel log if KCSAN was enabled early. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 99e5044bc942..b176400a2735 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -463,7 +465,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { kcsan_disable_current(); - pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", is_write ? "write" : "read", size, ptr, watchpoint_slot((unsigned long)ptr), encode_watchpoint((unsigned long)ptr, size, is_write)); @@ -623,8 +625,10 @@ void __init kcsan_init(void) * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (kcsan_early_enable) + if (kcsan_early_enable) { + pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); + } } /* === Exported interface =================================================== */ From 178a1877d782c034f466edd80e30a107af5469df Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:23 +0200 Subject: [PATCH 16/67] kcsan: Use pr_fmt for consistency Use the same pr_fmt throughout for consistency. [ The only exception is report.c, where the format must be kept precisely as-is. ] Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/debugfs.c | 8 +++++--- kernel/kcsan/selftest.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index de1da1b01aa4..6c4914fa2fad 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -80,7 +82,7 @@ static noinline void microbenchmark(unsigned long iters) */ WRITE_ONCE(kcsan_enabled, false); - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("%s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { @@ -91,7 +93,7 @@ static noinline void microbenchmark(unsigned long iters) } cycles = get_cycles() - cycles; - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + pr_info("%s end | cycles: %llu\n", __func__, cycles); WRITE_ONCE(kcsan_enabled, was_enabled); /* restore context */ @@ -154,7 +156,7 @@ static ssize_t insert_report_filterlist(const char *func) ssize_t ret = 0; if (!addr) { - pr_err("KCSAN: could not find function: '%s'\n", func); + pr_err("could not find function: '%s'\n", func); return -ENOENT; } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d26a052d3383..d98bc208d06d 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -116,16 +118,16 @@ static int __init kcsan_selftest(void) if (do_test()) \ ++passed; \ else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ + pr_err("selftest: " #do_test " failed"); \ } while (0) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) - panic("KCSAN selftests failed"); + panic("selftests failed"); return 0; } postcore_initcall(kcsan_selftest); From 2e986b81f698e73c95e6456183f27b861f47bb87 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 10 Aug 2020 10:06:25 +0200 Subject: [PATCH 17/67] kcsan: Optimize debugfs stats counters Remove kcsan_counter_inc/dec() functions, as they perform no other logic, and are no longer needed. This avoids several calls in kcsan_setup_watchpoint() and kcsan_found_watchpoint(), as well as lets the compiler warn us about potential out-of-bounds accesses as the array's size is known at all usage sites at compile-time. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 22 +++++++++++----------- kernel/kcsan/debugfs.c | 21 +++++---------------- kernel/kcsan/kcsan.h | 12 ++++++------ kernel/kcsan/report.c | 2 +- 4 files changed, 23 insertions(+), 34 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index b176400a2735..8a1ff605ff2d 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -367,13 +367,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * already removed the watchpoint, or another thread consumed * the watchpoint before this thread. 
*/ - kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } if ((type & KCSAN_ACCESS_ASSERT) != 0) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); user_access_restore(flags); } @@ -414,7 +414,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; if (!check_encodable((unsigned long)ptr, size)) { - kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]); goto out; } @@ -434,12 +434,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ - kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]); goto out_unlock; } - kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); - kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); /* * Read the current value, to later check and infer a race if the data @@ -541,16 +541,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * increment this counter. */ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ - kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]); if (is_assert) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, @@ -563,7 +563,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * reused after this point. */ remove_watchpoint(watchpoint); - kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 6c4914fa2fad..3c8093a371b1 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -17,10 +17,7 @@ #include "kcsan.h" -/* - * Statistics counters. - */ -static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; static const char *const counter_names[] = { [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", @@ -53,16 +50,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -void kcsan_counter_inc(enum kcsan_counter_id id) -{ - atomic_long_inc(&counters[id]); -} - -void kcsan_counter_dec(enum kcsan_counter_id id) -{ - atomic_long_dec(&counters[id]); -} - /* * The microbenchmark allows benchmarking KCSAN core runtime only. 
To run * multiple threads, pipe 'microbench=' from multiple tasks into the @@ -206,8 +193,10 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); - for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) { + seq_printf(file, "%s: %ld\n", counter_names[i], + atomic_long_read(&kcsan_counters[i])); + } /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 29480010dc30..8d4bf3431b3c 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -8,6 +8,7 @@ #ifndef _KERNEL_KCSAN_KCSAN_H #define _KERNEL_KCSAN_KCSAN_H +#include #include #include @@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task); */ void kcsan_debugfs_init(void); +/* + * Statistics counters displayed via debugfs; should only be modified in + * slow-paths. + */ enum kcsan_counter_id { /* * Number of watchpoints currently in use. @@ -86,12 +91,7 @@ enum kcsan_counter_id { KCSAN_COUNTER_COUNT, /* number of counters */ }; - -/* - * Increment/decrement counter with given id; avoid calling these in fast-path. - */ -extern void kcsan_counter_inc(enum kcsan_counter_id id); -extern void kcsan_counter_dec(enum kcsan_counter_id id); +extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; /* * Returns true if data races in the function symbol that maps to func_addr diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index bf1d59449805..d3bf87e6007c 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -559,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags, * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]); goto discard; } From 068df05363b79f54241bd6bd612055b8c16c5964 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 13 Aug 2020 18:38:59 +0200 Subject: [PATCH 18/67] bitops, kcsan: Partially revert instrumentation for non-atomic bitops Previous to the change to distinguish read-write accesses, when CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y is set, KCSAN would consider the non-atomic bitops as atomic. We want to partially revert to this behaviour, but with one important distinction: report racing modifications, since lost bits due to non-atomicity are certainly possible. Given the operations here only modify a single bit, assuming non-atomicity of the writer is sufficient may be reasonable for certain usage (and follows the permissible nature of the "assume plain writes atomic" rule). In other words: 1. We want non-atomic read-modify-write races to be reported; this is accomplished by kcsan_check_read(), where any concurrent write (atomic or not) will generate a report. 2. We do not want to report races with marked readers, but -do- want to report races with unmarked readers; this is accomplished by the instrument_write() ("assume atomic write" with Kconfig option set). With the above rules, when KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected, it is hoped that KCSAN's reporting behaviour is better aligned with current expected permissible usage for non-atomic bitops. 
Note that, a side-effect of not telling KCSAN that the accesses are read-writes, is that this information is not displayed in the access summary in the report. It is, however, visible in inline-expanded stack traces. For now, it does not make sense to introduce yet another special case to KCSAN's runtime, only to cater to the case here. Cc: Dmitry Vyukov Cc: Paul E. McKenney Cc: Will Deacon Cc: Arnd Bergmann Cc: Daniel Axtens Cc: Michael Ellerman Cc: Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- .../bitops/instrumented-non-atomic.h | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h index f86234c7c10c..37363d570b9b 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -58,6 +58,30 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) arch___change_bit(nr, addr); } +static inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr) +{ + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) { + /* + * We treat non-atomic read-write bitops a little more special. + * Given the operations here only modify a single bit, assuming + * non-atomicity of the writer is sufficient may be reasonable + * for certain usage (and follows the permissible nature of the + * assume-plain-writes-atomic rule): + * 1. report read-modify-write races -> check read; + * 2. do not report races with marked readers, but do report + * races with unmarked readers -> check "atomic" write. + */ + kcsan_check_read(addr + BIT_WORD(nr), sizeof(long)); + /* + * Use generic write instrumentation, in case other sanitizers + * or tools are enabled alongside KCSAN. + */ + instrument_write(addr + BIT_WORD(nr), sizeof(long)); + } else { + instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); + } +} + /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set @@ -68,7 +92,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) { - instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); + __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } @@ -82,7 +106,7 @@ static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) { - instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); + __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } @@ -96,7 +120,7 @@ static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) { - instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); + __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } From 6eb6d05958f3208b3dd2b5311f1a6e68abdbe5d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jul 2020 20:12:32 +0200 Subject: [PATCH 19/67] seqlock,tags: Add support for SEQCOUNT_LOCKTYPE() Such that we might easily find seqcount_LOCKTYPE_t and seqcount_LOCKTYPE_init(). 
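A minimal usage sketch, for context only (the struct and variable names below are hypothetical), of the macro-generated names these tags are meant to resolve:

	struct data {
		raw_spinlock_t		lock;
		seqcount_raw_spinlock_t	seq;	/* type generated by SEQCOUNT_LOCKTYPE() */
	};

	/* initializer likewise generated by SEQCOUNT_LOCKTYPE() */
	seqcount_raw_spinlock_init(&d->seq, &d->lock);

Since neither name is spelled out anywhere in the source, ctags/etags cannot resolve them without the added regexes.
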
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200729161938.GB2678@hirez.programming.kicks-ass.net --- scripts/tags.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/tags.sh b/scripts/tags.sh index 32d3f53af10b..fc5b53259a16 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -201,6 +201,8 @@ regex_c=( '/\ Date: Wed, 29 Jul 2020 13:00:57 +0200 Subject: [PATCH 20/67] locking/refcount: Provide __refcount API to obtain the old value David requested means to obtain the old/previous value from the refcount API for tracing purposes. Duplicate (most of) the API as __refcount*() with an additional 'int *' argument into which, if !NULL, the old value will be stored. Requested-by: David Howells Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200729111120.GA2638@hirez.programming.kicks-ass.net --- include/linux/refcount.h | 65 +++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0e3ee25eb156..7fabb1af18e0 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -165,7 +165,7 @@ static inline unsigned int refcount_read(const refcount_t *r) * * Return: false if the passed refcount is 0, true otherwise */ -static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -174,12 +174,20 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); + if (oldp) + *oldp = old; + if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + return __refcount_add_not_zero(i, r, NULL); +} + /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount @@ -196,16 +204,24 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ -static inline void refcount_add(int i, refcount_t *r) +static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); + if (oldp) + *oldp = old; + if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +static inline void refcount_add(int i, refcount_t *r) +{ + __refcount_add(i, r, NULL); +} + /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment @@ -219,9 +235,14 @@ static inline void refcount_add(int i, refcount_t *r) * * Return: true if the increment was successful, false otherwise */ +static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero(1, r, oldp); +} + static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return refcount_add_not_zero(1, r); + return __refcount_inc_not_zero(r, NULL); } /** @@ -236,9 +257,14 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. 
*/ +static inline void __refcount_inc(refcount_t *r, int *oldp) +{ + __refcount_add(1, r, oldp); +} + static inline void refcount_inc(refcount_t *r) { - refcount_add(1, r); + __refcount_inc(r, NULL); } /** @@ -261,10 +287,13 @@ static inline void refcount_inc(refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); + if (oldp) + *oldp = old; + if (old == i) { smp_acquire__after_ctrl_dep(); return true; @@ -276,6 +305,11 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) return false; } +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +{ + return __refcount_sub_and_test(i, r, NULL); +} + /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount @@ -289,9 +323,14 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ +static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) +{ + return __refcount_sub_and_test(1, r, oldp); +} + static inline __must_check bool refcount_dec_and_test(refcount_t *r) { - return refcount_sub_and_test(1, r); + return __refcount_dec_and_test(r, NULL); } /** @@ -304,10 +343,20 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r) * Provides release memory ordering, such that prior loads and stores are done * before. */ +static inline void __refcount_dec(refcount_t *r, int *oldp) +{ + int old = atomic_fetch_sub_release(1, &r->refs); + + if (oldp) + *oldp = old; + + if (unlikely(old <= 1)) + refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); +} + static inline void refcount_dec(refcount_t *r) { - if (unlikely(atomic_fetch_sub_release(1, &r->refs) <= 1)) - refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); + __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); From a28e884b966e713da29caefbb347efea77367d22 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 Aug 2020 17:02:00 -0700 Subject: [PATCH 21/67] seqlock: Fix multiple kernel-doc warnings Fix kernel-doc warnings in . ../include/linux/seqlock.h:152: warning: Incorrect use of kernel-doc format: * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t ../include/linux/seqlock.h:164: warning: Incorrect use of kernel-doc format: * SEQCOUNT_LOCKTYPE() - Instantiate seqcount_LOCKNAME_t and helpers ../include/linux/seqlock.h:229: warning: Function parameter or member 'seq_name' not described in 'SEQCOUNT_LOCKTYPE_ZERO' ../include/linux/seqlock.h:229: warning: Function parameter or member 'assoc_lock' not described in 'SEQCOUNT_LOCKTYPE_ZERO' ../include/linux/seqlock.h:229: warning: Excess function parameter 'name' description in 'SEQCOUNT_LOCKTYPE_ZERO' ../include/linux/seqlock.h:229: warning: Excess function parameter 'lock' description in 'SEQCOUNT_LOCKTYPE_ZERO' ../include/linux/seqlock.h:695: warning: duplicate section name 'NOTE' Demote kernel-doc notation for the macros "seqcount_LOCKNAME_init()" and "SEQCOUNT_LOCKTYPE()"; scripts/kernel-doc does not handle them correctly. Rename function parameters in SEQCNT_LOCKNAME_ZERO() documentation to match the macro's argument names. Change the macro name in the documentation to SEQCOUNT_LOCKTYPE_ZERO() to match the macro's name. 
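Stepping back to the __refcount_*() helpers added by the refcount patch above, a hedged usage sketch (struct my_obj and the trace_obj_*() tracepoints are illustrative, not existing kernel API):

#include <linux/refcount.h>
#include <linux/slab.h>

struct my_obj {
	refcount_t ref;
	/* ... */
};

static void my_obj_get(struct my_obj *obj)
{
	int old;

	__refcount_inc(&obj->ref, &old);
	trace_obj_get(obj, old + 1);	/* hypothetical tracepoint: report new count */
}

static void my_obj_put(struct my_obj *obj)
{
	int old;
	bool dead = __refcount_dec_and_test(&obj->ref, &old);

	trace_obj_put(obj, old - 1);	/* hypothetical tracepoint: report new count */
	if (dead)
		kfree(obj);
}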
For raw_write_seqcount_latch(), rename the second NOTE: to NOTE2: to prevent a kernel-doc warning. However, the generated output is not quite as nice as it could be for this. Fix a typo: s/LOCKTYPR/LOCKTYPE/ Fixes: 0efc94c5d15c ("seqcount: Compress SEQCNT_LOCKNAME_ZERO()") Fixes: e4e9ab3f9f91 ("seqlock: Fold seqcount_LOCKNAME_init() definition") Fixes: a8772dccb2ec ("seqlock: Fold seqcount_LOCKNAME_t definition") Reported-by: kernel test robot Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200817000200.20993-1-rdunlap@infradead.org --- include/linux/seqlock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 962d9768945f..300cbf312546 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -138,7 +138,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) #endif /** - * typedef seqcount_LOCKNAME_t - sequence counter with LOCKTYPR associated + * typedef seqcount_LOCKNAME_t - sequence counter with LOCKTYPE associated * @seqcount: The real sequence counter * @lock: Pointer to the associated spinlock * @@ -148,7 +148,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * that the write side critical section is properly serialized. */ -/** +/* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKTYPE @@ -217,7 +217,7 @@ SEQCOUNT_LOCKTYPE(rwlock_t, rwlock, false, s->lock) SEQCOUNT_LOCKTYPE(struct mutex, mutex, true, s->lock) SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) -/** +/* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKTYPE @@ -688,7 +688,7 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) * to miss an entire modification sequence, once it resumes it might * observe the new entry. * - * NOTE: + * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. From 92b4e9f11a636d1723cc0866bf8b9111b1e24339 Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Sun, 26 Jul 2020 20:54:40 +0200 Subject: [PATCH 22/67] Documentation/locking/locktypes: Fix local_locks documentation Fix issues with local_locks documentation: - fix function names, local_lock.h has local_unlock_irqrestore(), not local_lock_irqrestore() - fix mapping table, local_unlock_irqrestore() maps to local_irq_restore(), not _save() Signed-off-by: Marta Rybczynska Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/CAApg2=SKxQ3Sqwj6TZnV-0x0cKLXFKDaPvXT4N15MPDMKq724g@mail.gmail.com --- Documentation/locking/locktypes.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/locking/locktypes.rst b/Documentation/locking/locktypes.rst index 4cefed8048ca..ddada4a53749 100644 --- a/Documentation/locking/locktypes.rst +++ b/Documentation/locking/locktypes.rst @@ -164,14 +164,14 @@ by disabling preemption or interrupts. 
On non-PREEMPT_RT kernels local_lock operations map to the preemption and interrupt disabling and enabling primitives: - =========================== ====================== - local_lock(&llock) preempt_disable() - local_unlock(&llock) preempt_enable() - local_lock_irq(&llock) local_irq_disable() - local_unlock_irq(&llock) local_irq_enable() - local_lock_save(&llock) local_irq_save() - local_lock_restore(&llock) local_irq_save() - =========================== ====================== + =============================== ====================== + local_lock(&llock) preempt_disable() + local_unlock(&llock) preempt_enable() + local_lock_irq(&llock) local_irq_disable() + local_unlock_irq(&llock) local_irq_enable() + local_lock_irqsave(&llock) local_irq_save() + local_unlock_irqrestore(&llock) local_irq_restore() + =============================== ====================== The named scope of local_lock has two advantages over the regular primitives: @@ -353,14 +353,14 @@ protection scope. So the following substitution is wrong:: { local_irq_save(flags); -> local_lock_irqsave(&local_lock_1, flags); func3(); - local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_1, flags); + local_irq_restore(flags); -> local_unlock_irqrestore(&local_lock_1, flags); } func2() { local_irq_save(flags); -> local_lock_irqsave(&local_lock_2, flags); func3(); - local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_2, flags); + local_irq_restore(flags); -> local_unlock_irqrestore(&local_lock_2, flags); } func3() @@ -379,14 +379,14 @@ PREEMPT_RT-specific semantics of spinlock_t. The correct substitution is:: { local_irq_save(flags); -> local_lock_irqsave(&local_lock, flags); func3(); - local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags); + local_irq_restore(flags); -> local_unlock_irqrestore(&local_lock, flags); } func2() { local_irq_save(flags); -> local_lock_irqsave(&local_lock, flags); func3(); - local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags); + local_irq_restore(flags); -> local_unlock_irqrestore(&local_lock, flags); } func3() From e918188611f073063415f40fae568fa4d86d9044 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:20 +0800 Subject: [PATCH 23/67] locking: More accurate annotations for read_lock() On the archs using QUEUED_RWLOCKS, read_lock() is not always a recursive read lock, actually it's only recursive if in_interrupt() is true. So change the annotation accordingly to catch more deadlocks. Note we used to treat read_lock() as pure recursive read locks in lib/locking-seftest.c, and this is useful, especially for the lockdep development selftest, so we keep this via a variable to force switching lock annotation for read_lock(). 
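A hedged sketch of the pattern the stricter annotation is meant to catch on QUEUED_RWLOCKS kernels ('lock' and the task functions are illustrative):

static DEFINE_RWLOCK(lock);

void task_a(void)			/* process context: read_lock() is not recursive */
{
	read_lock(&lock);
	/* if task_b's write_lock() queues as a waiter here ... */
	read_lock(&lock);		/* ... this blocks behind the queued writer: deadlock */
	read_unlock(&lock);
	read_unlock(&lock);
}

void task_b(void)
{
	write_lock(&lock);
	write_unlock(&lock);
}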
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-2-boqun.feng@gmail.com --- include/linux/lockdep.h | 23 ++++++++++++++++++++++- kernel/locking/lockdep.c | 14 ++++++++++++++ lib/locking-selftest.c | 11 +++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6a584b3e5c74..7cae5ea00d59 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -469,6 +469,20 @@ static inline void print_irqtrace_events(struct task_struct *curr) } #endif +/* Variable used to make lockdep treat read_lock() as recursive in selftests */ +#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS +extern unsigned int force_read_lock_recursive; +#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ +#define force_read_lock_recursive 0 +#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ + +#ifdef CONFIG_LOCKDEP +extern bool read_lock_is_recursive(void); +#else /* CONFIG_LOCKDEP */ +/* If !LOCKDEP, the value is meaningless */ +#define read_lock_is_recursive() 0 +#endif + /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels @@ -490,7 +504,14 @@ static inline void print_irqtrace_events(struct task_struct *curr) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) -#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) \ +do { \ + if (read_lock_is_recursive()) \ + lock_acquire_shared_recursive(l, s, t, NULL, i); \ + else \ + lock_acquire_shared(l, s, t, NULL, i); \ +} while (0) + #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 54b74fabf40c..77cd9e6520c4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4967,6 +4967,20 @@ static bool lockdep_nmi(void) return true; } +/* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. 
+ */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 14f44f59e733..caadc4dd3368 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -28,6 +28,7 @@ * Change this to 1 if you want to see the failure printouts: */ static unsigned int debug_locks_verbose; +unsigned int force_read_lock_recursive; static DEFINE_WD_CLASS(ww_lockdep); @@ -1978,6 +1979,11 @@ void locking_selftest(void) return; } + /* + * treats read_lock() as recursive read locks for testing purpose + */ + force_read_lock_recursive = 1; + /* * Run the testsuite: */ @@ -2073,6 +2079,11 @@ void locking_selftest(void) ww_tests(); + force_read_lock_recursive = 0; + /* + * queued_read_lock() specific test cases can be put here + */ + if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); debug_locks = 0; From 224ec489d3cdb0af6794e257eeee39d98dc9c5b2 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:21 +0800 Subject: [PATCH 24/67] lockdep/Documention: Recursive read lock detection reasoning This patch add the documentation piece for the reasoning of deadlock detection related to recursive read lock. The following sections are added: * Explain what is a recursive read lock, and what deadlock cases they could introduce. * Introduce the notations for different types of dependencies, and the definition of strong paths. * Proof for a closed strong path is both sufficient and necessary for deadlock detections with recursive read locks involved. The proof could also explain why we call the path "strong" Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-3-boqun.feng@gmail.com --- Documentation/locking/lockdep-design.rst | 258 +++++++++++++++++++++++ 1 file changed, 258 insertions(+) diff --git a/Documentation/locking/lockdep-design.rst b/Documentation/locking/lockdep-design.rst index 23fcbc4d3fc0..cec03bd1294a 100644 --- a/Documentation/locking/lockdep-design.rst +++ b/Documentation/locking/lockdep-design.rst @@ -392,3 +392,261 @@ Run the command and save the output, then compare against the output from a later run of this command to identify the leakers. This same output can also help you find situations where runtime lock initialization has been omitted. + +Recursive read locks: +--------------------- +The whole of the rest document tries to prove a certain type of cycle is equivalent +to deadlock possibility. + +There are three types of lockers: writers (i.e. exclusive lockers, like +spin_lock() or write_lock()), non-recursive readers (i.e. shared lockers, like +down_read()) and recursive readers (recursive shared lockers, like rcu_read_lock()). +And we use the following notations of those lockers in the rest of the document: + + W or E: stands for writers (exclusive lockers). + r: stands for non-recursive readers. + R: stands for recursive readers. + S: stands for all readers (non-recursive + recursive), as both are shared lockers. + N: stands for writers and non-recursive readers, as both are not recursive. + +Obviously, N is "r or W" and S is "r or R". 
+ +Recursive readers, as their name indicates, are the lockers allowed to acquire +even inside the critical section of another reader of the same lock instance, +in other words, allowing nested read-side critical sections of one lock instance. + +While non-recursive readers will cause a self deadlock if trying to acquire inside +the critical section of another reader of the same lock instance. + +The difference between recursive readers and non-recursive readers is because: +recursive readers get blocked only by a write lock *holder*, while non-recursive +readers could get blocked by a write lock *waiter*. Considering the follow example: + + TASK A: TASK B: + + read_lock(X); + write_lock(X); + read_lock_2(X); + +Task A gets the reader (no matter whether recursive or non-recursive) on X via +read_lock() first. And when task B tries to acquire writer on X, it will block +and become a waiter for writer on X. Now if read_lock_2() is recursive readers, +task A will make progress, because writer waiters don't block recursive readers, +and there is no deadlock. However, if read_lock_2() is non-recursive readers, +it will get blocked by writer waiter B, and cause a self deadlock. + +Block conditions on readers/writers of the same lock instance: +-------------------------------------------------------------- +There are simply four block conditions: + +1. Writers block other writers. +2. Readers block writers. +3. Writers block both recursive readers and non-recursive readers. +4. And readers (recursive or not) don't block other recursive readers but + may block non-recursive readers (because of the potential co-existing + writer waiters) + +Block condition matrix, Y means the row blocks the column, and N means otherwise. + + | E | r | R | + +---+---+---+---+ + E | Y | Y | Y | + +---+---+---+---+ + r | Y | Y | N | + +---+---+---+---+ + R | Y | Y | N | + + (W: writers, r: non-recursive readers, R: recursive readers) + + +acquired recursively. Unlike non-recursive read locks, recursive read locks +only get blocked by current write lock *holders* other than write lock +*waiters*, for example: + + TASK A: TASK B: + + read_lock(X); + + write_lock(X); + + read_lock(X); + +is not a deadlock for recursive read locks, as while the task B is waiting for +the lock X, the second read_lock() doesn't need to wait because it's a recursive +read lock. However if the read_lock() is non-recursive read lock, then the above +case is a deadlock, because even if the write_lock() in TASK B cannot get the +lock, but it can block the second read_lock() in TASK A. + +Note that a lock can be a write lock (exclusive lock), a non-recursive read +lock (non-recursive shared lock) or a recursive read lock (recursive shared +lock), depending on the lock operations used to acquire it (more specifically, +the value of the 'read' parameter for lock_acquire()). In other words, a single +lock instance has three types of acquisition depending on the acquisition +functions: exclusive, non-recursive read, and recursive read. + +To be concise, we call that write locks and non-recursive read locks as +"non-recursive" locks and recursive read locks as "recursive" locks. + +Recursive locks don't block each other, while non-recursive locks do (this is +even true for two non-recursive read locks). A non-recursive lock can block the +corresponding recursive lock, and vice versa. 
+ +A deadlock case with recursive locks involved is as follow: + + TASK A: TASK B: + + read_lock(X); + read_lock(Y); + write_lock(Y); + write_lock(X); + +Task A is waiting for task B to read_unlock() Y and task B is waiting for task +A to read_unlock() X. + +Dependency types and strong dependency paths: +--------------------------------------------- +Lock dependencies record the orders of the acquisitions of a pair of locks, and +because there are 3 types for lockers, there are, in theory, 9 types of lock +dependencies, but we can show that 4 types of lock dependencies are enough for +deadlock detection. + +For each lock dependency: + + L1 -> L2 + +, which means lockdep has seen L1 held before L2 held in the same context at runtime. +And in deadlock detection, we care whether we could get blocked on L2 with L1 held, +IOW, whether there is a locker L3 that L1 blocks L3 and L2 gets blocked by L3. So +we only care about 1) what L1 blocks and 2) what blocks L2. As a result, we can combine +recursive readers and non-recursive readers for L1 (as they block the same types) and +we can combine writers and non-recursive readers for L2 (as they get blocked by the +same types). + +With the above combination for simplification, there are 4 types of dependency edges +in the lockdep graph: + +1) -(ER)->: exclusive writer to recursive reader dependency, "X -(ER)-> Y" means + X -> Y and X is a writer and Y is a recursive reader. + +2) -(EN)->: exclusive writer to non-recursive locker dependency, "X -(EN)-> Y" means + X -> Y and X is a writer and Y is either a writer or non-recursive reader. + +3) -(SR)->: shared reader to recursive reader dependency, "X -(SR)-> Y" means + X -> Y and X is a reader (recursive or not) and Y is a recursive reader. + +4) -(SN)->: shared reader to non-recursive locker dependency, "X -(SN)-> Y" means + X -> Y and X is a reader (recursive or not) and Y is either a writer or + non-recursive reader. + +Note that given two locks, they may have multiple dependencies between them, for example: + + TASK A: + + read_lock(X); + write_lock(Y); + ... + + TASK B: + + write_lock(X); + write_lock(Y); + +, we have both X -(SN)-> Y and X -(EN)-> Y in the dependency graph. + +We use -(xN)-> to represent edges that are either -(EN)-> or -(SN)->, the +similar for -(Ex)->, -(xR)-> and -(Sx)-> + +A "path" is a series of conjunct dependency edges in the graph. And we define a +"strong" path, which indicates the strong dependency throughout each dependency +in the path, as the path that doesn't have two conjunct edges (dependencies) as +-(xR)-> and -(Sx)->. In other words, a "strong" path is a path from a lock +walking to another through the lock dependencies, and if X -> Y -> Z is in the +path (where X, Y, Z are locks), and the walk from X to Y is through a -(SR)-> or +-(ER)-> dependency, the walk from Y to Z must not be through a -(SN)-> or +-(SR)-> dependency. + +We will see why the path is called "strong" in next section. + +Recursive Read Deadlock Detection: +---------------------------------- + +We now prove two things: + +Lemma 1: + +If there is a closed strong path (i.e. a strong circle), then there is a +combination of locking sequences that causes deadlock. I.e. a strong circle is +sufficient for deadlock detection. + +Lemma 2: + +If there is no closed strong path (i.e. strong circle), then there is no +combination of locking sequences that could cause deadlock. I.e. strong +circles are necessary for deadlock detection. 
+ +With these two Lemmas, we can easily say a closed strong path is both sufficient +and necessary for deadlocks, therefore a closed strong path is equivalent to +deadlock possibility. As a closed strong path stands for a dependency chain that +could cause deadlocks, so we call it "strong", considering there are dependency +circles that won't cause deadlocks. + +Proof for sufficiency (Lemma 1): + +Let's say we have a strong circle: + + L1 -> L2 ... -> Ln -> L1 + +, which means we have dependencies: + + L1 -> L2 + L2 -> L3 + ... + Ln-1 -> Ln + Ln -> L1 + +We now can construct a combination of locking sequences that cause deadlock: + +Firstly let's make one CPU/task get the L1 in L1 -> L2, and then another get +the L2 in L2 -> L3, and so on. After this, all of the Lx in Lx -> Lx+1 are +held by different CPU/tasks. + +And then because we have L1 -> L2, so the holder of L1 is going to acquire L2 +in L1 -> L2, however since L2 is already held by another CPU/task, plus L1 -> +L2 and L2 -> L3 are not -(xR)-> and -(Sx)-> (the definition of strong), which +means either L2 in L1 -> L2 is a non-recursive locker (blocked by anyone) or +the L2 in L2 -> L3, is writer (blocking anyone), therefore the holder of L1 +cannot get L2, it has to wait L2's holder to release. + +Moreover, we can have a similar conclusion for L2's holder: it has to wait L3's +holder to release, and so on. We now can prove that Lx's holder has to wait for +Lx+1's holder to release, and note that Ln+1 is L1, so we have a circular +waiting scenario and nobody can get progress, therefore a deadlock. + +Proof for necessary (Lemma 2): + +Lemma 2 is equivalent to: If there is a deadlock scenario, then there must be a +strong circle in the dependency graph. + +According to Wikipedia[1], if there is a deadlock, then there must be a circular +waiting scenario, means there are N CPU/tasks, where CPU/task P1 is waiting for +a lock held by P2, and P2 is waiting for a lock held by P3, ... and Pn is waiting +for a lock held by P1. Let's name the lock Px is waiting as Lx, so since P1 is waiting +for L1 and holding Ln, so we will have Ln -> L1 in the dependency graph. Similarly, +we have L1 -> L2, L2 -> L3, ..., Ln-1 -> Ln in the dependency graph, which means we +have a circle: + + Ln -> L1 -> L2 -> ... -> Ln + +, and now let's prove the circle is strong: + +For a lock Lx, Px contributes the dependency Lx-1 -> Lx and Px+1 contributes +the dependency Lx -> Lx+1, and since Px is waiting for Px+1 to release Lx, +so it's impossible that Lx on Px+1 is a reader and Lx on Px is a recursive +reader, because readers (no matter recursive or not) don't block recursive +readers, therefore Lx-1 -> Lx and Lx -> Lx+1 cannot be a -(xR)-> -(Sx)-> pair, +and this is true for any lock in the circle, therefore, the circle is strong. + +References: +----------- +[1]: https://en.wikipedia.org/wiki/Deadlock +[2]: Shibu, K. (2009). Intro To Embedded Systems (1st ed.). Tata McGraw-Hill From b11be024de164213f6338973d76ab9ab139120cd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:22 +0800 Subject: [PATCH 25/67] lockdep: Demagic the return value of BFS __bfs() could return four magic numbers: 1: search succeeds, but none match. 0: search succeeds, find one match. -1: search fails because of the cq is full. -2: search fails because a invalid node is found. This patch cleans things up by using a enum type for the return value of __bfs() and its friends, this improves the code readability of the code, and further, could help if we want to extend the BFS. 
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-4-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 155 ++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 66 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 77cd9e6520c4..462c68cfb378 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1471,28 +1471,58 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; + +/* + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} /* * Forward- or backward-dependency search, used for both circular dependency * checking and hardirq-unsafe/softirq-unsafe checking. */ -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int offset) +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) { struct lock_list *entry; struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; - int ret = 1; + enum bfs_result ret = BFS_RNOMATCH; lockdep_assert_locked(); if (match(source_entry, data)) { *target_entry = source_entry; - ret = 0; + ret = BFS_RMATCH; goto exit; } @@ -1506,7 +1536,7 @@ static int __bfs(struct lock_list *source_entry, while ((lock = __cq_dequeue(cq))) { if (!lock->class) { - ret = -2; + ret = BFS_EINVALIDNODE; goto exit; } @@ -1518,12 +1548,12 @@ static int __bfs(struct lock_list *source_entry, mark_lock_accessed(entry, lock); if (match(entry, data)) { *target_entry = entry; - ret = 0; + ret = BFS_RMATCH; goto exit; } if (__cq_enqueue(cq, entry)) { - ret = -1; + ret = BFS_EQUEUEFULL; goto exit; } cq_depth = __cq_get_elem_count(cq); @@ -1536,20 +1566,22 @@ exit: return ret; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_before)); @@ -1775,18 +1807,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) /* * Check that the 
dependency graph starting at can lead to - * or not. Print an error and return 0 if it does. + * or not. */ -static noinline int +static noinline enum bfs_result check_path(struct lock_class *target, struct lock_list *src_entry, struct lock_list **target_entry) { - int ret; + enum bfs_result ret; ret = __bfs_forwards(src_entry, (void *)target, class_equal, target_entry); - if (unlikely(ret < 0)) + if (unlikely(bfs_error(ret))) print_bfs_bug(ret); return ret; @@ -1797,13 +1829,13 @@ check_path(struct lock_class *target, struct lock_list *src_entry, * lead to . If it can, there is a circle when adding * -> dependency. * - * Print an error and return 0 if it does. + * Print an error and return BFS_RMATCH if it does. */ -static noinline int +static noinline enum bfs_result check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), @@ -1814,7 +1846,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); - if (unlikely(!ret)) { + if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { /* * If save_trace fails here, the printing might @@ -1836,12 +1868,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, * or not. If it can, -> dependency is already * in the graph. * - * Print an error and return 2 if it does or 1 if it does not. + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. */ -static noinline int +static noinline enum bfs_result check_redundant(struct held_lock *src, struct held_lock *target) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), @@ -1852,11 +1885,8 @@ check_redundant(struct held_lock *src, struct held_lock *target) ret = check_path(hlock_class(target), &src_entry, &target_entry); - if (!ret) { + if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); - ret = 2; - } else if (ret < 0) - ret = 0; return ret; } @@ -1886,17 +1916,14 @@ static inline int usage_match(struct lock_list *entry, void *mask) * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); @@ -1908,18 +1935,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. 
*/ -static int +static enum bfs_result find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); @@ -2247,7 +2268,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, struct lock_list *target_entry1; struct lock_list *target_entry; struct lock_list this, that; - int ret; + enum bfs_result ret; /* * Step 1: gather all hard/soft IRQs usages backward in an @@ -2257,7 +2278,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } @@ -2276,12 +2297,12 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, that.class = hlock_class(next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; /* * Step 3: we found a bad match! Now retrieve a lock from the backward @@ -2291,11 +2312,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (DEBUG_LOCKS_WARN_ON(ret == 1)) + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) return 1; /* @@ -2463,7 +2484,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, struct lock_trace **const trace) { struct lock_list *entry; - int ret; + enum bfs_result ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { /* @@ -2494,7 +2515,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * in the graph whose neighbours are to be checked. */ ret = check_noncircular(next, prev, trace); - if (unlikely(ret <= 0)) + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) return 0; if (!check_irq_usage(curr, prev, next)) @@ -2531,8 +2552,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Is the -> link redundant? 
*/ ret = check_redundant(prev, next); - if (ret != 1) - return ret; + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; #endif if (!*trace) { @@ -3436,19 +3459,19 @@ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); @@ -3463,19 +3486,19 @@ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; print_irq_inversion_bug(curr, &root, target_entry, this, 0, irqclass); From d563bc6ead9e79be37067d58509a605b67378184 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:23 +0800 Subject: [PATCH 26/67] lockdep: Make __bfs() visit every dependency until a match Currently, __bfs() will do a breadth-first search in the dependency graph and visit each lock class in the graph exactly once, so for example, in the following graph: A ---------> B | ^ | | +----------> C a __bfs() call starts at A, will visit B through dependency A -> B and visit C through dependency A -> C and that's it, IOW, __bfs() will not visit dependency C -> B. This is OK for now, as we only have strong dependencies in the dependency graph, so whenever there is a traverse path from A to B in __bfs(), it means A has strong dependencies to B (IOW, B depends on A strongly). So no need to visit all dependencies in the graph. However, as we are going to add recursive-read lock into the dependency graph, as a result, not all the paths mean strong dependencies, in the same example above, dependency A -> B may be a weak dependency and traverse A -> C -> B may be a strong dependency path. And with the old way of __bfs() (i.e. visiting every lock class exactly once), we will miss the strong dependency path, which will result into failing to find a deadlock. To cure this for the future, we need to find a way for __bfs() to visit each dependency, rather than each class, exactly once in the search until we find a match. The solution is simple: We used to mark lock_class::lockdep_dependency_gen_id to indicate a class has been visited in __bfs(), now we change the semantics a little bit: we now mark lock_class::lockdep_dependency_gen_id to indicate _all the dependencies_ in its lock_{after,before} have been visited in the __bfs() (note we only take one direction in a __bfs() search). In this way, every dependency is guaranteed to be visited until we find a match. 
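A minimal user-space model of this rule (hedged, not kernel code): a node's outgoing dependency list is walked at most once, but a node may be enqueued once per incoming edge, so the dependency C -> B in the graph above is still visited.

#include <stdio.h>

#define NR_NODES 3				/* A = 0, B = 1, C = 2 */

static const int edges[][2] = { {0, 1}, {0, 2}, {2, 1} };
static int walked[NR_NODES];			/* "dependency list already walked" */

static void bfs(int root)
{
	int queue[16], head = 0, tail = 0;

	queue[tail++] = root;
	while (head < tail) {
		int n = queue[head++];
		size_t i;

		if (walked[n]++)		/* list already walked: skip re-walking */
			continue;
		for (i = 0; i < sizeof(edges) / sizeof(edges[0]); i++) {
			if (edges[i][0] != n)
				continue;
			printf("visit dependency %c -> %c\n", 'A' + n, 'A' + edges[i][1]);
			queue[tail++] = edges[i][1];
		}
	}
}

int main(void)
{
	bfs(0);		/* prints A -> B, A -> C and C -> B: no dependency is missed */
	return 0;
}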
Note: the checks in mark_lock_accessed() and lock_accessed() are removed, because after this modification, we may call these two functions on @source_entry of __bfs(), which may not be the entry in "list_entries" Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-5-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 63 +++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 462c68cfb378..150686a71be0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1421,23 +1421,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ - lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ + lock->parent = parent; +} + static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1540,26 +1536,39 @@ static enum bfs_result __bfs(struct lock_list *source_entry, goto exit; } + /* + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); + head = get_dep_list(lock, offset); - list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = BFS_RMATCH; - goto exit; - } + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); - if (__cq_enqueue(cq, entry)) { - ret = BFS_EQUEUEFULL; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; + list_for_each_entry_rcu(entry, head, entry) { + unsigned int cq_depth; + + visit_lock_entry(entry, lock); + if (match(entry, data)) { + *target_entry = entry; + ret = BFS_RMATCH; + goto exit; } + + if (__cq_enqueue(cq, entry)) { + ret = BFS_EQUEUEFULL; + goto exit; + } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } exit: From bd76eca10de2eb9998d5125b08e8997cbf5508d5 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:24 +0800 Subject: [PATCH 27/67] lockdep: Reduce the size of lock_list::distance lock_list::distance is always not greater than MAX_LOCK_DEPTH (which is 48 right now), so a u16 will fit. This patch reduces the size of lock_list::distance to save space, so that we can introduce other fields to help detect recursive read lock deadlocks without increasing the size of lock_list structure. 
Suggested-by: Peter Zijlstra Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-6-boqun.feng@gmail.com --- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 7cae5ea00d59..22750102b5fe 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -54,7 +54,7 @@ struct lock_list { struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; - int distance; + u16 distance; /* * The parent field is used to implement breadth-first search, and the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 150686a71be0..668a983d6405 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1320,7 +1320,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, int distance, + unsigned long ip, u16 distance, const struct lock_trace *trace) { struct lock_list *entry; @@ -2489,7 +2489,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, + struct held_lock *next, u16 distance, struct lock_trace **const trace) { struct lock_list *entry; @@ -2622,7 +2622,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* From 3454a36d6a39186de508dd43df590a6363364176 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:25 +0800 Subject: [PATCH 28/67] lockdep: Introduce lock_list::dep To add recursive read locks into the dependency graph, we need to store the types of dependencies for the BFS later. There are four types of dependencies: * Exclusive -> Non-recursive dependencies: EN e.g. write_lock(prev) held and try to acquire write_lock(next) or non-recursive read_lock(next), which can be represented as "prev -(EN)-> next" * Shared -> Non-recursive dependencies: SN e.g. read_lock(prev) held and try to acquire write_lock(next) or non-recursive read_lock(next), which can be represented as "prev -(SN)-> next" * Exclusive -> Recursive dependencies: ER e.g. write_lock(prev) held and try to acquire recursive read_lock(next), which can be represented as "prev -(ER)-> next" * Shared -> Recursive dependencies: SR e.g. read_lock(prev) held and try to acquire recursive read_lock(next), which can be represented as "prev -(SR)-> next" So we use 4 bits for the presence of each type in lock_list::dep. Helper functions and macros are also introduced to convert a pair of locks into lock_list::dep bit and maintain the addition of different types of dependencies. 
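A worked example of this encoding as a small user-space model (hedged; struct held is a simplified stand-in for struct held_lock, not kernel code):

#include <assert.h>

struct held { int read; };	/* 0: writer, 1: non-recursive reader, 2: recursive reader */

/* bit0 is prev->read == 0, bit1 is next->read != 2, as described above */
static unsigned int calc_dep_bit(const struct held *prev, const struct held *next)
{
	return (prev->read == 0) + ((next->read != 2) << 1);
}

int main(void)
{
	struct held w = { 0 }, r = { 1 }, rr = { 2 };

	assert(calc_dep_bit(&rr, &rr) == 0);	/* DEP_SR_BIT: reader -> recursive reader */
	assert(calc_dep_bit(&w, &rr) == 1);	/* DEP_ER_BIT: writer -> recursive reader */
	assert(calc_dep_bit(&r, &w) == 2);	/* DEP_SN_BIT: reader -> non-recursive locker */
	assert(calc_dep_bit(&w, &w) == 3);	/* DEP_EN_BIT: writer -> non-recursive locker */
	return 0;
}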
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-7-boqun.feng@gmail.com --- include/linux/lockdep.h | 2 + kernel/locking/lockdep.c | 92 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 22750102b5fe..35c8bb0108dd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -55,6 +55,8 @@ struct lock_list { struct lock_class *links_to; const struct lock_trace *trace; u16 distance; + /* bitmap of different dependencies from head to this */ + u8 dep; /* * The parent field is used to implement breadth-first search, and the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 668a983d6405..16ad1b74aa30 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1320,7 +1320,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, u16 distance, + unsigned long ip, u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -1334,6 +1334,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; entry->trace = trace; /* @@ -1498,6 +1499,57 @@ static inline bool bfs_error(enum bfs_result res) return res < 0; } +/* + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 + */ +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) +{ + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} + /* * Forward- or backward-dependency search, used for both circular dependency * checking and hardirq-unsafe/softirq-unsafe checking. @@ -2552,7 +2604,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 1; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. 
+ * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * ::locks_after contains while + * ::locks_before doesn't contain . In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* is not found in ::locks_before */ + return 0; } } @@ -2579,14 +2659,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_dep(prev, next), + *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_depb(prev, next), + *trace); if (!ret) return 0; From 6971c0f345620aae5e6172207a57b7524603a34e Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:26 +0800 Subject: [PATCH 29/67] lockdep: Extend __bfs() to work with multiple types of dependencies Now we have four types of dependencies in the dependency graph, and not all the pathes carry real dependencies (the dependencies that may cause a deadlock), for example: Given lock A and B, if we have: CPU1 CPU2 ============= ============== write_lock(A); read_lock(B); read_lock(B); write_lock(A); (assuming read_lock(B) is a recursive reader) then we have dependencies A -(ER)-> B, and B -(SN)-> A, and a dependency path A -(ER)-> B -(SN)-> A. In lockdep w/o recursive locks, a dependency path from A to A means a deadlock. However, the above case is obviously not a deadlock, because no one holds B exclusively, therefore no one waits for the other to release B, so who get A first in CPU1 and CPU2 will run non-blockingly. As a result, dependency path A -(ER)-> B -(SN)-> A is not a real/strong dependency that could cause a deadlock. From the observation above, we know that for a dependency path to be real/strong, no two adjacent dependencies can be as -(*R)-> -(S*)->. Now our mission is to make __bfs() traverse only the strong dependency paths, which is simple: we record whether we only have -(*R)-> for the previous lock_list of the path in lock_list::only_xr, and when we pick a dependency in the traverse, we 1) filter out -(S*)-> dependency if the previous lock_list only has -(*R)-> dependency (i.e. ->only_xr is true) and 2) set the next lock_list::only_xr to true if we only have -(*R)-> left after we filter out dependencies based on 1), otherwise, set it to false. With this extension for __bfs(), we now need to initialize the root of __bfs() properly (with a correct ->only_xr), to do so, we introduce some helper functions, which also cleans up a little bit for the __bfs() root initialization code. 
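The filtering rule itself, as a hedged user-space model (reusing the DEP_* bit layout from the previous patch; not kernel code), applied to the CPU1/CPU2 example above:

#include <assert.h>

#define DEP_SR_MASK (1U << 0)
#define DEP_ER_MASK (1U << 1)
#define DEP_SN_MASK (1U << 2)
#define DEP_EN_MASK (1U << 3)

static unsigned int filter_strong(unsigned int dep, int prev_only_xr)
{
	if (prev_only_xr)
		dep &= ~(DEP_SR_MASK | DEP_SN_MASK);	/* -(*R)-> -(S*)-> is not strong */
	return dep;
}

static int only_xr(unsigned int dep)
{
	return !(dep & (DEP_SN_MASK | DEP_EN_MASK));	/* only -(*R)-> edges remain */
}

int main(void)
{
	unsigned int a_to_b = DEP_ER_MASK;	/* CPU1: write_lock(A), then recursive read_lock(B) */
	unsigned int b_to_a = DEP_SN_MASK;	/* CPU2: read_lock(B), then write_lock(A) */
	int b_only_xr = only_xr(filter_strong(a_to_b, 0));

	/* B -(SN)-> A is filtered out, so there is no strong circle and no deadlock report */
	assert(filter_strong(b_to_a, b_only_xr) == 0);
	return 0;
}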
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-8-boqun.feng@gmail.com --- include/linux/lockdep.h | 2 + kernel/locking/lockdep.c | 113 ++++++++++++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 19 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 35c8bb0108dd..57d642d378c7 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -57,6 +57,8 @@ struct lock_list { u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; + /* used by BFS to record whether "prev -> this" only has -(*R)-> */ + u8 only_xr; /* * The parent field is used to implement breadth-first search, and the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 16ad1b74aa30..5abc227db0e9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1551,8 +1551,72 @@ static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) } /* - * Forward- or backward-dependency search, used for both circular dependency - * checking and hardirq-unsafe/softirq-unsafe checking. + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. + */ +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) +{ + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that -> @hlock and @hlock -> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that -> @hlock and @hlock -> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. 
*/ static enum bfs_result __bfs(struct lock_list *source_entry, void *data, @@ -1582,6 +1646,7 @@ static enum bfs_result __bfs(struct lock_list *source_entry, __cq_enqueue(cq, source_entry); while ((lock = __cq_dequeue(cq))) { + bool prev_only_xr; if (!lock->class) { ret = BFS_EINVALIDNODE; @@ -1602,10 +1667,26 @@ static enum bfs_result __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + prev_only_xr = lock->only_xr; list_for_each_entry_rcu(entry, head, entry) { unsigned int cq_depth; + u8 dep = entry->dep; + + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); + + /* If nothing left, we skip */ + if (!dep) + continue; + + /* If there are only -(*R)-> left, set that for the next step */ + entry->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); visit_lock_entry(entry, lock); if (match(entry, data)) { @@ -1827,8 +1908,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1854,8 +1934,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1898,10 +1977,9 @@ check_noncircular(struct held_lock *src, struct held_lock *target, { enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); @@ -1937,10 +2015,9 @@ check_redundant(struct held_lock *src, struct held_lock *target) { enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_redundant_checks); @@ -3556,8 +3633,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, struct lock_list root; struct lock_list *target_entry; - root.parent = NULL; - root.class = hlock_class(this); + bfs_init_root(&root, this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); @@ -3583,8 +3659,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, struct lock_list root; struct lock_list *target_entry; - root.parent = NULL; - root.class = hlock_class(this); + bfs_init_rootb(&root, this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); From 61775ed243433ff0556c4f76905929fe01e92922 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:27 +0800 Subject: [PATCH 30/67] lockdep: Make __bfs(.match) return bool The "match" parameter of __bfs() is used for checking whether we hit a match in the search, therefore it should return a boolean value rather than an integer for better readability. This patch then changes the return type of the function parameter and the match functions to bool. 
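
For illustration, one of the converted match callbacks (quoted from the hunk
below) now reads:

    static inline bool class_equal(struct lock_list *entry, void *data)
    {
        return entry->class == data;
    }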
Suggested-by: Peter Zijlstra Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-9-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5abc227db0e9..78cd74d77be9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1620,7 +1620,7 @@ static inline void bfs_init_rootb(struct lock_list *lock, */ static enum bfs_result __bfs(struct lock_list *source_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, int offset) { @@ -1711,7 +1711,7 @@ exit: static inline enum bfs_result __bfs_forwards(struct lock_list *src_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, @@ -1722,7 +1722,7 @@ __bfs_forwards(struct lock_list *src_entry, static inline enum bfs_result __bfs_backwards(struct lock_list *src_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, @@ -1833,7 +1833,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline int class_equal(struct lock_list *entry, void *data) +static inline bool class_equal(struct lock_list *entry, void *data) { return entry->class == data; } @@ -1888,10 +1888,10 @@ static noinline void print_bfs_bug(int ret) WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) @@ -2032,11 +2032,11 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS -static inline int usage_accumulate(struct lock_list *entry, void *mask) +static inline bool usage_accumulate(struct lock_list *entry, void *mask) { *(unsigned long *)mask |= entry->class->usage_mask; - return 0; + return false; } /* @@ -2045,9 +2045,9 @@ static inline int usage_accumulate(struct lock_list *entry, void *mask) * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *mask) +static inline bool usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & *(unsigned long *)mask; + return !!(entry->class->usage_mask & *(unsigned long *)mask); } /* From 9de0c9bbcedf752e762c67f105bff342e30f9105 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:28 +0800 Subject: [PATCH 31/67] lockdep: Support deadlock detection for recursive read locks in check_noncircular() Currently, lockdep only has limit support for deadlock detection for recursive read locks. This patch support deadlock detection for recursive read locks. The basic idea is: We are about to add dependency B -> A in to the dependency graph, we use check_noncircular() to find whether we have a strong dependency path A -> .. -> B so that we have a strong dependency circle (a closed strong dependency path): A -> .. 
-> B -> A , which doesn't have two adjacent dependencies as -(*R)-> L -(S*)->. Since A -> .. -> B is already a strong dependency path, so if either B -> A is -(E*)-> or A -> .. -> B is -(*N)->, the circle A -> .. -> B -> A is strong, otherwise not. So we introduce a new match function hlock_conflict() to replace the class_equal() for the deadlock check in check_noncircular(). Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-10-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 43 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78cd74d77be9..9160f1ddc435 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1838,10 +1838,37 @@ static inline bool class_equal(struct lock_list *entry, void *data) return entry->class == data; } +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + static noinline void print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1950,13 +1977,13 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) * or not. */ static noinline enum bfs_result -check_path(struct lock_class *target, struct lock_list *src_entry, +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { enum bfs_result ret; - ret = __bfs_forwards(src_entry, (void *)target, class_equal, - target_entry); + ret = __bfs_forwards(src_entry, target, match, target_entry); if (unlikely(bfs_error(ret))) print_bfs_bug(ret); @@ -1983,7 +2010,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, debug_atomic_inc(nr_cyclic_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, &target_entry); if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { @@ -2021,7 +2048,7 @@ check_redundant(struct held_lock *src, struct held_lock *target) debug_atomic_inc(nr_redundant_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, class_equal, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); From 68e305678583f13a67e2ce22088c2520bd4f97b4 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:29 +0800 Subject: [PATCH 32/67] lockdep: Adjust check_redundant() for recursive read change check_redundant() will report redundancy if it finds a path could replace the about-to-add dependency in the BFS search. 
With recursive read lock changes, we certainly need to change the match function for the check_redundant(), because the path needs to match not only the lock class but also the dependency kinds. For example, if the about-to-add dependency @prev -> @next is A -(SN)-> B, and we find a path A -(S*)-> .. -(*R)->B in the dependency graph with __bfs() (for simplicity, we can also say we find an -(SR)-> path from A to B), we can not replace the dependency with that path in the BFS search. Because the -(SN)-> dependency can make a strong path with a following -(S*)-> dependency, however an -(SR)-> path cannot. Further, we can replace an -(SN)-> dependency with a -(EN)-> path, that means if we find a path which is stronger than or equal to the about-to-add dependency, we can report the redundancy. By "stronger", it means both the start and the end of the path are not weaker than the start and the end of the dependency (E is "stronger" than S and N is "stronger" than R), so that we can replace the dependency with that path. To make sure we find a path whose start point is not weaker than the about-to-add dependency, we use a trick: the ->only_xr of the root (start point) of __bfs() is initialized as @prev-> == 0, therefore if @prev is E, __bfs() will pick only -(E*)-> for the first dependency, otherwise, __bfs() can pick -(E*)-> or -(S*)-> for the first dependency. To make sure we find a path whose end point is not weaker than the about-to-add dependency, we replace the match function for __bfs() check_redundant(), we check for the case that either @next is R (anything is not weaker than it) or the end point of the path is N (which is not weaker than anything). Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-11-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 47 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9160f1ddc435..42e2f1fbbd5f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1833,9 +1833,39 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline bool class_equal(struct lock_list *entry, void *data) +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. 
-> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } /* @@ -2045,10 +2075,21 @@ check_redundant(struct held_lock *src, struct held_lock *target) struct lock_list src_entry; bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than -> . So if is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; debug_atomic_inc(nr_redundant_checks); - ret = check_path(target, &src_entry, class_equal, &target_entry); + ret = check_path(target, &src_entry, hlock_equal, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); From f08e3888574d490b31481eef6d84c61bedba7a47 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:30 +0800 Subject: [PATCH 33/67] lockdep: Fix recursive read lock related safe->unsafe detection Currently, in safe->unsafe detection, lockdep misses the fact that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ usage may cause deadlock too, for example: P1 P2 write_lock(l1); read_lock(l2); write_lock(l2); read_lock(l1); Actually, all of the following cases may cause deadlocks: LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ To fix this, we need to 1) change the calculation of exclusive_mask() so that READ bits are not dropped and 2) always call usage() in mark_lock_irq() to check usage deadlocks, even when the new usage of the lock is READ. Besides, adjust usage_match() and usage_acculumate() to recursive read lock changes. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-12-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 192 +++++++++++++++++++++++++++++---------- 1 file changed, 143 insertions(+), 49 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 42e2f1fbbd5f..6644974a4f5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2100,22 +2100,72 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B + * + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... 
-> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. + */ + +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ static inline bool usage_accumulate(struct lock_list *entry, void *mask) { - *(unsigned long *)mask |= entry->class->usage_mask; + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); return false; } /* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. */ - static inline bool usage_match(struct lock_list *entry, void *mask) { - return !!(entry->class->usage_mask & *(unsigned long *)mask); + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } /* @@ -2406,17 +2456,39 @@ static unsigned long invert_dir_mask(unsigned long mask) } /* - * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all - * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). - * And then mask out all bitnr0. + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * + * write_lock(l1); + * read_lock(l2); + * write_lock(l2); + * + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 
2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). */ static unsigned long exclusive_mask(unsigned long mask) { unsigned long excl = invert_dir_mask(mask); - /* Strip read */ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; - excl &= ~LOCKF_IRQ_READ; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; } @@ -2433,6 +2505,7 @@ static unsigned long original_mask(unsigned long mask) unsigned long excl = invert_dir_mask(mask); /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; @@ -2447,14 +2520,24 @@ static int find_exclusive_match(unsigned long mask, enum lock_usage_bit *bitp, enum lock_usage_bit *excl_bitp) { - int bit, excl; + int bit, excl, excl_read; for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; if (excl_mask & lock_flag(excl)) { *bitp = bit; *excl_bitp = excl; return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; } } return -1; @@ -2480,8 +2563,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, * Step 1: gather all hard/soft IRQs usages backward in an * accumulated usage mask. */ - this.parent = NULL; - this.class = hlock_class(prev); + bfs_init_rootb(&this, prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); if (bfs_error(ret)) { @@ -2499,8 +2581,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ forward_mask = exclusive_mask(usage_mask); - that.parent = NULL; - that.class = hlock_class(next); + bfs_init_root(&that, next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); if (bfs_error(ret)) { @@ -3695,14 +3776,16 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); bfs_init_root(&root, this); - ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); + ret = find_usage_forwards(&root, usage_mask, &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -3710,8 +3793,15 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, if (ret == BFS_RNOMATCH) return 1; - print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } + return 0; } @@ -3721,14 +3811,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + 
unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); bfs_init_rootb(&root, this); - ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); + ret = find_usage_backwards(&root, usage_mask, &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -3736,8 +3828,15 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, if (ret == BFS_RNOMATCH) return 1; - print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } + return 0; } @@ -3776,8 +3875,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -3802,16 +3899,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int read = new_bit & LOCK_USAGE_READ_MASK; int dir = new_bit & LOCK_USAGE_DIR_MASK; - /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - /* * Validate that this particular lock does not have conflicting * usage states. @@ -3819,24 +3906,31 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!valid_state(curr, this, new_bit, excl_bit)) return 0; + /* + * Check for read in write conflicts + */ + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) + return 0; + + /* * Validate that the lock dependencies don't have conflicting usage * states. */ - if ((!read || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) - return 0; - - /* - * Check for read in write conflicts - */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, - state_name(new_bit + LOCK_USAGE_READ_MASK))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } From 621c9dac0eea7607cb9a57cc9ba47fbcd4e644c9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:31 +0800 Subject: [PATCH 34/67] lockdep: Add recursive read locks into dependency graph Since we have all the fundamental to handle recursive read locks, we now add them into the dependency graph. 
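
One scenario that needs these read-triggered dependencies in the graph is the
three-task cycle exercised by the W1R2_W2R3_W3R1 selftest added later in this
series (a sketch, with X, Y, Z being rwlocks):

    T0                      T1                      T2
    write_lock(&X);
                            write_lock(&Y);
                                                    write_lock(&Z);
    read_lock(&Y);          read_lock(&Z);          read_lock(&X);

    /* X -(ER)-> Y, Y -(ER)-> Z, Z -(ER)-> X: a strong cycle, hence a
     * deadlock, even though every waiting acquisition is a read. */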
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-13-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6644974a4f5a..b87766e080f5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2808,16 +2808,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!check_irq_usage(curr, prev, next)) return 0; - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; /* * Is the -> dependency already present? * @@ -2935,13 +2925,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, - &trace); + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); if (!ret) return 0; From d4f200e579e96051f1f081f795820787826eb234 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:32 +0800 Subject: [PATCH 35/67] lockdep/selftest: Add a R-L/L-W test case specific to chain cache behavior As our chain cache doesn't differ read/write locks, so even we can detect a read-lock/lock-write deadlock in check_noncircular(), we can still be fooled if a read-lock/lock-read case(which is not a deadlock) comes first. So introduce this test case to test specific to the chain cache behavior on detecting recursive read lock related deadlocks. 
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-14-boqun.feng@gmail.com --- lib/locking-selftest.c | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index caadc4dd3368..002d1ec09852 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -396,6 +396,49 @@ static void rwsem_ABBA1(void) MU(Y1); // should fail } +/* + * read_lock(A) + * spin_lock(B) + * spin_lock(B) + * write_lock(A) + * + * This test case is aimed at poking whether the chain cache prevents us from + * detecting a read-lock/lock-write deadlock: if the chain cache doesn't differ + * read/write locks, the following case may happen + * + * { read_lock(A)->lock(B) dependency exists } + * + * P0: + * lock(B); + * read_lock(A); + * + * { Not a deadlock, B -> A is added in the chain cache } + * + * P1: + * lock(B); + * write_lock(A); + * + * { B->A found in chain cache, not reported as a deadlock } + * + */ +static void rlock_chaincache_ABBA1(void) +{ + RL(X1); + L(Y1); + U(Y1); + RU(X1); + + L(Y1); + RL(X1); + RU(X1); + U(Y1); + + L(Y1); + WL(X1); + WU(X1); + U(Y1); // should fail +} + /* * read_lock(A) * spin_lock(B) @@ -2062,6 +2105,10 @@ void locking_selftest(void) pr_cont(" |"); dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM); + print_testname("chain cached mixed R-L/L-W ABBA"); + pr_cont(" |"); + dotest(rlock_chaincache_ABBA1, FAILURE, LOCKTYPE_RWLOCK); + printk(" --------------------------------------------------------------------------\n"); /* From f611e8cf98ec908b9c2c0da6064a660fc6022487 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:33 +0800 Subject: [PATCH 36/67] lockdep: Take read/write status in consideration when generate chainkey Currently, the chainkey of a lock chain is a hash sum of the class_idx of all the held locks, the read/write status are not taken in to consideration while generating the chainkey. This could result into a problem, if we have: P1() { read_lock(B); lock(A); } P2() { lock(A); read_lock(B); } P3() { lock(A); write_lock(B); } , and P1(), P2(), P3() run one by one. And when running P2(), lockdep detects such a lock chain A -> B is not a deadlock, then it's added in the chain cache, and then when running P3(), even if it's a deadlock, we could miss it because of the hit of chain cache. This could be confirmed by self testcase "chain cached mixed R-L/L-W ". To resolve this, we use concept "hlock_id" to generate the chainkey, the hlock_id is a tuple (hlock->class_idx, hlock->read), which fits in a u16 type. With this, the chainkeys are different is the lock sequences have the same locks but different read/write status. Besides, since we use "hlock_id" to generate chainkeys, the chain_hlocks array now store the "hlock_id"s rather than lock_class indexes. 
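
In stand-alone form, the packing amounts to the sketch below (plain C;
KEYS_BITS is an assumption of this illustration standing in for
MAX_LOCKDEP_KEYS_BITS):

    #include <stdint.h>

    #define KEYS_BITS 13    /* assumed class-index width for this sketch */

    /* hlock_id: class index in the low bits, hlock->read (0/1/2) above. */
    static inline uint16_t hlock_id(uint16_t class_idx, uint16_t read)
    {
        return class_idx | (read << KEYS_BITS);
    }

    /* Recover the class index when walking chain_hlocks[]. */
    static inline uint16_t chain_hlock_class_idx(uint16_t id)
    {
        return id & ((1u << KEYS_BITS) - 1);
    }

So two lock sequences that take the same classes but with different read/write
states now hash to different chain keys and are cached as distinct chains.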
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-15-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 53 ++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b87766e080f5..cccf4bc759c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -371,6 +371,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; +/* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + /* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. @@ -3202,7 +3217,10 @@ static inline void free_chain_hlocks(int base, int size) struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx - 1; } /* @@ -3228,12 +3246,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -3250,12 +3268,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne hlock_next->irq_context); for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } @@ -3263,14 +3281,14 @@ static void print_chain_keys_chain(struct lock_chain *chain) { int i; u64 chain_key = INITIAL_CHAIN_KEY; - int class_id; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); printk("\n"); } } @@ -3319,7 +3337,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -3368,7 +3386,6 @@ static inline int add_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock_class(hlock); 
struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; @@ -3411,11 +3428,11 @@ static inline int add_chain_cache(struct task_struct *curr, chain->base = j; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; + int lock_id = hlock_id(curr->held_locks + i); chain_hlocks[chain->base + j] = lock_id; } - chain_hlocks[chain->base + j] = class - lock_classes; + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(chain->irq_context); @@ -3602,7 +3619,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = INITIAL_CHAIN_KEY; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -4749,7 +4766,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { print_lock_nested_lock_not_held(curr, hlock, ip); @@ -5648,7 +5665,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, int i; for (i = chain->base; i < chain->base + chain->depth; i++) { - if (chain_hlocks[i] != class - lock_classes) + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) continue; /* * Each lock class occurs at most once in a lock chain so once From 31e0d747708272356bee9b6a1b90c1e6525b0f6d Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:34 +0800 Subject: [PATCH 37/67] lockdep/selftest: Unleash irq_read_recursion2 and add more Now since we can handle recursive read related irq inversion deadlocks correctly, uncomment the irq_read_recursion2 and add more testcases. 
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-16-boqun.feng@gmail.com --- lib/locking-selftest.c | 59 +++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 002d1ec09852..f65a658cc9e3 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1053,20 +1053,28 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) #define E3() \ \ IRQ_ENTER(); \ - RL(A); \ + LOCK(A); \ L(B); \ U(B); \ - RU(A); \ + UNLOCK(A); \ IRQ_EXIT(); /* - * Generate 12 testcases: + * Generate 24 testcases: */ #include "locking-selftest-hardirq.h" -GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard) +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) #include "locking-selftest-softirq.h" -GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) #undef E1 #undef E2 @@ -1080,8 +1088,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) \ IRQ_DISABLE(); \ L(B); \ - WL(A); \ - WU(A); \ + LOCK(A); \ + UNLOCK(A); \ U(B); \ IRQ_ENABLE(); @@ -1098,13 +1106,21 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) IRQ_EXIT(); /* - * Generate 12 testcases: + * Generate 24 testcases: */ #include "locking-selftest-hardirq.h" -// GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard) +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) #include "locking-selftest-softirq.h" -// GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) @@ -1257,6 +1273,25 @@ static inline void print_testname(const char *testname) dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ pr_cont("\n"); +#define DO_TESTCASE_2RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ + pr_cont(" |"); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + pr_cont("\n"); + +#define DO_TESTCASE_2x2RW(desc, name, nr) \ + DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ + DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ + +#define DO_TESTCASE_6x2x2RW(desc, name) \ + DO_TESTCASE_2x2RW(desc, name, 123); \ + DO_TESTCASE_2x2RW(desc, name, 132); \ + DO_TESTCASE_2x2RW(desc, name, 213); \ + DO_TESTCASE_2x2RW(desc, name, 231); \ + DO_TESTCASE_2x2RW(desc, name, 312); \ + DO_TESTCASE_2x2RW(desc, name, 321); + #define DO_TESTCASE_6(desc, name) \ print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ @@ -2121,8 +2156,8 @@ void locking_selftest(void) DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); DO_TESTCASE_6x6RW("irq lock-inversion", irq_inversion); - DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); -// DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); + DO_TESTCASE_6x2x2RW("irq 
read-recursion", irq_read_recursion); + DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); ww_tests(); From 8ef7ca75120a39167def40f41daefee013c4b5af Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:35 +0800 Subject: [PATCH 38/67] lockdep/selftest: Add more recursive read related test cases Add those four test cases: 1. X --(ER)--> Y --(ER)--> Z --(ER)--> X is deadlock. 2. X --(EN)--> Y --(SR)--> Z --(ER)--> X is deadlock. 3. X --(EN)--> Y --(SR)--> Z --(SN)--> X is not deadlock. 4. X --(ER)--> Y --(SR)--> Z --(EN)--> X is not deadlock. Those self testcases are valuable for the development of supporting recursive read related deadlock detection. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-17-boqun.feng@gmail.com --- lib/locking-selftest.c | 161 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index f65a658cc9e3..76c314ab4f03 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1034,6 +1034,133 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) #undef E2 #undef E3 +/* + * write-read / write-read / write-read deadlock even if read is recursive + */ + +#define E1() \ + \ + WL(X1); \ + RL(Y1); \ + RU(Y1); \ + WU(X1); + +#define E2() \ + \ + WL(Y1); \ + RL(Z1); \ + RU(Z1); \ + WU(Y1); + +#define E3() \ + \ + WL(Z1); \ + RL(X1); \ + RU(X1); \ + WU(Z1); + +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(W1R2_W2R3_W3R1) + +#undef E1 +#undef E2 +#undef E3 + +/* + * write-write / read-read / write-read deadlock even if read is recursive + */ + +#define E1() \ + \ + WL(X1); \ + WL(Y1); \ + WU(Y1); \ + WU(X1); + +#define E2() \ + \ + RL(Y1); \ + RL(Z1); \ + RU(Z1); \ + RU(Y1); + +#define E3() \ + \ + WL(Z1); \ + RL(X1); \ + RU(X1); \ + WU(Z1); + +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_W3R1) + +#undef E1 +#undef E2 +#undef E3 + +/* + * write-write / read-read / read-write is not deadlock when read is recursive + */ + +#define E1() \ + \ + WL(X1); \ + WL(Y1); \ + WU(Y1); \ + WU(X1); + +#define E2() \ + \ + RL(Y1); \ + RL(Z1); \ + RU(Z1); \ + RU(Y1); + +#define E3() \ + \ + RL(Z1); \ + WL(X1); \ + WU(X1); \ + RU(Z1); + +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(W1R2_R2R3_W3W1) + +#undef E1 +#undef E2 +#undef E3 + +/* + * write-read / read-read / write-write is not deadlock when read is recursive + */ + +#define E1() \ + \ + WL(X1); \ + RL(Y1); \ + RU(Y1); \ + WU(X1); + +#define E2() \ + \ + RL(Y1); \ + RL(Z1); \ + RU(Z1); \ + RU(Y1); + +#define E3() \ + \ + WL(Z1); \ + WL(X1); \ + WU(X1); \ + WU(Z1); + +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1) + +#undef E1 +#undef E2 +#undef E3 /* * read-lock / write-lock recursion that is actually safe. 
*/ @@ -1259,6 +1386,19 @@ static inline void print_testname(const char *testname) dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ pr_cont("\n"); +#define DO_TESTCASE_1RR(desc, name, nr) \ + print_testname(desc"/"#nr); \ + pr_cont(" |"); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + pr_cont("\n"); + +#define DO_TESTCASE_1RRB(desc, name, nr) \ + print_testname(desc"/"#nr); \ + pr_cont(" |"); \ + dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + pr_cont("\n"); + + #define DO_TESTCASE_3(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ @@ -1368,6 +1508,22 @@ static inline void print_testname(const char *testname) DO_TESTCASE_2IB(desc, name, 312); \ DO_TESTCASE_2IB(desc, name, 321); +#define DO_TESTCASE_6x1RR(desc, name) \ + DO_TESTCASE_1RR(desc, name, 123); \ + DO_TESTCASE_1RR(desc, name, 132); \ + DO_TESTCASE_1RR(desc, name, 213); \ + DO_TESTCASE_1RR(desc, name, 231); \ + DO_TESTCASE_1RR(desc, name, 312); \ + DO_TESTCASE_1RR(desc, name, 321); + +#define DO_TESTCASE_6x1RRB(desc, name) \ + DO_TESTCASE_1RRB(desc, name, 123); \ + DO_TESTCASE_1RRB(desc, name, 132); \ + DO_TESTCASE_1RRB(desc, name, 213); \ + DO_TESTCASE_1RRB(desc, name, 231); \ + DO_TESTCASE_1RRB(desc, name, 312); \ + DO_TESTCASE_1RRB(desc, name, 321); + #define DO_TESTCASE_6x6(desc, name) \ DO_TESTCASE_6I(desc, name, 123); \ DO_TESTCASE_6I(desc, name, 132); \ @@ -2144,6 +2300,11 @@ void locking_selftest(void) pr_cont(" |"); dotest(rlock_chaincache_ABBA1, FAILURE, LOCKTYPE_RWLOCK); + DO_TESTCASE_6x1RRB("rlock W1R2/W2R3/W3R1", W1R2_W2R3_W3R1); + DO_TESTCASE_6x1RRB("rlock W1W2/R2R3/W3R1", W1W2_R2R3_W3R1); + DO_TESTCASE_6x1RR("rlock W1W2/R2R3/R3W1", W1W2_R2R3_R3W1); + DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); + printk(" --------------------------------------------------------------------------\n"); /* From 108dc42ed3507fe06214d51ab15fca7771df8bbd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:36 +0800 Subject: [PATCH 39/67] Revert "locking/lockdep/selftests: Fix mixed read-write ABBA tests" This reverts commit d82fed75294229abc9d757f08a4817febae6c4f4. Since we now could handle mixed read-write deadlock detection well, the self tests could be detected as expected, no need to use this work-around. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-18-boqun.feng@gmail.com --- lib/locking-selftest.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 76c314ab4f03..4264cf4b60bb 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -2273,14 +2273,6 @@ void locking_selftest(void) print_testname("mixed read-lock/lock-write ABBA"); pr_cont(" |"); dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK); -#ifdef CONFIG_PROVE_LOCKING - /* - * Lockdep does indeed fail here, but there's nothing we can do about - * that now. Don't kill lockdep for it. - */ - unexpected_testcase_failures--; -#endif - pr_cont(" |"); dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM); From ad56450db86413ff911eb527b5a49e04a4345e61 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:37 +0800 Subject: [PATCH 40/67] locking/selftest: Add test cases for queued_read_lock() Add two self test cases for the following case: P0: P1: P2: spin_lock_irq(&slock) read_lock(&rwlock) write_lock_irq(&rwlock) read_lock(&rwlock) spin_lock(&slock) , which is a deadlock, as the read_lock() on P0 cannot get the lock because of the fairness. 
P0: P1: P2: spin_lock(&slock) read_lock(&rwlock) write_lock(&rwlock) read_lock(&rwlock) spin_lock_irq(&slock) , which is not a deadlock, as the read_lock() on P0 can get the lock because it could use the unfair fastpass. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-19-boqun.feng@gmail.com --- lib/locking-selftest.c | 104 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 4264cf4b60bb..17f8f6f37165 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -2201,6 +2201,108 @@ static void ww_tests(void) pr_cont("\n"); } + +/* + * + * read_lock(&A); + * + * spin_lock(&B); + * spin_lock(&B); + * read_lock(&A); + * + * is a deadlock. + */ +static void queued_read_lock_hardirq_RE_Er(void) +{ + HARDIRQ_ENTER(); + read_lock(&rwlock_A); + LOCK(B); + UNLOCK(B); + read_unlock(&rwlock_A); + HARDIRQ_EXIT(); + + HARDIRQ_DISABLE(); + LOCK(B); + read_lock(&rwlock_A); + read_unlock(&rwlock_A); + UNLOCK(B); + HARDIRQ_ENABLE(); +} + +/* + * + * spin_lock(&B); + * + * read_lock(&A); + * read_lock(&A); + * spin_lock(&B); + * + * is not a deadlock. + */ +static void queued_read_lock_hardirq_ER_rE(void) +{ + HARDIRQ_ENTER(); + LOCK(B); + read_lock(&rwlock_A); + read_unlock(&rwlock_A); + UNLOCK(B); + HARDIRQ_EXIT(); + + HARDIRQ_DISABLE(); + read_lock(&rwlock_A); + LOCK(B); + UNLOCK(B); + read_unlock(&rwlock_A); + HARDIRQ_ENABLE(); +} + +/* + * + * spin_lock(&B); + * read_lock(&A); + * + * spin_lock(&B); + * read_lock(&A); + * + * is a deadlock. Because the two read_lock()s are both non-recursive readers. + */ +static void queued_read_lock_hardirq_inversion(void) +{ + + HARDIRQ_ENTER(); + LOCK(B); + UNLOCK(B); + HARDIRQ_EXIT(); + + HARDIRQ_DISABLE(); + LOCK(B); + read_lock(&rwlock_A); + read_unlock(&rwlock_A); + UNLOCK(B); + HARDIRQ_ENABLE(); + + read_lock(&rwlock_A); + read_unlock(&rwlock_A); +} + +static void queued_read_lock_tests(void) +{ + printk(" --------------------------------------------------------------------------\n"); + printk(" | queued read lock tests |\n"); + printk(" ---------------------------\n"); + print_testname("hardirq read-lock/lock-read"); + dotest(queued_read_lock_hardirq_RE_Er, FAILURE, LOCKTYPE_RWLOCK); + pr_cont("\n"); + + print_testname("hardirq lock-read/read-lock"); + dotest(queued_read_lock_hardirq_ER_rE, SUCCESS, LOCKTYPE_RWLOCK); + pr_cont("\n"); + + print_testname("hardirq inversion"); + dotest(queued_read_lock_hardirq_inversion, FAILURE, LOCKTYPE_RWLOCK); + pr_cont("\n"); +} + void locking_selftest(void) { /* @@ -2318,6 +2420,8 @@ void locking_selftest(void) /* * queued_read_lock() specific test cases can be put here */ + if (IS_ENABLED(CONFIG_QUEUED_RWLOCKS)) + queued_read_lock_tests(); if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); From 96a16f45aed89cf024606a11679b0609d09552c7 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:38 +0800 Subject: [PATCH 41/67] lockdep/selftest: Introduce recursion3 Add a test case shows that USED_IN_*_READ and ENABLE_*_READ can cause deadlock too. 
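
Roughly, the pattern the new tests exercise is (a sketch of the comment in the
hunk below; A ends up ENABLED_*_READ, B ends up USED_IN_*_READ):

    P0 (irqs disabled)          P1 (irqs enabled)
    write_lock(&B);
                                read_lock(&A);
    write_lock(&A);
                                <interrupt>
                                read_lock(&B);

    /* P0 holds B and waits for A, which is read-held by P1; P1 is stuck in
     * the interrupt waiting for B and never releases A: deadlock.  If P0's
     * write_lock(&A) were read_lock(&A), there would be no deadlock. */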
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-20-boqun.feng@gmail.com --- lib/locking-selftest.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 17f8f6f37165..a899b3f0e2e5 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1249,6 +1249,60 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) +#undef E1 +#undef E2 +#undef E3 +/* + * read-lock / write-lock recursion that is unsafe. + * + * A is a ENABLED_*_READ lock + * B is a USED_IN_*_READ lock + * + * read_lock(A); + * write_lock(B); + * + * read_lock(B); + * write_lock(A); // if this one is read_lock(), no deadlock + */ + +#define E1() \ + \ + IRQ_DISABLE(); \ + WL(B); \ + LOCK(A); \ + UNLOCK(A); \ + WU(B); \ + IRQ_ENABLE(); + +#define E2() \ + \ + RL(A); \ + RU(A); \ + +#define E3() \ + \ + IRQ_ENTER(); \ + RL(B); \ + RU(B); \ + IRQ_EXIT(); + +/* + * Generate 24 testcases: + */ +#include "locking-selftest-hardirq.h" +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) + +#include "locking-selftest-softirq.h" +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -2413,6 +2467,7 @@ void locking_selftest(void) DO_TESTCASE_6x2x2RW("irq read-recursion", irq_read_recursion); DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); + DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3); ww_tests(); From cd290ec24633f51029dab0d25505fae7da0e1eda Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Aug 2020 14:31:26 +0200 Subject: [PATCH 42/67] kcsan: Use tracing-safe version of prandom In the core runtime, we must minimize any calls to external library functions to avoid any kind of recursion. This can happen even though instrumentation is disabled for called functions, but tracing is enabled. Most recently, prandom_u32() added a tracepoint, which can cause problems for KCSAN even if the rcuidle variant is used. For example: kcsan -> prandom_u32() -> trace_prandom_u32_rcuidle -> srcu_read_lock_notrace -> __srcu_read_lock -> kcsan ... While we could disable KCSAN in kcsan_setup_watchpoint(), this does not solve other unexpected behaviour we may get due recursing into functions that may not be tolerant to such recursion: __srcu_read_lock -> kcsan -> ... -> __srcu_read_lock Therefore, switch to using prandom_u32_state(), which is uninstrumented, and does not have a tracepoint. Link: https://lkml.kernel.org/r/20200821063043.1949509-1-elver@google.com Link: https://lkml.kernel.org/r/20200820172046.GA177701@elver.google.com Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/core.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 8a1ff605ff2d..3994a217bde7 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -100,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); +/* For kcsan_prandom_u32_max(). */ +static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); + static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, bool expect_write, @@ -271,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * return true; } +/* + * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max() + * for more details. + * + * The open-coded version here is using only safe primitives for all contexts + * where we can have KCSAN instrumentation. In particular, we cannot use + * prandom_u32() directly, as its tracepoint could cause recursion. + */ +static u32 kcsan_prandom_u32_max(u32 ep_ro) +{ + struct rnd_state *state = &get_cpu_var(kcsan_rand_state); + const u32 res = prandom_u32_state(state); + + put_cpu_var(kcsan_rand_state); + return (u32)(((u64) res * ep_ro) >> 32); +} + static inline void reset_kcsan_skip(void) { long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(kcsan_skip_watch) : + kcsan_prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -285,16 +305,18 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(int type) +/* Introduce delay depending on context and configuration. */ +static void delay_access(int type) { unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; /* For certain access types, skew the random delay to be longer. */ unsigned int skew_delay_order = (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; - return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay >> skew_delay_order) : - 0); + delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + kcsan_prandom_u32_max(delay >> skew_delay_order) : + 0; + udelay(delay); } void kcsan_save_irqtrace(struct task_struct *task) @@ -476,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay(type)); + delay_access(type); /* * Re-read value, and check if it is as expected; if not, we infer a @@ -620,6 +642,7 @@ void __init kcsan_init(void) BUG_ON(!in_task()); kcsan_debugfs_init(); + prandom_seed_full_state(&kcsan_rand_state); /* * We are in the init task, and no other tasks should be running; From 1e44e6e82e7b4d2bae70a8a0b68f7d4f213b0e5f Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 6 Jul 2020 21:03:24 +0200 Subject: [PATCH 43/67] Replace HTTP links with HTTPS ones: LKMM Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Paul E. 
McKenney --- tools/memory-model/Documentation/references.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/Documentation/references.txt b/tools/memory-model/Documentation/references.txt index ecbbaa5396d4..c5fdfd19df24 100644 --- a/tools/memory-model/Documentation/references.txt +++ b/tools/memory-model/Documentation/references.txt @@ -120,7 +120,7 @@ o Jade Alglave, Luc Maranget, and Michael Tautschnig. 2014. "Herding o Jade Alglave, Patrick Cousot, and Luc Maranget. 2016. "Syntax and semantics of the weak consistency model specification language - cat". CoRR abs/1608.07531 (2016). http://arxiv.org/abs/1608.07531 + cat". CoRR abs/1608.07531 (2016). https://arxiv.org/abs/1608.07531 Memory-model comparisons From cc9628b45c9fa9b165a50dbb262928bc529bf35d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Jul 2020 16:08:23 -0700 Subject: [PATCH 44/67] tools/memory-model: Update recipes.txt prime_numbers.c path The expand_to_next_prime() and next_prime_number() functions have moved from lib/prime_numbers.c to lib/math/prime_numbers.c, so this commit updates recipes.txt to reflect this change. Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/recipes.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/Documentation/recipes.txt b/tools/memory-model/Documentation/recipes.txt index 63c4adfed884..03f58b11c252 100644 --- a/tools/memory-model/Documentation/recipes.txt +++ b/tools/memory-model/Documentation/recipes.txt @@ -1,7 +1,7 @@ This document provides "recipes", that is, litmus tests for commonly occurring situations, as well as a few that illustrate subtly broken but attractive nuisances. Many of these recipes include example code from -v4.13 of the Linux kernel. +v5.7 of the Linux kernel. The first section covers simple special cases, the second section takes off the training wheels to cover more involved examples, @@ -278,7 +278,7 @@ is present if the value loaded determines the address of a later access first place (control dependency). Note that the term "data dependency" is sometimes casually used to cover both address and data dependencies. -In lib/prime_numbers.c, the expand_to_next_prime() function invokes +In lib/math/prime_numbers.c, the expand_to_next_prime() function invokes rcu_assign_pointer(), and the next_prime_number() function invokes rcu_dereference(). This combination mediates access to a bit vector that is expanded as additional primes are needed. From 984f272be9d7b2dd8b17e35d437e5da500b502ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Aug 2020 17:19:54 -0700 Subject: [PATCH 45/67] tools/memory-model: Improve litmus-test documentation The current LKMM documentation says very little about litmus tests, and worse yet directs people to the herd7 documentation for more information. Now, the herd7 documentation is quite voluminous and educational, but it is intended for people creating and modifying memory models, not those attempting to use them. This commit therefore updates README and creates a litmus-tests.txt file that gives an overview of litmus-test format and describes ways of modeling various special cases, illustrated with numerous examples. [ paulmck: Add Alan Stern feedback. ] [ paulmck: Apply Dave Chinner feedback. ] [ paulmck: Apply Andrii Nakryiko feedback. ] [ paulmck: Apply Johannes Weiner feedback. ] Link: https://lwn.net/Articles/827180/ Reported-by: Dave Chinner Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. 
McKenney --- .../Documentation/litmus-tests.txt | 1070 +++++++++++++++++ tools/memory-model/README | 155 +-- 2 files changed, 1108 insertions(+), 117 deletions(-) create mode 100644 tools/memory-model/Documentation/litmus-tests.txt diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt new file mode 100644 index 000000000000..289a38d626dd --- /dev/null +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -0,0 +1,1070 @@ +Linux-Kernel Memory Model Litmus Tests +====================================== + +This file describes the LKMM litmus-test format by example, describes +some tricks and traps, and finally outlines LKMM's limitations. Earlier +versions of this material appeared in a number of LWN articles, including: + +https://lwn.net/Articles/720550/ + A formal kernel memory-ordering model (part 2) +https://lwn.net/Articles/608550/ + Axiomatic validation of memory barriers and atomic instructions +https://lwn.net/Articles/470681/ + Validating Memory Barriers and Atomic Instructions + +This document presents information in decreasing order of applicability, +so that, where possible, the information that has proven more commonly +useful is shown near the beginning. + +For information on installing LKMM, including the underlying "herd7" +tool, please see tools/memory-model/README. + + +Copy-Pasta +========== + +As with other software, it is often better (if less macho) to adapt an +existing litmus test than it is to create one from scratch. A number +of litmus tests may be found in the kernel source tree: + + tools/memory-model/litmus-tests/ + Documentation/litmus-tests/ + +Several thousand more example litmus tests are available on github +and kernel.org: + + https://github.com/paulmckrcu/litmus + https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/herd + https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/litmus + +The -l and -L arguments to "git grep" can be quite helpful in identifying +existing litmus tests that are similar to the one you need. But even if +you start with an existing litmus test, it is still helpful to have a +good understanding of the litmus-test format. + + +Examples and Format +=================== + +This section describes the overall format of litmus tests, starting +with a small example of the message-passing pattern and moving on to +more complex examples that illustrate explicit initialization and LKMM's +minimalistic set of flow-control statements. + + +Message-Passing Example +----------------------- + +This section gives an overview of the format of a litmus test using an +example based on the common message-passing use case. This use case +appears often in the Linux kernel. For example, a flag (modeled by "y" +below) indicates that a buffer (modeled by "x" below) is now completely +filled in and ready for use. It would be very bad if the consumer saw the +flag set, but, due to memory misordering, saw old values in the buffer. 
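+
+In kernel code, the pattern being modeled might look roughly like the
+following sketch (the names fill_buffer(), use_buffer(), "buf", and
+"buf_ready" are made up purely for illustration and are not taken from
+any particular driver):
+
+	/* Producer: publish the buffer only after it is filled in. */
+	fill_buffer(&buf);
+	smp_store_release(&buf_ready, 1);
+
+	/* Consumer: this acquire load pairs with the release store above. */
+	if (smp_load_acquire(&buf_ready))
+		use_buffer(&buf);	/* Must not see stale buffer contents. */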
+
+This example asks whether smp_store_release() and smp_load_acquire()
+suffice to avoid this bad outcome:
+
+ 1 C MP+pooncerelease+poacquireonce
+ 2
+ 3 {}
+ 4
+ 5 P0(int *x, int *y)
+ 6 {
+ 7   WRITE_ONCE(*x, 1);
+ 8   smp_store_release(y, 1);
+ 9 }
+10
+11 P1(int *x, int *y)
+12 {
+13   int r0;
+14   int r1;
+15
+16   r0 = smp_load_acquire(y);
+17   r1 = READ_ONCE(*x);
+18 }
+19
+20 exists (1:r0=1 /\ 1:r1=0)
+
+Line 1 starts with "C", which identifies this file as being in the
+LKMM C-language format (which, as we will see, is a small fragment
+of the full C language).  The remainder of line 1 is the name of
+the test, which by convention is the filename with the ".litmus"
+suffix stripped.  In this case, the actual test may be found in
+tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus
+in the Linux-kernel source tree.
+
+Mechanically generated litmus tests will often have an optional
+double-quoted comment string on the second line.  Such strings are ignored
+when running the test.  Yes, you can add your own comments to litmus
+tests, but this is a bit involved due to the use of multiple parsers.
+For now, you can use C-language comments in the C code, and these comments
+may be in either the "/* */" or the "//" style.  A later section will
+cover the full litmus-test commenting story.
+
+Line 3 is the initialization section.  Because the default initialization
+to zero suffices for this test, the "{}" syntax is used, which means the
+initialization section is empty.  Litmus tests requiring non-default
+initialization must have non-empty initialization sections, as in the
+example that will be presented later in this document.
+
+Lines 5-9 show the first process and lines 11-18 the second process.  Each
+process corresponds to a Linux-kernel task (or kthread, workqueue, thread,
+and so on; LKMM discussions often use these terms interchangeably).
+The name of the first process is "P0" and that of the second "P1".
+You can name your processes anything you like as long as the names consist
+of a single "P" followed by a number, and as long as the numbers are
+consecutive starting with zero.  This can actually be quite helpful;
+for example, a .litmus file matching "^P1(" but not matching "^P2("
+must contain a two-process litmus test.
+
+The argument list for each function is a list of pointers to the global
+variables used by that function.  Unlike normal C-language function
+parameters, the names are significant.  The fact that both P0() and P1()
+have a formal parameter named "x" means that these two processes are
+working with the same global variable, also named "x".  So the
+"int *x, int *y" on P0() and P1() means that both processes are working
+with two shared global variables, "x" and "y".  Global variables are
+always passed to processes by reference, hence "P0(int *x, int *y)",
+but *never* "P0(int x, int y)".
+
+P0() has no local variables, but P1() has two of them named "r0" and "r1".
+These names may be freely chosen, but for historical reasons stemming from
+other litmus-test formats, it is conventional to use names consisting of
+"r" followed by a number as shown here.  A common bug in litmus tests
+is forgetting to add a global variable to a process's parameter list.
+This will sometimes result in an error message, but can also cause the
+intended global to instead be silently treated as an undeclared local
+variable.
+
+Each process's code is similar to Linux-kernel C, as can be seen on lines
+7-8 and 13-17.
This code may use many of the Linux kernel's atomic +operations, some of its exclusive-lock functions, and some of its RCU +and SRCU functions. An approximate list of the currently supported +functions may be found in the linux-kernel.def file. + +The P0() process does "WRITE_ONCE(*x, 1)" on line 7. Because "x" is a +pointer in P0()'s parameter list, this does an unordered store to global +variable "x". Line 8 does "smp_store_release(y, 1)", and because "y" +is also in P0()'s parameter list, this does a release store to global +variable "y". + +The P1() process declares two local variables on lines 13 and 14. +Line 16 does "r0 = smp_load_acquire(y)" which does an acquire load +from global variable "y" into local variable "r0". Line 17 does a +"r1 = READ_ONCE(*x)", which does an unordered load from "*x" into local +variable "r1". Both "x" and "y" are in P1()'s parameter list, so both +reference the same global variables that are used by P0(). + +Line 20 is the "exists" assertion expression to evaluate the final state. +This final state is evaluated after the dust has settled: both processes +have completed and all of their memory references and memory barriers +have propagated to all parts of the system. The references to the local +variables "r0" and "r1" in line 24 must be prefixed with "1:" to specify +which process they are local to. + +Note that the assertion expression is written in the litmus-test +language rather than in C. For example, single "=" is an equality +operator rather than an assignment. The "/\" character combination means +"and". Similarly, "\/" stands for "or". Both of these are ASCII-art +representations of the corresponding mathematical symbols. Finally, +"~" stands for "logical not", which is "!" in C, and not to be confused +with the C-language "~" operator which instead stands for "bitwise not". +Parentheses may be used to override precedence. + +The "exists" assertion on line 20 is satisfied if the consumer sees the +flag ("y") set but the buffer ("x") as not yet filled in, that is, if P1() +loaded a value from "x" that was equal to 1 but loaded a value from "y" +that was still equal to zero. + +This example can be checked by running the following command, which +absolutely must be run from the tools/memory-model directory and from +this directory only: + +herd7 -conf linux-kernel.cfg litmus-tests/MP+pooncerelease+poacquireonce.litmus + +The output is the result of something similar to a full state-space +search, and is as follows: + + 1 Test MP+pooncerelease+poacquireonce Allowed + 2 States 3 + 3 1:r0=0; 1:r1=0; + 4 1:r0=0; 1:r1=1; + 5 1:r0=1; 1:r1=1; + 6 No + 7 Witnesses + 8 Positive: 0 Negative: 3 + 9 Condition exists (1:r0=1 /\ 1:r1=0) +10 Observation MP+pooncerelease+poacquireonce Never 0 3 +11 Time MP+pooncerelease+poacquireonce 0.00 +12 Hash=579aaa14d8c35a39429b02e698241d09 + +The most pertinent line is line 10, which contains "Never 0 3", which +indicates that the bad result flagged by the "exists" clause never +happens. This line might instead say "Sometimes" to indicate that the +bad result happened in some but not all executions, or it might say +"Always" to indicate that the bad result happened in all executions. +(The herd7 tool doesn't judge, so it is only an LKMM convention that the +"exists" clause indicates a bad result. To see this, invert the "exists" +clause's condition and run the test.) 
The numbers ("0 3") at the end +of this line indicate the number of end states satisfying the "exists" +clause (0) and the number not not satisfying that clause (3). + +Another important part of this output is shown in lines 2-5, repeated here: + + 2 States 3 + 3 1:r0=0; 1:r1=0; + 4 1:r0=0; 1:r1=1; + 5 1:r0=1; 1:r1=1; + +Line 2 gives the total number of end states, and each of lines 3-5 list +one of these states, with the first ("1:r0=0; 1:r1=0;") indicating that +both of P1()'s loads returned the value "0". As expected, given the +"Never" on line 10, the state flagged by the "exists" clause is not +listed. This full list of states can be helpful when debugging a new +litmus test. + +The rest of the output is not normally needed, either due to irrelevance +or due to being redundant with the lines discussed above. However, the +following paragraph lists them for the benefit of readers possessed of +an insatiable curiosity. Other readers should feel free to skip ahead. + +Line 1 echos the test name, along with the "Test" and "Allowed". Line 6's +"No" says that the "exists" clause was not satisfied by any execution, +and as such it has the same meaning as line 10's "Never". Line 7 is a +lead-in to line 8's "Positive: 0 Negative: 3", which lists the number +of end states satisfying and not satisfying the "exists" clause, just +like the two numbers at the end of line 10. Line 9 repeats the "exists" +clause so that you don't have to look it up in the litmus-test file. +The number at the end of line 11 (which begins with "Time") gives the +time in seconds required to analyze the litmus test. Small tests such +as this one complete in a few milliseconds, so "0.00" is quite common. +Line 12 gives a hash of the contents for the litmus-test file, and is used +by tooling that manages litmus tests and their output. This tooling is +used by people modifying LKMM itself, and among other things lets such +people know which of the several thousand relevant litmus tests were +affected by a given change to LKMM. + + +Initialization +-------------- + +The previous example relied on the default zero initialization for +"x" and "y", but a similar litmus test could instead initialize them +to some other value: + + 1 C MP+pooncerelease+poacquireonce + 2 + 3 { + 4 x=42; + 5 y=42; + 6 } + 7 + 8 P0(int *x, int *y) + 9 { +10 WRITE_ONCE(*x, 1); +11 smp_store_release(y, 1); +12 } +13 +14 P1(int *x, int *y) +15 { +16 int r0; +17 int r1; +18 +19 r0 = smp_load_acquire(y); +20 r1 = READ_ONCE(*x); +21 } +22 +23 exists (1:r0=1 /\ 1:r1=42) + +Lines 3-6 now initialize both "x" and "y" to the value 42. This also +means that the "exists" clause on line 23 must change "1:r1=0" to +"1:r1=42". + +Running the test gives the same overall result as before, but with the +value 42 appearing in place of the value zero: + + 1 Test MP+pooncerelease+poacquireonce Allowed + 2 States 3 + 3 1:r0=1; 1:r1=1; + 4 1:r0=42; 1:r1=1; + 5 1:r0=42; 1:r1=42; + 6 No + 7 Witnesses + 8 Positive: 0 Negative: 3 + 9 Condition exists (1:r0=1 /\ 1:r1=42) +10 Observation MP+pooncerelease+poacquireonce Never 0 3 +11 Time MP+pooncerelease+poacquireonce 0.02 +12 Hash=ab9a9b7940a75a792266be279a980156 + +It is tempting to avoid the open-coded repetitions of the value "42" +by defining another global variable "initval=42" and replacing all +occurrences of "42" with "initval". This will not, repeat *not*, +initialize "x" and "y" to 42, but instead to the address of "initval" +(try it!). 
See the section below on linked lists to learn more about +why this approach to initialization can be useful. + + +Control Structures +------------------ + +LKMM supports the C-language "if" statement, which allows modeling of +conditional branches. In LKMM, conditional branches can affect ordering, +but only if you are *very* careful (compilers are surprisingly able +to optimize away conditional branches). The following example shows +the "load buffering" (LB) use case that is used in the Linux kernel to +synchronize between ring-buffer producers and consumers. In the example +below, P0() is one side checking to see if an operation may proceed and +P1() is the other side completing its update. + + 1 C LB+fencembonceonce+ctrlonceonce + 2 + 3 {} + 4 + 5 P0(int *x, int *y) + 6 { + 7 int r0; + 8 + 9 r0 = READ_ONCE(*x); +10 if (r0) +11 WRITE_ONCE(*y, 1); +12 } +13 +14 P1(int *x, int *y) +15 { +16 int r0; +17 +18 r0 = READ_ONCE(*y); +19 smp_mb(); +20 WRITE_ONCE(*x, 1); +21 } +22 +23 exists (0:r0=1 /\ 1:r0=1) + +P1()'s "if" statement on line 10 works as expected, so that line 11 is +executed only if line 9 loads a non-zero value from "x". Because P1()'s +write of "1" to "x" happens only after P1()'s read from "y", one would +hope that the "exists" clause cannot be satisfied. LKMM agrees: + + 1 Test LB+fencembonceonce+ctrlonceonce Allowed + 2 States 2 + 3 0:r0=0; 1:r0=0; + 4 0:r0=1; 1:r0=0; + 5 No + 6 Witnesses + 7 Positive: 0 Negative: 2 + 8 Condition exists (0:r0=1 /\ 1:r0=1) + 9 Observation LB+fencembonceonce+ctrlonceonce Never 0 2 +10 Time LB+fencembonceonce+ctrlonceonce 0.00 +11 Hash=e5260556f6de495fd39b556d1b831c3b + +However, there is no "while" statement due to the fact that full +state-space search has some difficulty with iteration. However, there +are tricks that may be used to handle some special cases, which are +discussed below. In addition, loop-unrolling tricks may be applied, +albeit sparingly. + + +Tricks and Traps +================ + +This section covers extracting debug output from herd7, emulating +spin loops, handling trivial linked lists, adding comments to litmus tests, +emulating call_rcu(), and finally tricks to improve herd7 performance +in order to better handle large litmus tests. + + +Debug Output +------------ + +By default, the herd7 state output includes all variables mentioned +in the "exists" clause. But sometimes debugging efforts are greatly +aided by the values of other variables. Consider this litmus test +(tools/memory-order/litmus-tests/SB+rfionceonce-poonceonces.litmus but +slightly modified), which probes an obscure corner of hardware memory +ordering: + + 1 C SB+rfionceonce-poonceonces + 2 + 3 {} + 4 + 5 P0(int *x, int *y) + 6 { + 7 int r1; + 8 int r2; + 9 +10 WRITE_ONCE(*x, 1); +11 r1 = READ_ONCE(*x); +12 r2 = READ_ONCE(*y); +13 } +14 +15 P1(int *x, int *y) +16 { +17 int r3; +18 int r4; +19 +20 WRITE_ONCE(*y, 1); +21 r3 = READ_ONCE(*y); +22 r4 = READ_ONCE(*x); +23 } +24 +25 exists (0:r2=0 /\ 1:r4=0) + +The herd7 output is as follows: + + 1 Test SB+rfionceonce-poonceonces Allowed + 2 States 4 + 3 0:r2=0; 1:r4=0; + 4 0:r2=0; 1:r4=1; + 5 0:r2=1; 1:r4=0; + 6 0:r2=1; 1:r4=1; + 7 Ok + 8 Witnesses + 9 Positive: 1 Negative: 3 +10 Condition exists (0:r2=0 /\ 1:r4=0) +11 Observation SB+rfionceonce-poonceonces Sometimes 1 3 +12 Time SB+rfionceonce-poonceonces 0.01 +13 Hash=c7f30fe0faebb7d565405d55b7318ada + +(This output indicates that CPUs are permitted to "snoop their own +store buffers", which all of Linux's CPU families other than s390 will +happily do. 
Such snooping results in disagreement among CPUs on the +order of stores from different CPUs, which is rarely an issue.) + +But the herd7 output shows only the two variables mentioned in the +"exists" clause. Someone modifying this test might wish to know the +values of "x", "y", "0:r1", and "0:r3" as well. The "locations" +statement on line 25 shows how to cause herd7 to display additional +variables: + + 1 C SB+rfionceonce-poonceonces + 2 + 3 {} + 4 + 5 P0(int *x, int *y) + 6 { + 7 int r1; + 8 int r2; + 9 +10 WRITE_ONCE(*x, 1); +11 r1 = READ_ONCE(*x); +12 r2 = READ_ONCE(*y); +13 } +14 +15 P1(int *x, int *y) +16 { +17 int r3; +18 int r4; +19 +20 WRITE_ONCE(*y, 1); +21 r3 = READ_ONCE(*y); +22 r4 = READ_ONCE(*x); +23 } +24 +25 locations [0:r1; 1:r3; x; y] +26 exists (0:r2=0 /\ 1:r4=0) + +The herd7 output then displays the values of all the variables: + + 1 Test SB+rfionceonce-poonceonces Allowed + 2 States 4 + 3 0:r1=1; 0:r2=0; 1:r3=1; 1:r4=0; x=1; y=1; + 4 0:r1=1; 0:r2=0; 1:r3=1; 1:r4=1; x=1; y=1; + 5 0:r1=1; 0:r2=1; 1:r3=1; 1:r4=0; x=1; y=1; + 6 0:r1=1; 0:r2=1; 1:r3=1; 1:r4=1; x=1; y=1; + 7 Ok + 8 Witnesses + 9 Positive: 1 Negative: 3 +10 Condition exists (0:r2=0 /\ 1:r4=0) +11 Observation SB+rfionceonce-poonceonces Sometimes 1 3 +12 Time SB+rfionceonce-poonceonces 0.01 +13 Hash=40de8418c4b395388f6501cafd1ed38d + +What if you would like to know the value of a particular global variable +at some particular point in a given process's execution? One approach +is to use a READ_ONCE() to load that global variable into a new local +variable, then add that local variable to the "locations" clause. +But be careful: In some litmus tests, adding a READ_ONCE() will change +the outcome! For one example, please see the C-READ_ONCE.litmus and +C-READ_ONCE-omitted.litmus tests located here: + + https://github.com/paulmckrcu/litmus/blob/master/manual/kernel/ + + +Spin Loops +---------- + +The analysis carried out by herd7 explores full state space, which is +at best of exponential time complexity. Adding processes and increasing +the amount of code in a give process can greatly increase execution time. +Potentially infinite loops, such as those used to wait for locks to +become available, are clearly problematic. + +Fortunately, it is possible to avoid state-space explosion by specially +modeling such loops. For example, the following litmus tests emulates +locking using xchg_acquire(), but instead of enclosing xchg_acquire() +in a spin loop, it instead excludes executions that fail to acquire the +lock using a herd7 "filter" clause. Note that for exclusive locking, you +are better off using the spin_lock() and spin_unlock() that LKMM directly +models, if for no other reason that these are much faster. However, the +techniques illustrated in this section can be used for other purposes, +such as emulating reader-writer locking, which LKMM does not yet model. 
+ + 1 C C-SB+l-o-o-u+l-o-o-u-X + 2 + 3 { + 4 } + 5 + 6 P0(int *sl, int *x0, int *x1) + 7 { + 8 int r2; + 9 int r1; +10 +11 r2 = xchg_acquire(sl, 1); +12 WRITE_ONCE(*x0, 1); +13 r1 = READ_ONCE(*x1); +14 smp_store_release(sl, 0); +15 } +16 +17 P1(int *sl, int *x0, int *x1) +18 { +19 int r2; +20 int r1; +21 +22 r2 = xchg_acquire(sl, 1); +23 WRITE_ONCE(*x1, 1); +24 r1 = READ_ONCE(*x0); +25 smp_store_release(sl, 0); +26 } +27 +28 filter (0:r2=0 /\ 1:r2=0) +29 exists (0:r1=0 /\ 1:r1=0) + +This litmus test may be found here: + +https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/herd/C-SB+l-o-o-u+l-o-o-u-X.litmus + +This test uses two global variables, "x1" and "x2", and also emulates a +single global spinlock named "sl". This spinlock is held by whichever +process changes the value of "sl" from "0" to "1", and is released when +that process sets "sl" back to "0". P0()'s lock acquisition is emulated +on line 11 using xchg_acquire(), which unconditionally stores the value +"1" to "sl" and stores either "0" or "1" to "r2", depending on whether +the lock acquisition was successful or unsuccessful (due to "sl" already +having the value "1"), respectively. P1() operates in a similar manner. + +Rather unconventionally, execution appears to proceed to the critical +section on lines 12 and 13 in either case. Line 14 then uses an +smp_store_release() to store zero to "sl", thus emulating lock release. + +The case where xchg_acquire() fails to acquire the lock is handled by +the "filter" clause on line 28, which tells herd7 to keep only those +executions in which both "0:r2" and "1:r2" are zero, that is to pay +attention only to those executions in which both locks are actually +acquired. Thus, the bogus executions that would execute the critical +sections are discarded and any effects that they might have had are +ignored. Note well that the "filter" clause keeps those executions +for which its expression is satisfied, that is, for which the expression +evaluates to true. In other words, the "filter" clause says what to +keep, not what to discard. + +The result of running this test is as follows: + + 1 Test C-SB+l-o-o-u+l-o-o-u-X Allowed + 2 States 2 + 3 0:r1=0; 1:r1=1; + 4 0:r1=1; 1:r1=0; + 5 No + 6 Witnesses + 7 Positive: 0 Negative: 2 + 8 Condition exists (0:r1=0 /\ 1:r1=0) + 9 Observation C-SB+l-o-o-u+l-o-o-u-X Never 0 2 +10 Time C-SB+l-o-o-u+l-o-o-u-X 0.03 + +The "Never" on line 9 indicates that this use of xchg_acquire() and +smp_store_release() really does correctly emulate locking. + +Why doesn't the litmus test take the simpler approach of using a spin loop +to handle failed spinlock acquisitions, like the kernel does? The key +insight behind this litmus test is that spin loops have no effect on the +possible "exists"-clause outcomes of program execution in the absence +of deadlock. In other words, given a high-quality lock-acquisition +primitive in a deadlock-free program running on high-quality hardware, +each lock acquisition will eventually succeed. Because herd7 already +explores the full state space, the length of time required to actually +acquire the lock does not matter. After all, herd7 already models all +possible durations of the xchg_acquire() statements. + +Why not just add the "filter" clause to the "exists" clause, thus +avoiding the "filter" clause entirely? This does work, but is slower. 
+The reason that the "filter" clause is faster is that (in the common case) +herd7 knows to abandon an execution as soon as the "filter" expression +fails to be satisfied. In contrast, the "exists" clause is evaluated +only at the end of time, thus requiring herd7 to waste time on bogus +executions in which both critical sections proceed concurrently. In +addition, some LKMM users like the separation of concerns provided by +using the both the "filter" and "exists" clauses. + +Readers lacking a pathological interest in odd corner cases should feel +free to skip the remainder of this section. + +But what if the litmus test were to temporarily set "0:r2" to a non-zero +value? Wouldn't that cause herd7 to abandon the execution prematurely +due to an early mismatch of the "filter" clause? + +Why not just try it? Line 4 of the following modified litmus test +introduces a new global variable "x2" that is initialized to "1". Line 23 +of P1() reads that variable into "1:r2" to force an early mismatch with +the "filter" clause. Line 24 does a known-true "if" condition to avoid +and static analysis that herd7 might do. Finally the "exists" clause +on line 32 is updated to a condition that is alway satisfied at the end +of the test. + + 1 C C-SB+l-o-o-u+l-o-o-u-X + 2 + 3 { + 4 x2=1; + 5 } + 6 + 7 P0(int *sl, int *x0, int *x1) + 8 { + 9 int r2; +10 int r1; +11 +12 r2 = xchg_acquire(sl, 1); +13 WRITE_ONCE(*x0, 1); +14 r1 = READ_ONCE(*x1); +15 smp_store_release(sl, 0); +16 } +17 +18 P1(int *sl, int *x0, int *x1, int *x2) +19 { +20 int r2; +21 int r1; +22 +23 r2 = READ_ONCE(*x2); +24 if (r2) +25 r2 = xchg_acquire(sl, 1); +26 WRITE_ONCE(*x1, 1); +27 r1 = READ_ONCE(*x0); +28 smp_store_release(sl, 0); +29 } +30 +31 filter (0:r2=0 /\ 1:r2=0) +32 exists (x1=1) + +If the "filter" clause were to check each variable at each point in the +execution, running this litmus test would display no executions because +all executions would be filtered out at line 23. However, the output +is instead as follows: + + 1 Test C-SB+l-o-o-u+l-o-o-u-X Allowed + 2 States 1 + 3 x1=1; + 4 Ok + 5 Witnesses + 6 Positive: 2 Negative: 0 + 7 Condition exists (x1=1) + 8 Observation C-SB+l-o-o-u+l-o-o-u-X Always 2 0 + 9 Time C-SB+l-o-o-u+l-o-o-u-X 0.04 +10 Hash=080bc508da7f291e122c6de76c0088e3 + +Line 3 shows that there is one execution that did not get filtered out, +so the "filter" clause is evaluated only on the last assignment to +the variables that it checks. In this case, the "filter" clause is a +disjunction, so it might be evaluated twice, once at the final (and only) +assignment to "0:r2" and once at the final assignment to "1:r2". + + +Linked Lists +------------ + +LKMM can handle linked lists, but only linked lists in which each node +contains nothing except a pointer to the next node in the list. This is +of course quite restrictive, but there is nevertheless quite a bit that +can be done within these confines, as can be seen in the litmus test +at tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus: + + 1 C MP+onceassign+derefonce + 2 + 3 { + 4 y=z; + 5 z=0; + 6 } + 7 + 8 P0(int *x, int **y) + 9 { +10 WRITE_ONCE(*x, 1); +11 rcu_assign_pointer(*y, x); +12 } +13 +14 P1(int *x, int **y) +15 { +16 int *r0; +17 int r1; +18 +19 rcu_read_lock(); +20 r0 = rcu_dereference(*y); +21 r1 = READ_ONCE(*r0); +22 rcu_read_unlock(); +23 } +24 +25 exists (1:r0=x /\ 1:r1=0) + +Line 4's "y=z" may seem odd, given that "z" has not yet been initialized. 
+But "y=z" does not set the value of "y" to that of "z", but instead +sets the value of "y" to the *address* of "z". Lines 4 and 5 therefore +create a simple linked list, with "y" pointing to "z" and "z" having a +NULL pointer. A much longer linked list could be created if desired, +and circular singly linked lists can also be created and manipulated. + +The "exists" clause works the same way, with the "1:r0=x" comparing P1()'s +"r0" not to the value of "x", but again to its address. This term of the +"exists" clause therefore tests whether line 20's load from "y" saw the +value stored by line 11, which is in fact what is required in this case. + +P0()'s line 10 initializes "x" to the value 1 then line 11 links to "x" +from "y", replacing "z". + +P1()'s line 20 loads a pointer from "y", and line 21 dereferences that +pointer. The RCU read-side critical section spanning lines 19-22 is +just for show in this example. + +Running this test results in the following: + + 1 Test MP+onceassign+derefonce Allowed + 2 States 2 + 3 1:r0=x; 1:r1=1; + 4 1:r0=z; 1:r1=0; + 5 No + 6 Witnesses + 7 Positive: 0 Negative: 2 + 8 Condition exists (1:r0=x /\ 1:r1=0) + 9 Observation MP+onceassign+derefonce Never 0 2 +10 Time MP+onceassign+derefonce 0.00 +11 Hash=49ef7a741563570102448a256a0c8568 + +The only possible outcomes feature P1() loading a pointer to "z" +(which contains zero) on the one hand and P1() loading a pointer to "x" +(which contains the value one) on the other. This should be reassuring +because it says that RCU readers cannot see the old preinitialization +values when accessing a newly inserted list node. This undesirable +scenario is flagged by the "exists" clause, and would occur if P1() +loaded a pointer to "x", but obtained the pre-initialization value of +zero after dereferencing that pointer. + + +Comments +-------- + +Different portions of a litmus test are processed by different parsers, +which has the charming effect of requiring different comment syntax in +different portions of the litmus test. The C-syntax portions use +C-language comments (either "/* */" or "//"), while the other portions +use Ocaml comments "(* *)". + +The following litmus test illustrates the comment style corresponding +to each syntactic unit of the test: + + 1 C MP+onceassign+derefonce (* A *) + 2 + 3 (* B *) + 4 + 5 { + 6 y=z; (* C *) + 7 z=0; + 8 } // D + 9 +10 // E +11 +12 P0(int *x, int **y) // F +13 { +14 WRITE_ONCE(*x, 1); // G +15 rcu_assign_pointer(*y, x); +16 } +17 +18 // H +19 +20 P1(int *x, int **y) +21 { +22 int *r0; +23 int r1; +24 +25 rcu_read_lock(); +26 r0 = rcu_dereference(*y); +27 r1 = READ_ONCE(*r0); +28 rcu_read_unlock(); +29 } +30 +31 // I +32 +33 exists (* J *) (1:r0=x /\ (* K *) 1:r1=0) (* L *) + +In short, use C-language comments in the C code and Ocaml comments in +the rest of the litmus test. + +On the other hand, if you prefer C-style comments everywhere, the +C preprocessor is your friend. 
+ + +Asynchronous RCU Grace Periods +------------------------------ + +The following litmus test is derived from the example show in +Documentation/litmus-tests/rcu/RCU+sync+free.litmus, but converted to +emulate call_rcu(): + + 1 C RCU+sync+free + 2 + 3 { + 4 int x = 1; + 5 int *y = &x; + 6 int z = 1; + 7 } + 8 + 9 P0(int *x, int *z, int **y) +10 { +11 int *r0; +12 int r1; +13 +14 rcu_read_lock(); +15 r0 = rcu_dereference(*y); +16 r1 = READ_ONCE(*r0); +17 rcu_read_unlock(); +18 } +19 +20 P1(int *z, int **y, int *c) +21 { +22 rcu_assign_pointer(*y, z); +23 smp_store_release(*c, 1); // Emulate call_rcu(). +24 } +25 +26 P2(int *x, int *z, int **y, int *c) +27 { +28 int r0; +29 +30 r0 = smp_load_acquire(*c); // Note call_rcu() request. +31 synchronize_rcu(); // Wait one grace period. +32 WRITE_ONCE(*x, 0); // Emulate the RCU callback. +33 } +34 +35 filter (2:r0=1) (* Reject too-early starts. *) +36 exists (0:r0=x /\ 0:r1=0) + +Lines 4-6 initialize a linked list headed by "y" that initially contains +"x". In addition, "z" is pre-initialized to prepare for P1(), which +will replace "x" with "z" in this list. + +P0() on lines 9-18 enters an RCU read-side critical section, loads the +list header "y" and dereferences it, leaving the node in "0:r0" and +the node's value in "0:r1". + +P1() on lines 20-24 updates the list header to instead reference "z", +then emulates call_rcu() by doing a release store into "c". + +P2() on lines 27-33 emulates the behind-the-scenes effect of doing a +call_rcu(). Line 30 first does an acquire load from "c", then line 31 +waits for an RCU grace period to elapse, and finally line 32 emulates +the RCU callback, which in turn emulates a call to kfree(). + +Of course, it is possible for P2() to start too soon, so that the +value of "2:r0" is zero rather than the required value of "1". +The "filter" clause on line 35 handles this possibility, rejecting +all executions in which "2:r0" is not equal to the value "1". + + +Performance +----------- + +LKMM's exploration of the full state-space can be extremely helpful, +but it does not come for free. The price is exponential computational +complexity in terms of the number of processes, the average number +of statements in each process, and the total number of stores in the +litmus test. + +So it is best to start small and then work up. Where possible, break +your code down into small pieces each representing a core concurrency +requirement. + +That said, herd7 is quite fast. On an unprepossessing x86 laptop, it +was able to analyze the following 10-process RCU litmus test in about +six seconds. + +https://github.com/paulmckrcu/litmus/blob/master/auto/C-RW-R+RW-R+RW-G+RW-G+RW-G+RW-G+RW-R+RW-R+RW-R+RW-R.litmus + +One way to make herd7 run faster is to use the "-speedcheck true" option. +This option prevents herd7 from generating all possible end states, +instead causing it to focus solely on whether or not the "exists" +clause can be satisfied. With this option, herd7 evaluates the above +litmus test in about 300 milliseconds, for more than an order of magnitude +improvement in performance. + +Larger 16-process litmus tests that would normally consume 15 minutes +of time complete in about 40 seconds with this option. To be fair, +you do get an extra 65,535 states when you leave off the "-speedcheck +true" option. 
+ +https://github.com/paulmckrcu/litmus/blob/master/auto/C-RW-R+RW-R+RW-G+RW-G+RW-G+RW-G+RW-R+RW-R+RW-R+RW-R+RW-G+RW-G+RW-G+RW-G+RW-R+RW-R.litmus + +Nevertheless, litmus-test analysis really is of exponential complexity, +whether with or without "-speedcheck true". Increasing by just three +processes to a 19-process litmus test requires 2 hours and 40 minutes +without, and about 8 minutes with "-speedcheck true". Each of these +results represent roughly an order of magnitude slowdown compared to the +16-process litmus test. Again, to be fair, the multi-hour run explores +no fewer than 524,287 additional states compared to the shorter one. + +https://github.com/paulmckrcu/litmus/blob/master/auto/C-RW-R+RW-R+RW-G+RW-G+RW-G+RW-G+RW-R+RW-R+RW-R+RW-R+RW-R+RW-R+RW-G+RW-G+RW-G+RW-G+RW-R+RW-R+RW-R.litmus + +If you don't like command-line arguments, you can obtain a similar speedup +by adding a "filter" clause with exactly the same expression as your +"exists" clause. + +However, please note that seeing the full set of states can be extremely +helpful when developing and debugging litmus tests. + + +LIMITATIONS +=========== + +Limitations of the Linux-kernel memory model (LKMM) include: + +1. Compiler optimizations are not accurately modeled. Of course, + the use of READ_ONCE() and WRITE_ONCE() limits the compiler's + ability to optimize, but under some circumstances it is possible + for the compiler to undermine the memory model. For more + information, see Documentation/explanation.txt (in particular, + the "THE PROGRAM ORDER RELATION: po AND po-loc" and "A WARNING" + sections). + + Note that this limitation in turn limits LKMM's ability to + accurately model address, control, and data dependencies. + For example, if the compiler can deduce the value of some variable + carrying a dependency, then the compiler can break that dependency + by substituting a constant of that value. + +2. Multiple access sizes for a single variable are not supported, + and neither are misaligned or partially overlapping accesses. + +3. Exceptions and interrupts are not modeled. In some cases, + this limitation can be overcome by modeling the interrupt or + exception with an additional process. + +4. I/O such as MMIO or DMA is not supported. + +5. Self-modifying code (such as that found in the kernel's + alternatives mechanism, function tracer, Berkeley Packet Filter + JIT compiler, and module loader) is not supported. + +6. Complete modeling of all variants of atomic read-modify-write + operations, locking primitives, and RCU is not provided. + For example, call_rcu() and rcu_barrier() are not supported. + However, a substantial amount of support is provided for these + operations, as shown in the linux-kernel.def file. + + Here are specific limitations: + + a. When rcu_assign_pointer() is passed NULL, the Linux + kernel provides no ordering, but LKMM models this + case as a store release. + + b. The "unless" RMW operations are not currently modeled: + atomic_long_add_unless(), atomic_inc_unless_negative(), + and atomic_dec_unless_positive(). These can be emulated + in litmus tests, for example, by using atomic_cmpxchg(). + + One exception of this limitation is atomic_add_unless(), + which is provided directly by herd7 (so no corresponding + definition in linux-kernel.def). atomic_add_unless() is + modeled by herd7 therefore it can be used in litmus tests. + + c. The call_rcu() function is not modeled. 
As was shown above, + it can be emulated in litmus tests by adding another + process that invokes synchronize_rcu() and the body of the + callback function, with (for example) a release-acquire + from the site of the emulated call_rcu() to the beginning + of the additional process. + + d. The rcu_barrier() function is not modeled. It can be + emulated in litmus tests emulating call_rcu() via + (for example) a release-acquire from the end of each + additional call_rcu() process to the site of the + emulated rcu-barrier(). + + e. Although sleepable RCU (SRCU) is now modeled, there + are some subtle differences between its semantics and + those in the Linux kernel. For example, the kernel + might interpret the following sequence as two partially + overlapping SRCU read-side critical sections: + + 1 r1 = srcu_read_lock(&my_srcu); + 2 do_something_1(); + 3 r2 = srcu_read_lock(&my_srcu); + 4 do_something_2(); + 5 srcu_read_unlock(&my_srcu, r1); + 6 do_something_3(); + 7 srcu_read_unlock(&my_srcu, r2); + + In contrast, LKMM will interpret this as a nested pair of + SRCU read-side critical sections, with the outer critical + section spanning lines 1-7 and the inner critical section + spanning lines 3-5. + + This difference would be more of a concern had anyone + identified a reasonable use case for partially overlapping + SRCU read-side critical sections. For more information + on the trickiness of such overlapping, please see: + https://paulmck.livejournal.com/40593.html + + f. Reader-writer locking is not modeled. It can be + emulated in litmus tests using atomic read-modify-write + operations. + +The fragment of the C language supported by these litmus tests is quite +limited and in some ways non-standard: + +1. There is no automatic C-preprocessor pass. You can of course + run it manually, if you choose. + +2. There is no way to create functions other than the Pn() functions + that model the concurrent processes. + +3. The Pn() functions' formal parameters must be pointers to the + global shared variables. Nothing can be passed by value into + these functions. + +4. The only functions that can be invoked are those built directly + into herd7 or that are defined in the linux-kernel.def file. + +5. The "switch", "do", "for", "while", and "goto" C statements are + not supported. The "switch" statement can be emulated by the + "if" statement. The "do", "for", and "while" statements can + often be emulated by manually unrolling the loop, or perhaps by + enlisting the aid of the C preprocessor to minimize the resulting + code duplication. Some uses of "goto" can be emulated by "if", + and some others by unrolling. + +6. Although you can use a wide variety of types in litmus-test + variable declarations, and especially in global-variable + declarations, the "herd7" tool understands only int and + pointer types. There is no support for floating-point types, + enumerations, characters, strings, arrays, or structures. + +7. Parsing of variable declarations is very loose, with almost no + type checking. + +8. Initializers differ from their C-language counterparts. + For example, when an initializer contains the name of a shared + variable, that name denotes a pointer to that variable, not + the current value of that variable. For example, "int x = y" + is interpreted the way "int x = &y" would be in C. + +9. Dynamic memory allocation is not supported, although this can + be worked around in some cases by supplying multiple statically + allocated variables. 
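+
+For example, item 5 above rules out writing a two-pass loop such as
+"for (i = 0; i < 2; i++) WRITE_ONCE(*x, i);" directly in a process's
+body.  As an illustrative sketch (not taken from any litmus test in the
+kernel tree), manually unrolling that loop is straightforward:
+
+	WRITE_ONCE(*x, 0);	/* First loop iteration, unrolled by hand. */
+	WRITE_ONCE(*x, 1);	/* Second and final iteration. */
+
+As item 5 notes, the C preprocessor can generate such unrolled code when
+the iteration count is larger, which limits the resulting duplication.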
+ +Some of these limitations may be overcome in the future, but others are +more likely to be addressed by incorporating the Linux-kernel memory model +into other tools. + +Finally, please note that LKMM is subject to change as hardware, use cases, +and compilers evolve. diff --git a/tools/memory-model/README b/tools/memory-model/README index ecb7385376bf..d2e03c4f52a0 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -63,10 +63,32 @@ BASIC USAGE: HERD7 ================== The memory model is used, in conjunction with "herd7", to exhaustively -explore the state space of small litmus tests. +explore the state space of small litmus tests. Documentation describing +the format, features, capabilities and limitations of these litmus +tests is available in tools/memory-model/Documentation/litmus-tests.txt. -For example, to run SB+fencembonceonces.litmus against the memory model: +Example litmus tests may be found in the Linux-kernel source tree: + tools/memory-model/litmus-tests/ + Documentation/litmus-tests/ + +Several thousand more example litmus tests are available here: + + https://github.com/paulmckrcu/litmus + https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/herd + https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/litmus + +Documentation describing litmus tests and now to use them may be found +here: + + tools/memory-model/Documentation/litmus-tests.txt + +The remainder of this section uses the SB+fencembonceonces.litmus test +located in the tools/memory-model directory. + +To run SB+fencembonceonces.litmus against the memory model: + + $ cd $LINUX_SOURCE_TREE/tools/memory-model $ herd7 -conf linux-kernel.cfg litmus-tests/SB+fencembonceonces.litmus Here is the corresponding output: @@ -87,7 +109,11 @@ Here is the corresponding output: The "Positive: 0 Negative: 3" and the "Never 0 3" each indicate that this litmus test's "exists" clause can not be satisfied. -See "herd7 -help" or "herdtools7/doc/" for more information. +See "herd7 -help" or "herdtools7/doc/" for more information on running the +tool itself, but please be aware that this documentation is intended for +people who work on the memory model itself, that is, people making changes +to the tools/memory-model/linux-kernel.* files. It is not intended for +people focusing on writing, understanding, and running LKMM litmus tests. ===================== @@ -124,7 +150,11 @@ that during two million trials, the state specified in this litmus test's "exists" clause was not reached. And, as with "herd7", please see "klitmus7 -help" or "herdtools7/doc/" -for more information. +for more information. And again, please be aware that this documentation +is intended for people who work on the memory model itself, that is, +people making changes to the tools/memory-model/linux-kernel.* files. +It is not intended for people focusing on writing, understanding, and +running LKMM litmus tests. ==================== @@ -137,6 +167,10 @@ Documentation/cheatsheet.txt Documentation/explanation.txt Describes the memory model in detail. +Documentation/litmus-tests.txt + Describes the format, features, capabilities, and limitations + of the litmus tests that LKMM can evaluate. + Documentation/recipes.txt Lists common memory-ordering patterns. @@ -187,116 +221,3 @@ README This file. scripts Various scripts, see scripts/README. - - -=========== -LIMITATIONS -=========== - -The Linux-kernel memory model (LKMM) has the following limitations: - -1. 
Compiler optimizations are not accurately modeled. Of course, - the use of READ_ONCE() and WRITE_ONCE() limits the compiler's - ability to optimize, but under some circumstances it is possible - for the compiler to undermine the memory model. For more - information, see Documentation/explanation.txt (in particular, - the "THE PROGRAM ORDER RELATION: po AND po-loc" and "A WARNING" - sections). - - Note that this limitation in turn limits LKMM's ability to - accurately model address, control, and data dependencies. - For example, if the compiler can deduce the value of some variable - carrying a dependency, then the compiler can break that dependency - by substituting a constant of that value. - -2. Multiple access sizes for a single variable are not supported, - and neither are misaligned or partially overlapping accesses. - -3. Exceptions and interrupts are not modeled. In some cases, - this limitation can be overcome by modeling the interrupt or - exception with an additional process. - -4. I/O such as MMIO or DMA is not supported. - -5. Self-modifying code (such as that found in the kernel's - alternatives mechanism, function tracer, Berkeley Packet Filter - JIT compiler, and module loader) is not supported. - -6. Complete modeling of all variants of atomic read-modify-write - operations, locking primitives, and RCU is not provided. - For example, call_rcu() and rcu_barrier() are not supported. - However, a substantial amount of support is provided for these - operations, as shown in the linux-kernel.def file. - - a. When rcu_assign_pointer() is passed NULL, the Linux - kernel provides no ordering, but LKMM models this - case as a store release. - - b. The "unless" RMW operations are not currently modeled: - atomic_long_add_unless(), atomic_inc_unless_negative(), - and atomic_dec_unless_positive(). These can be emulated - in litmus tests, for example, by using atomic_cmpxchg(). - - One exception of this limitation is atomic_add_unless(), - which is provided directly by herd7 (so no corresponding - definition in linux-kernel.def). atomic_add_unless() is - modeled by herd7 therefore it can be used in litmus tests. - - c. The call_rcu() function is not modeled. It can be - emulated in litmus tests by adding another process that - invokes synchronize_rcu() and the body of the callback - function, with (for example) a release-acquire from - the site of the emulated call_rcu() to the beginning - of the additional process. - - d. The rcu_barrier() function is not modeled. It can be - emulated in litmus tests emulating call_rcu() via - (for example) a release-acquire from the end of each - additional call_rcu() process to the site of the - emulated rcu-barrier(). - - e. Although sleepable RCU (SRCU) is now modeled, there - are some subtle differences between its semantics and - those in the Linux kernel. For example, the kernel - might interpret the following sequence as two partially - overlapping SRCU read-side critical sections: - - 1 r1 = srcu_read_lock(&my_srcu); - 2 do_something_1(); - 3 r2 = srcu_read_lock(&my_srcu); - 4 do_something_2(); - 5 srcu_read_unlock(&my_srcu, r1); - 6 do_something_3(); - 7 srcu_read_unlock(&my_srcu, r2); - - In contrast, LKMM will interpret this as a nested pair of - SRCU read-side critical sections, with the outer critical - section spanning lines 1-7 and the inner critical section - spanning lines 3-5. - - This difference would be more of a concern had anyone - identified a reasonable use case for partially overlapping - SRCU read-side critical sections. 
For more information, - please see: https://paulmck.livejournal.com/40593.html - - f. Reader-writer locking is not modeled. It can be - emulated in litmus tests using atomic read-modify-write - operations. - -The "herd7" tool has some additional limitations of its own, apart from -the memory model: - -1. Non-trivial data structures such as arrays or structures are - not supported. However, pointers are supported, allowing trivial - linked lists to be constructed. - -2. Dynamic memory allocation is not supported, although this can - be worked around in some cases by supplying multiple statically - allocated variables. - -Some of these limitations may be overcome in the future, but others are -more likely to be addressed by incorporating the Linux-kernel memory model -into other tools. - -Finally, please note that LKMM is subject to change as hardware, use cases, -and compilers evolve. From 0b8c06b75ea143f3c68aa419c36e82d9ab7454f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Aug 2020 10:58:55 -0700 Subject: [PATCH 46/67] tools/memory-model: Add a simple entry point document Current LKMM documentation assumes that the reader already understands concurrency in the Linux kernel, which won't necessarily always be the case. This commit supplies a simple.txt file that provides a starting point for someone who is new to concurrency in the Linux kernel. That said, this file might also useful as a reminder to experienced developers of simpler approaches to dealing with concurrency. Link: Link: https://lwn.net/Articles/827180/ [ paulmck: Apply feedback from Joel Fernandes. ] Co-developed-by: Dave Chinner Signed-off-by: Dave Chinner Co-developed-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- .../Documentation/litmus-tests.txt | 8 +- tools/memory-model/Documentation/simple.txt | 271 ++++++++++++++++++ tools/memory-model/README | 5 + 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 tools/memory-model/Documentation/simple.txt diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 289a38d626dd..2f840dcd15cf 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -726,8 +726,12 @@ P0()'s line 10 initializes "x" to the value 1 then line 11 links to "x" from "y", replacing "z". P1()'s line 20 loads a pointer from "y", and line 21 dereferences that -pointer. The RCU read-side critical section spanning lines 19-22 is -just for show in this example. +pointer. The RCU read-side critical section spanning lines 19-22 is just +for show in this example. Note that the address used for line 21's load +depends on (in this case, "is exactly the same as") the value loaded by +line 20. This is an example of what is called an "address dependency". +This particular address dependency extends from the load on line 20 to the +load on line 21. Address dependencies provide a weak form of ordering. Running this test results in the following: diff --git a/tools/memory-model/Documentation/simple.txt b/tools/memory-model/Documentation/simple.txt new file mode 100644 index 000000000000..81e1a0ec5342 --- /dev/null +++ b/tools/memory-model/Documentation/simple.txt @@ -0,0 +1,271 @@ +This document provides options for those wishing to keep their +memory-ordering lives simple, as is necessary for those whose domain +is complex. 
After all, there are bugs other than memory-ordering bugs, +and the time spent gaining memory-ordering knowledge is not available +for gaining domain knowledge. Furthermore Linux-kernel memory model +(LKMM) is quite complex, with subtle differences in code often having +dramatic effects on correctness. + +The options near the beginning of this list are quite simple. The idea +is not that kernel hackers don't already know about them, but rather +that they might need the occasional reminder. + +Please note that this is a generic guide, and that specific subsystems +will often have special requirements or idioms. For example, developers +of MMIO-based device drivers will often need to use mb(), rmb(), and +wmb(), and therefore might find smp_mb(), smp_rmb(), and smp_wmb() +to be more natural than smp_load_acquire() and smp_store_release(). +On the other hand, those coming in from other environments will likely +be more familiar with these last two. + + +Single-threaded code +==================== + +In single-threaded code, there is no reordering, at least assuming +that your toolchain and hardware are working correctly. In addition, +it is generally a mistake to assume your code will only run in a single +threaded context as the kernel can enter the same code path on multiple +CPUs at the same time. One important exception is a function that makes +no external data references. + +In the general case, you will need to take explicit steps to ensure that +your code really is executed within a single thread that does not access +shared variables. A simple way to achieve this is to define a global lock +that you acquire at the beginning of your code and release at the end, +taking care to ensure that all references to your code's shared data are +also carried out under that same lock. Because only one thread can hold +this lock at a given time, your code will be executed single-threaded. +This approach is called "code locking". + +Code locking can severely limit both performance and scalability, so it +should be used with caution, and only on code paths that execute rarely. +After all, a huge amount of effort was required to remove the Linux +kernel's old "Big Kernel Lock", so let's please be very careful about +adding new "little kernel locks". + +One of the advantages of locking is that, in happy contrast with the +year 1981, almost all kernel developers are very familiar with locking. +The Linux kernel's lockdep (CONFIG_PROVE_LOCKING=y) is very helpful with +the formerly feared deadlock scenarios. + +Please use the standard locking primitives provided by the kernel rather +than rolling your own. For one thing, the standard primitives interact +properly with lockdep. For another thing, these primitives have been +tuned to deal better with high contention. And for one final thing, it is +surprisingly hard to correctly code production-quality lock acquisition +and release functions. After all, even simple non-production-quality +locking functions must carefully prevent both the CPU and the compiler +from moving code in either direction across the locking function. + +Despite the scalability limitations of single-threaded code, RCU +takes this approach for much of its grace-period processing and also +for early-boot operation. The reason RCU is able to scale despite +single-threaded grace-period processing is use of batching, where all +updates that accumulated during one grace period are handled by the +next one. In other words, slowing down grace-period processing makes +it more efficient. 
Nor is RCU unique: Similar batching optimizations
+are used in many I/O operations.
+
+
+Packaged code
+=============
+
+Even if performance and scalability concerns prevent your code from
+being completely single-threaded, it is often possible to use library
+functions that handle the concurrency nearly or entirely on their own.
+This approach delegates any LKMM worries to the library maintainer.
+
+In the kernel, what is the "library"? Quite a bit. It includes the
+contents of the lib/ directory, much of the include/linux/ directory along
+with a lot of other heavily used APIs. But heavily used examples include
+the list macros (for example, include/linux/{,rcu}list.h), workqueues,
+smp_call_function(), and the various hash tables and search trees.
+
+
+Data locking
+============
+
+With code locking, we use single-threaded code execution to guarantee
+serialized access to the data that the code is accessing. However,
+we can also achieve this by instead associating the lock with specific
+instances of the data structures. This creates a "critical section"
+in the code execution that will execute as though it is single-threaded.
+By placing all the accesses and modifications to a shared data structure
+inside a critical section, we ensure that the execution context that
+holds the lock has exclusive access to the shared data.
+
+The poster boy for this approach is the hash table, where placing a lock
+in each hash bucket allows operations on different buckets to proceed
+concurrently. This works because the buckets do not overlap with each
+other, so that an operation on one bucket does not interfere with any
+other bucket.
+
+As the number of buckets increases, data locking scales naturally.
+In particular, if the amount of data increases with the number of CPUs,
+increasing the number of buckets as the number of CPUs increases results
+in a naturally scalable data structure.
+
+
+Per-CPU processing
+==================
+
+Partitioning processing and data over CPUs allows each CPU to take
+a single-threaded approach while providing excellent performance and
+scalability. Of course, there is no free lunch: The dark side of this
+excellence is substantially increased memory footprint.
+
+In addition, it is sometimes necessary to update some global
+view of this processing and data, in which case something like locking
+must be used to protect this global view. This is the approach taken
+by the percpu_counter infrastructure. In many cases, there are already
+generic/library variants of commonly used per-cpu constructs available.
+Please use them rather than rolling your own.
+
+RCU uses DEFINE_PER_CPU*() declarations to create a number of per-CPU
+data sets. For example, each CPU does private quiescent-state processing
+within its instance of the per-CPU rcu_data structure, and then uses data
+locking to report quiescent states up the grace-period combining tree.
+
+
+Packaged primitives: Sequence locking
+=====================================
+
+Lockless programming is considered by many to be more difficult than
+lock-based programming, but there are a few lockless design patterns that
+have been built out into an API. One of these APIs is sequence locking.
+Although this API can be used in extremely complex ways, there are simple
+and effective ways of using it that avoid the need to pay attention to
+memory ordering.
+
+The basic keep-things-simple rule for sequence locking is "do not write
+in read-side code".
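+
+For illustration, a minimal read-side critical section that follows this
+rule might look as follows. The "foo" structure, its fields, and the
+foo_sum() helper are hypothetical, invented for this sketch rather than
+taken from the kernel source:
+
+	struct foo {
+		seqlock_t lock;		/* initialized with seqlock_init() */
+		u64 a;
+		u64 b;
+	};
+
+	/* Read side: retry loop, reads only, no writes to shared data. */
+	static u64 foo_sum(struct foo *f)
+	{
+		unsigned int seq;
+		u64 sum;
+
+		do {
+			seq = read_seqbegin(&f->lock);
+			sum = f->a + f->b;
+		} while (read_seqretry(&f->lock, seq));
+
+		return sum;
+	}
+
+The corresponding writer would update f->a and f->b between
+write_seqlock(&f->lock) and write_sequnlock(&f->lock), which also
+provide the needed mutual exclusion between writers.
+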
Yes, you can do writes from within sequence-locking
+readers, but it won't be so simple. For example, such writes will be
+lockless and should be idempotent.
+
+For more sophisticated use cases, LKMM can guide you, including use
+cases involving combining sequence locking with other synchronization
+primitives. (LKMM does not yet know about sequence locking, so it is
+currently necessary to open-code it in your litmus tests.)
+
+Additional information may be found in include/linux/seqlock.h.
+
+Packaged primitives: RCU
+========================
+
+Another lockless design pattern that has been baked into an API
+is RCU. The Linux kernel makes sophisticated use of RCU, but the
+keep-things-simple rules for RCU are "do not write in read-side code",
+"do not update anything that is visible to and accessed by readers",
+and "protect updates with locking".
+
+These rules are illustrated by the functions foo_update_a() and
+foo_get_a() shown in Documentation/RCU/whatisRCU.rst. Additional
+RCU usage patterns may be found in Documentation/RCU and in the
+source code.
+
+
+Packaged primitives: Atomic operations
+======================================
+
+Back in the day, the Linux kernel had three types of atomic operations:
+
+1. Initialization and read-out, such as atomic_set() and atomic_read().
+
+2. Operations that did not return a value and provided no ordering,
+   such as atomic_inc() and atomic_dec().
+
+3. Operations that returned a value and provided full ordering, such as
+   atomic_add_return() and atomic_dec_and_test(). Note that some
+   value-returning operations provide full ordering only conditionally.
+   For example, cmpxchg() provides ordering only upon success.
+
+More recent kernels have operations that return a value but do not
+provide full ordering. These are flagged with either a _relaxed()
+suffix (providing no ordering), or an _acquire() or _release() suffix
+(providing limited ordering).
+
+Additional information may be found in these files:
+
+Documentation/atomic_t.txt
+Documentation/atomic_bitops.txt
+Documentation/core-api/atomic_ops.rst
+Documentation/core-api/refcount-vs-atomic.rst
+
+Reading code using these primitives is often also quite helpful.
+
+
+Lockless, fully ordered
+=======================
+
+When using locking, there often comes a time when it is necessary
+to access some variable or another without holding the data lock
+that serializes access to that variable.
+
+If you want to keep things simple, use the initialization and read-out
+operations from the previous section only when there are no racing
+accesses. Otherwise, use only fully ordered operations when accessing
+or modifying the variable. This approach guarantees that code prior
+to a given access to that variable will be seen by all CPUs as having
+happened before any code following any later access to that same variable.
+
+Please note that per-CPU functions are not atomic operations and
+hence they do not provide any ordering guarantees at all.
+
+If the lockless accesses are frequently executed reads that are used
+only for heuristics, or if they are frequently executed writes that
+are used only for statistics, please see the next section.
+
+
+Lockless statistics and heuristics
+==================================
+
+Unordered primitives such as atomic_read(), atomic_set(), READ_ONCE(), and
+WRITE_ONCE() can safely be used in some cases.
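+
+One minimal sketch, assuming a statistics counter that is only updated
+from a single context at a time but is read locklessly from anywhere;
+the "foo_nr_events" name and its helpers are hypothetical, invented for
+this illustration:
+
+	static unsigned long foo_nr_events;
+
+	/* Sole updater: no lock, no ordering, but no store tearing either. */
+	static void foo_count_event(void)
+	{
+		WRITE_ONCE(foo_nr_events, foo_nr_events + 1);
+	}
+
+	/* Lockless reader: an approximate snapshot suffices for statistics. */
+	static unsigned long foo_read_events(void)
+	{
+		return READ_ONCE(foo_nr_events);
+	}
+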
These primitives provide +no ordering, but they do prevent the compiler from carrying out a number +of destructive optimizations (for which please see the next section). +One example use for these primitives is statistics, such as per-CPU +counters exemplified by the rt_cache_stat structure's routing-cache +statistics counters. Another example use case is heuristics, such as +the jiffies_till_first_fqs and jiffies_till_next_fqs kernel parameters +controlling how often RCU scans for idle CPUs. + +But be careful. "Unordered" really does mean "unordered". It is all +too easy to assume ordering, and this assumption must be avoided when +using these primitives. + + +Don't let the compiler trip you up +================================== + +It can be quite tempting to use plain C-language accesses for lockless +loads from and stores to shared variables. Although this is both +possible and quite common in the Linux kernel, it does require a +surprising amount of analysis, care, and knowledge about the compiler. +Yes, some decades ago it was not unfair to consider a C compiler to be +an assembler with added syntax and better portability, but the advent of +sophisticated optimizing compilers mean that those days are long gone. +Today's optimizing compilers can profoundly rewrite your code during the +translation process, and have long been ready, willing, and able to do so. + +Therefore, if you really need to use C-language assignments instead of +READ_ONCE(), WRITE_ONCE(), and so on, you will need to have a very good +understanding of both the C standard and your compiler. Here are some +introductory references and some tooling to start you on this noble quest: + +Who's afraid of a big bad optimizing compiler? + https://lwn.net/Articles/793253/ +Calibrating your fear of big bad optimizing compilers + https://lwn.net/Articles/799218/ +Concurrency bugs should fear the big bad data-race detector (part 1) + https://lwn.net/Articles/816850/ +Concurrency bugs should fear the big bad data-race detector (part 2) + https://lwn.net/Articles/816854/ + + +More complex use cases +====================== + +If the alternatives above do not do what you need, please look at the +recipes-pairs.txt file to peel off the next layer of the memory-ordering +onion. diff --git a/tools/memory-model/README b/tools/memory-model/README index d2e03c4f52a0..c8144d4aafa0 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -177,6 +177,11 @@ Documentation/recipes.txt Documentation/references.txt Provides background reading. +Documentation/simple.txt + Starting point for someone new to Linux-kernel concurrency. + And also for those needing a reminder of the simpler approaches + to concurrency! + linux-kernel.bell Categorizes the relevant instructions, including memory references, memory barriers, atomic read-modify-write operations, From 0ce0c78eff7d22c8a261de6c4305a5abb638c200 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Aug 2020 11:35:56 -0700 Subject: [PATCH 47/67] tools/memory-model: Expand the cheatsheet.txt notion of relaxed This commit adds a key entry enumerating the various types of relaxed operations. While in the area, it also renames the relaxed rows. [ paulmck: Apply Boqun Feng feedback. ] Acked-by: Boqun Feng Signed-off-by: Paul E. 
McKenney --- .../memory-model/Documentation/cheatsheet.txt | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tools/memory-model/Documentation/cheatsheet.txt b/tools/memory-model/Documentation/cheatsheet.txt index 33ba98d72b16..99d00870b160 100644 --- a/tools/memory-model/Documentation/cheatsheet.txt +++ b/tools/memory-model/Documentation/cheatsheet.txt @@ -3,9 +3,9 @@ C Self R W RMW Self R W DR DW RMW SV -- ---- - - --- ---- - - -- -- --- -- -Store, e.g., WRITE_ONCE() Y Y -Load, e.g., READ_ONCE() Y Y Y Y -Unsuccessful RMW operation Y Y Y Y +Relaxed store Y Y +Relaxed load Y Y Y Y +Relaxed RMW operation Y Y Y Y rcu_dereference() Y Y Y Y Successful *_acquire() R Y Y Y Y Y Y Successful *_release() C Y Y Y W Y @@ -17,14 +17,19 @@ smp_mb__before_atomic() CP Y Y Y a a a a Y smp_mb__after_atomic() CP a a Y Y Y Y Y Y -Key: C: Ordering is cumulative - P: Ordering propagates - R: Read, for example, READ_ONCE(), or read portion of RMW - W: Write, for example, WRITE_ONCE(), or write portion of RMW - Y: Provides ordering - a: Provides ordering given intervening RMW atomic operation - DR: Dependent read (address dependency) - DW: Dependent write (address, data, or control dependency) - RMW: Atomic read-modify-write operation - SELF: Orders self, as opposed to accesses before and/or after - SV: Orders later accesses to the same variable +Key: Relaxed: A relaxed operation is either READ_ONCE(), WRITE_ONCE(), + a *_relaxed() RMW operation, an unsuccessful RMW + operation, a non-value-returning RMW operation such + as atomic_inc(), or one of the atomic*_read() and + atomic*_set() family of operations. + C: Ordering is cumulative + P: Ordering propagates + R: Read, for example, READ_ONCE(), or read portion of RMW + W: Write, for example, WRITE_ONCE(), or write portion of RMW + Y: Provides ordering + a: Provides ordering given intervening RMW atomic operation + DR: Dependent read (address dependency) + DW: Dependent write (address, data, or control dependency) + RMW: Atomic read-modify-write operation + SELF: Orders self, as opposed to accesses before and/or after + SV: Orders later accesses to the same variable From 58faf20a086bd34f91983609e26eac3d5fe76be3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:37 +0200 Subject: [PATCH 48/67] time/sched_clock: Use raw_read_seqcount_latch() during suspend sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the suspend path. Commit aadd6e5caaac ("time/sched_clock: Use raw_read_seqcount_latch()") missed changing that instance of raw_read_seqcount(). References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1c03eec6ca9b..8c6b5febd7a0 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } From 6446a5131e24a834606c15a965fa920041581c2c Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:38 +0200 Subject: [PATCH 49/67] mm/swap: Do not abuse the seqcount_t latching API Commit eef1a429f234 ("mm/swap.c: piggyback lru_add_drain_all() calls") implemented an optimization mechanism to exit the to-be-started LRU drain operation (name it A) if another drain operation *started and finished* while (A) was blocked on the LRU draining mutex. This was done through a seqcount_t latch, which is an abuse of its semantics: 1. seqcount_t latching should be used for the purpose of switching between two storage places with sequence protection to allow interruptible, preemptible, writer sections. The referenced optimization mechanism has absolutely nothing to do with that. 2. The used raw_write_seqcount_latch() has two SMP write memory barriers to insure one consistent storage place out of the two storage places available. A full memory barrier is required instead: to guarantee that the pagevec counter stores visible by local CPU are visible to other CPUs -- before loading the current drain generation. Beside the seqcount_t API abuse, the semantics of a latch sequence counter was force-fitted into the referenced optimization. What was meant is to track "generations" of LRU draining operations, where "global lru draining generation = x" implies that all generations 0 < n <= x are already *scheduled* for draining -- thus nothing needs to be done if the current generation number n <= x. Remove the conceptually-inappropriate seqcount_t latch usage. Manually implement the referenced optimization using a counter and SMP memory barriers. Note, while at it, use the non-atomic variant of cpumask_set_cpu(), __cpumask_set_cpu(), due to the already existing mutex protection. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87y2pg9erj.fsf@vostro.fn.ogness.net --- mm/swap.c | 65 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index d16d65d9b4e0..a1ec807e325d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -763,10 +763,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ void lru_add_drain_all(void) { - static seqcount_t seqcount = SEQCNT_ZERO(seqcount); - static DEFINE_MUTEX(lock); + /* + * lru_drain_gen - Global pages generation number + * + * (A) Definition: global lru_drain_gen = x implies that all generations + * 0 < n <= x are already *scheduled* for draining. + * + * This is an optimization for the highly-contended use case where a + * user space workload keeps constantly generating a flow of pages for + * each CPU. 
+ */ + static unsigned int lru_drain_gen; static struct cpumask has_work; - int cpu, seq; + static DEFINE_MUTEX(lock); + unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully @@ -775,21 +785,54 @@ void lru_add_drain_all(void) if (WARN_ON(!mm_percpu_wq)) return; - seq = raw_read_seqcount_latch(&seqcount); + /* + * Guarantee pagevec counter stores visible by this CPU are visible to + * other CPUs before loading the current drain generation. + */ + smp_mb(); + + /* + * (B) Locally cache global LRU draining generation number + * + * The read barrier ensures that the counter is loaded before the mutex + * is taken. It pairs with smp_mb() inside the mutex critical section + * at (D). + */ + this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* - * Piggyback on drain started and finished while we waited for lock: - * all pages pended at the time of our enter were drained from vectors. + * (C) Exit the draining operation if a newer generation, from another + * lru_add_drain_all(), was already scheduled for draining. Check (A). */ - if (__read_seqcount_retry(&seqcount, seq)) + if (unlikely(this_gen != lru_drain_gen)) goto done; - raw_write_seqcount_latch(&seqcount); + /* + * (D) Increment global generation number + * + * Pairs with smp_load_acquire() at (B), outside of the critical + * section. Use a full memory barrier to guarantee that the new global + * drain generation number is stored before loading pagevec counters. + * + * This pairing must be done here, before the for_each_online_cpu loop + * below which drains the page vectors. + * + * Let x, y, and z represent some system CPU numbers, where x < y < z. + * Assume CPU #z is is in the middle of the for_each_online_cpu loop + * below and has already reached CPU #y's per-cpu data. CPU #x comes + * along, adds some pages to its per-cpu vectors, then calls + * lru_add_drain_all(). + * + * If the paired barrier is done at any later step, e.g. after the + * loop, CPU #x will just exit at (C) and miss flushing out all of its + * added pages. + */ + WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); + smp_mb(); cpumask_clear(&has_work); - for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); @@ -801,7 +844,7 @@ void lru_add_drain_all(void) need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); - cpumask_set_cpu(cpu, &has_work); + __cpumask_set_cpu(cpu, &has_work); } } @@ -816,7 +859,7 @@ void lru_add_drain_all(void) { lru_add_drain(); } -#endif +#endif /* CONFIG_SMP */ /** * release_pages - batched put_page() From 80793c3471d90d4dc2b48deadb6413bdfe39500f Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:39 +0200 Subject: [PATCH 50/67] seqlock: Introduce seqcount_latch_t Latch sequence counters are a multiversion concurrency control mechanism where the seqcount_t counter even/odd value is used to switch between two copies of protected data. This allows the seqcount_t read path to safely interrupt its write side critical section (e.g. from NMIs). Initially, latch sequence counters were implemented as a single write function above plain seqcount_t: raw_write_seqcount_latch(). The read side was expected to use plain seqcount_t raw_read_seqcount(). A specialized latch read function, raw_read_seqcount_latch(), was later added. It became the standardized way for latch read paths. 
Due to the dependent load, it has one read memory barrier less than the plain seqcount_t raw_read_seqcount() API. Only raw_write_seqcount_latch() and raw_read_seqcount_latch() should be used with latch sequence counters. Having *unique* read and write path APIs means that latch sequence counters are actually a data type of their own -- just inappropriately overloading plain seqcount_t. Introduce seqcount_latch_t. This adds type-safety and ensures that only the correct latch-safe APIs are to be used. Not to break bisection, let the latch APIs also accept plain seqcount_t or seqcount_raw_spinlock_t. After converting all call sites to seqcount_latch_t, only that new data type will be allowed. References: 9b0fd802e8c0 ("seqcount: Add raw_write_seqcount_latch()") References: 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()") References: aadd6e5caaac ("time/sched_clock: Use raw_read_seqcount_latch()") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-4-a.darwish@linutronix.de --- Documentation/locking/seqlock.rst | 18 +++++ include/linux/seqlock.h | 110 +++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 34 deletions(-) diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst index 62c5ad98c11c..a334b584f2b3 100644 --- a/Documentation/locking/seqlock.rst +++ b/Documentation/locking/seqlock.rst @@ -139,6 +139,24 @@ with the associated LOCKTYPE lock acquired. Read path: same as in :ref:`seqcount_t`. + +.. _seqcount_latch_t: + +Latch sequence counters (``seqcount_latch_t``) +---------------------------------------------- + +Latch sequence counters are a multiversion concurrency control mechanism +where the embedded seqcount_t counter even/odd value is used to switch +between two copies of protected data. This allows the sequence counter +read path to safely interrupt its own write side critical section. + +Use seqcount_latch_t when the write side sections cannot be protected +from interruption by readers. This is typically the case when the read +side can be invoked from NMI handlers. + +Check `raw_write_seqcount_latch()` for more information. + + .. _seqlock_t: Sequential locks (``seqlock_t``) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 300cbf312546..88b917d4ebde 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -587,34 +587,76 @@ static inline void write_seqcount_t_invalidate(seqcount_t *s) kcsan_nestable_atomic_end(); } -/** - * raw_read_seqcount_latch() - pick even/odd seqcount_t latch data copy - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants +/* + * Latch sequence counters (seqcount_latch_t) * - * Use seqcount_t latching to switch between two storage places protected - * by a sequence counter. Doing so allows having interruptible, preemptible, - * seqcount_t write side critical sections. + * A sequence counter variant where the counter even/odd value is used to + * switch between two copies of protected data. This allows the read path, + * typically NMIs, to safely interrupt the write side critical section. * - * Check raw_write_seqcount_latch() for more details and a full reader and - * writer usage example. - * - * Return: sequence counter raw value. Use the lowest bit as an index for - * picking which data copy to read. The full counter value must then be - * checked with read_seqcount_retry(). + * As the write sections are fully preemptible, no special handling for + * PREEMPT_RT is needed. 
*/ -#define raw_read_seqcount_latch(s) \ - raw_read_seqcount_t_latch(__seqcount_ptr(s)) +typedef struct { + seqcount_t seqcount; +} seqcount_latch_t; -static inline int raw_read_seqcount_t_latch(seqcount_t *s) -{ - /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ - int seq = READ_ONCE(s->sequence); /* ^^^ */ - return seq; +/** + * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t + * @seq_name: Name of the seqcount_latch_t instance + */ +#define SEQCNT_LATCH_ZERO(seq_name) { \ + .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** - * raw_write_seqcount_latch() - redirect readers to even/odd copy - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * seqcount_latch_init() - runtime initializer for seqcount_latch_t + * @s: Pointer to the seqcount_latch_t instance + */ +static inline void seqcount_latch_init(seqcount_latch_t *s) +{ + seqcount_init(&s->seqcount); +} + +/** + * raw_read_seqcount_latch() - pick even/odd latch data copy + * @s: Pointer to seqcount_t, seqcount_raw_spinlock_t, or seqcount_latch_t + * + * See raw_write_seqcount_latch() for details and a full reader/writer + * usage example. + * + * Return: sequence counter raw value. Use the lowest bit as an index for + * picking which data copy to read. The full counter must then be checked + * with read_seqcount_latch_retry(). + */ +#define raw_read_seqcount_latch(s) \ +({ \ + /* \ + * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). \ + * Due to the dependent load, a full smp_rmb() is not needed. \ + */ \ + _Generic(*(s), \ + seqcount_t: READ_ONCE(((seqcount_t *)s)->sequence), \ + seqcount_raw_spinlock_t: READ_ONCE(((seqcount_raw_spinlock_t *)s)->seqcount.sequence), \ + seqcount_latch_t: READ_ONCE(((seqcount_latch_t *)s)->seqcount.sequence)); \ +}) + +/** + * read_seqcount_latch_retry() - end a seqcount_latch_t read section + * @s: Pointer to seqcount_latch_t + * @start: count, from raw_read_seqcount_latch() + * + * Return: true if a read section retry is required, else false + */ +static inline int +read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) +{ + return read_seqcount_retry(&s->seqcount, start); +} + +/** + * raw_write_seqcount_latch() - redirect latch readers to even/odd copy + * @s: Pointer to seqcount_t, seqcount_raw_spinlock_t, or seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never @@ -633,7 +675,7 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) * The basic form is a data structure like:: * * struct latch_struct { - * seqcount_t seq; + * seqcount_latch_t seq; * struct data_struct data[2]; * }; * @@ -643,13 +685,13 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) * void latch_modify(struct latch_struct *latch, ...) 
* { * smp_wmb(); // Ensure that the last data[1] update is visible - * latch->seq++; + * latch->seq.sequence++; * smp_wmb(); // Ensure that the seqcount update is visible * * modify(latch->data[0], ...); * * smp_wmb(); // Ensure that the data[0] update is visible - * latch->seq++; + * latch->seq.sequence++; * smp_wmb(); // Ensure that the seqcount update is visible * * modify(latch->data[1], ...); @@ -668,8 +710,8 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * - * // read_seqcount_retry() includes needed smp_rmb() - * } while (read_seqcount_retry(&latch->seq, seq)); + * // This includes needed smp_rmb() + * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } @@ -693,14 +735,14 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ -#define raw_write_seqcount_latch(s) \ - raw_write_seqcount_t_latch(__seqcount_ptr(s)) - -static inline void raw_write_seqcount_t_latch(seqcount_t *s) -{ - smp_wmb(); /* prior stores before incrementing "sequence" */ - s->sequence++; - smp_wmb(); /* increment "sequence" before following stores */ +#define raw_write_seqcount_latch(s) \ +{ \ + smp_wmb(); /* prior stores before incrementing "sequence" */ \ + _Generic(*(s), \ + seqcount_t: ((seqcount_t *)s)->sequence++, \ + seqcount_raw_spinlock_t:((seqcount_raw_spinlock_t *)s)->seqcount.sequence++, \ + seqcount_latch_t: ((seqcount_latch_t *)s)->seqcount.sequence++); \ + smp_wmb(); /* increment "sequence" before following stores */ \ } /* From a690ed07353ec45f056b0a6f87c23a12a59c030d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:40 +0200 Subject: [PATCH 51/67] time/sched_clock: Use seqcount_latch_t Latch sequence counters have unique read and write APIs, and thus seqcount_latch_t was recently introduced at seqlock.h. Use that new data type instead of plain seqcount_t. This adds the necessary type-safety and ensures only latching-safe seqcount APIs are to be used. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-5-a.darwish@linutronix.de --- kernel/time/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8c6b5febd7a0..0642013dace4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -35,7 +35,7 @@ * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; + seqcount_latch_t seq; struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; @@ -76,7 +76,7 @@ struct clock_read_data *sched_clock_read_begin(unsigned int *seq) int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } unsigned long long notrace sched_clock(void) From 249d053835320cb3e7c00066cf085a6ba9b1f126 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:41 +0200 Subject: [PATCH 52/67] timekeeping: Use seqcount_latch_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latch sequence counters are a multiversion concurrency control mechanism where the seqcount_t counter even/odd value is used to switch between two data storage copies. 
This allows the seqcount_t read path to safely interrupt its write side critical section (e.g. from NMIs). Initially, latch sequence counters were implemented as a single write function, raw_write_seqcount_latch(), above plain seqcount_t. The read path was expected to use plain seqcount_t raw_read_seqcount(). A specialized read function was later added, raw_read_seqcount_latch(), and became the standardized way for latch read paths. Having unique read and write APIs meant that latch sequence counters are basically a data type of their own -- just inappropriately overloading plain seqcount_t. The seqcount_latch_t data type was thus introduced at seqlock.h. Use that new data type instead of seqcount_raw_spinlock_t. This ensures that only latch-safe APIs are to be used with the sequence counter. Note that the use of seqcount_raw_spinlock_t was not very useful in the first place. Only the "raw_" subset of seqcount_t APIs were used at timekeeping.c. This subset was created for contexts where lockdep cannot be used. seqcount_LOCKTYPE_t's raison d'être -- verifying that the seqcount_t writer serialization lock is held -- cannot thus be done. References: 0c3351d451ae ("seqlock: Use raw_ prefix instead of _no_lockdep") References: 55f3560df975 ("seqlock: Extend seqcount API with associated locks") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-6-a.darwish@linutronix.de --- kernel/time/timekeeping.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c47f388a83f..999c981ae766 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -64,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_raw_spinlock_t seq; + seqcount_latch_t seq; struct tk_read_base base[2]; }; @@ -81,13 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), + .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), + .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -467,7 +467,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -533,7 +533,7 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } From a1f1066133d85d5f42217cc72a2490bb7aa889c5 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:42 +0200 Subject: [PATCH 53/67] x86/tsc: Use seqcount_latch_t Latch sequence counters have unique read and write APIs, and thus seqcount_latch_t was recently introduced at seqlock.h. Use that new data type instead of plain seqcount_t. This adds the necessary type-safety and ensures that only latching-safe seqcount APIs are to be used. Signed-off-by: Ahmed S. 
Darwish [peterz: unwreck cyc2ns_read_begin()] Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-7-a.darwish@linutronix.de --- arch/x86/kernel/tsc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 49d925043171..f70dffc2771f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -54,7 +54,7 @@ struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ - seqcount_t seq; /* 32 + 4 = 36 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ @@ -73,14 +73,14 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) preempt_disable_notrace(); do { - seq = this_cpu_read(cyc2ns.seq.sequence); + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_end(void) @@ -186,7 +186,7 @@ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } @@ -203,7 +203,7 @@ static void __init cyc2ns_init_secondary_cpus(void) for_each_possible_cpu(cpu) { if (cpu != this_cpu) { - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; From 24bf401cebfd630cc9e2c3746e43945e836626f9 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:43 +0200 Subject: [PATCH 54/67] rbtree_latch: Use seqcount_latch_t Latch sequence counters have unique read and write APIs, and thus seqcount_latch_t was recently introduced at seqlock.h. Use that new data type instead of plain seqcount_t. This adds the necessary type-safety and ensures that only latching-safe seqcount APIs are to be used. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-8-a.darwish@linutronix.de --- include/linux/rbtree_latch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h index 7d012faa509a..3d1a9e716b80 100644 --- a/include/linux/rbtree_latch.h +++ b/include/linux/rbtree_latch.h @@ -42,8 +42,8 @@ struct latch_tree_node { }; struct latch_tree_root { - seqcount_t seq; - struct rb_root tree[2]; + seqcount_latch_t seq; + struct rb_root tree[2]; }; /** @@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root, do { seq = raw_read_seqcount_latch(&root->seq); node = __lt_find(key, root, seq & 1, ops->comp); - } while (read_seqcount_retry(&root->seq, seq)); + } while (read_seqcount_latch_retry(&root->seq, seq)); return node; } From 0c9794c8b6781eb7dad8e19b78c5d4557790597a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:44 +0200 Subject: [PATCH 55/67] seqlock: seqcount latch APIs: Only allow seqcount_latch_t All latch sequence counter call-sites have now been converted from plain seqcount_t to the new seqcount_latch_t data type. Enforce type-safety by modifying seqlock.h latch APIs to only accept seqcount_latch_t. 
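
For reference, a minimal post-conversion call site might look as follows.
This is only an illustrative sketch, not code from this series: the
"my_data" structure, the "latch" variable, and the my_*() functions are
hypothetical, and writers are assumed to be serialized externally (for
example, by a lock):

	struct my_data {
		u64 a;
		u64 b;
	};

	static struct {
		seqcount_latch_t	seq;
		struct my_data		data[2];
	} latch = { .seq = SEQCNT_LATCH_ZERO(latch.seq) };

	static void my_update(const struct my_data *new)
	{
		/* Count becomes odd: readers are redirected to data[1]. */
		raw_write_seqcount_latch(&latch.seq);
		latch.data[0] = *new;

		/* Count becomes even: readers are redirected to data[0]. */
		raw_write_seqcount_latch(&latch.seq);
		latch.data[1] = *new;
	}

	static struct my_data my_read(void)
	{
		unsigned int seq;
		struct my_data ret;

		do {
			seq = raw_read_seqcount_latch(&latch.seq);
			ret = latch.data[seq & 1];
		} while (read_seqcount_latch_retry(&latch.seq, seq));

		return ret;
	}

Passing a plain seqcount_t to any of these latch APIs now fails to
compile, which is exactly the type-safety this change enforces.
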
Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-9-a.darwish@linutronix.de --- include/linux/seqlock.h | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 88b917d4ebde..f2a7a467e998 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -620,7 +620,7 @@ static inline void seqcount_latch_init(seqcount_latch_t *s) /** * raw_read_seqcount_latch() - pick even/odd latch data copy - * @s: Pointer to seqcount_t, seqcount_raw_spinlock_t, or seqcount_latch_t + * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. @@ -629,17 +629,14 @@ static inline void seqcount_latch_init(seqcount_latch_t *s) * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). */ -#define raw_read_seqcount_latch(s) \ -({ \ - /* \ - * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). \ - * Due to the dependent load, a full smp_rmb() is not needed. \ - */ \ - _Generic(*(s), \ - seqcount_t: READ_ONCE(((seqcount_t *)s)->sequence), \ - seqcount_raw_spinlock_t: READ_ONCE(((seqcount_raw_spinlock_t *)s)->seqcount.sequence), \ - seqcount_latch_t: READ_ONCE(((seqcount_latch_t *)s)->seqcount.sequence)); \ -}) +static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) +{ + /* + * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). + * Due to the dependent load, a full smp_rmb() is not needed. + */ + return READ_ONCE(s->seqcount.sequence); +} /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section @@ -656,7 +653,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy - * @s: Pointer to seqcount_t, seqcount_raw_spinlock_t, or seqcount_latch_t + * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never @@ -735,14 +732,11 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ -#define raw_write_seqcount_latch(s) \ -{ \ - smp_wmb(); /* prior stores before incrementing "sequence" */ \ - _Generic(*(s), \ - seqcount_t: ((seqcount_t *)s)->sequence++, \ - seqcount_raw_spinlock_t:((seqcount_raw_spinlock_t *)s)->seqcount.sequence++, \ - seqcount_latch_t: ((seqcount_latch_t *)s)->seqcount.sequence++); \ - smp_wmb(); /* increment "sequence" before following stores */ \ +static inline void raw_write_seqcount_latch(seqcount_latch_t *s) +{ + smp_wmb(); /* prior stores before incrementing "sequence" */ + s->seqcount.sequence++; + smp_wmb(); /* increment "sequence" before following stores */ } /* From 6dd699b13d53f26a7603702d8bada3482312df74 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 4 Sep 2020 17:32:27 +0200 Subject: [PATCH 56/67] seqlock: seqcount_LOCKNAME_t: Standardize naming convention At seqlock.h, sequence counters with associated locks are either called seqcount_LOCKNAME_t, seqcount_LOCKTYPE_t, or seqcount_locktype_t. Standardize on seqcount_LOCKNAME_t for all instances in comments, kernel-doc, and SEQCOUNT_LOCKNAME() generative macro paramters. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200904153231.11994-2-a.darwish@linutronix.de --- include/linux/seqlock.h | 79 +++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f2a7a467e998..820ace2f5911 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -53,7 +53,7 @@ * * If the write serialization mechanism is one of the common kernel * locking primitives, use a sequence counter with associated lock - * (seqcount_LOCKTYPE_t) instead. + * (seqcount_LOCKNAME_t) instead. * * If it's desired to automatically handle the sequence counter writer * serialization and non-preemptibility requirements, use a sequential @@ -117,7 +117,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* - * Sequence counters with associated locks (seqcount_LOCKTYPE_t) + * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate @@ -138,30 +138,32 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) #endif /** - * typedef seqcount_LOCKNAME_t - sequence counter with LOCKTYPE associated + * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter - * @lock: Pointer to the associated spinlock + * @lock: Pointer to the associated lock * - * A plain sequence counter with external writer synchronization by a - * spinlock. The spinlock is associated to the sequence count in the + * A plain sequence counter with external writer synchronization by + * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. + * + * LOCKNAME: raw_spinlock, spinlock, rwlock, mutex, or ww_mutex. 
*/ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance - * @lock: Pointer to the associated LOCKTYPE + * @lock: Pointer to the associated lock */ /* - * SEQCOUNT_LOCKTYPE() - Instantiate seqcount_LOCKNAME_t and helpers - * @locktype: actual typename - * @lockname: name - * @preemptible: preemptibility of above locktype + * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers + * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t + * @locktype: LOCKNAME canonical C data type + * @preemptible: preemptibility of above lockname * @lockmember: argument for lockdep_assert_held() */ -#define SEQCOUNT_LOCKTYPE(locktype, lockname, preemptible, lockmember) \ +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember) \ typedef struct seqcount_##lockname { \ seqcount_t seqcount; \ __SEQ_LOCK(locktype *lock); \ @@ -211,29 +213,28 @@ static inline void __seqcount_assert(seqcount_t *s) lockdep_assert_preemption_disabled(); } -SEQCOUNT_LOCKTYPE(raw_spinlock_t, raw_spinlock, false, s->lock) -SEQCOUNT_LOCKTYPE(spinlock_t, spinlock, false, s->lock) -SEQCOUNT_LOCKTYPE(rwlock_t, rwlock, false, s->lock) -SEQCOUNT_LOCKTYPE(struct mutex, mutex, true, s->lock) -SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, false, s->lock) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, false, s->lock) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock) +SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base) /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance - * @lock: Pointer to the associated LOCKTYPE + * @lock: Pointer to the associated LOCKNAME */ -#define SEQCOUNT_LOCKTYPE_ZERO(seq_name, assoc_lock) { \ +#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } -#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) -#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) -#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) -#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) -#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) - +#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) +#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) +#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) +#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) +#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqcount_##lockname##_##prop((void *)(s)) @@ -252,7 +253,7 @@ SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() * barrier. 
Callers should ensure that smp_rmb() or equivalent ordering is @@ -283,7 +284,7 @@ repeat: /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ @@ -299,7 +300,7 @@ static inline unsigned raw_read_seqcount_t_begin(const seqcount_t *s) /** * read_seqcount_begin() - begin a seqcount_t read critical section - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ @@ -314,7 +315,7 @@ static inline unsigned read_seqcount_t_begin(const seqcount_t *s) /** * raw_read_seqcount() - read the raw seqcount_t counter value - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_read_seqcount opens a read critical section of the given * seqcount_t, without any lockdep checking, and without checking or @@ -337,7 +338,7 @@ static inline unsigned raw_read_seqcount_t(const seqcount_t *s) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_seqcount_begin opens a read critical section of the given * seqcount_t. Unlike read_seqcount_begin(), this function will not wait @@ -365,7 +366,7 @@ static inline unsigned raw_seqcount_t_begin(const seqcount_t *s) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() @@ -389,7 +390,7 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) /** * read_seqcount_retry() - end a seqcount_t read critical section - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given @@ -409,7 +410,7 @@ static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants */ #define raw_write_seqcount_begin(s) \ do { \ @@ -428,7 +429,7 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants */ #define raw_write_seqcount_end(s) \ do { \ @@ -448,7 +449,7 @@ static inline void raw_write_seqcount_t_end(seqcount_t *s) /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants 
* @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst @@ -471,7 +472,7 @@ static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) /** * write_seqcount_begin() - start a seqcount_t write side critical section - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * write_seqcount_begin opens a write side critical section of the given * seqcount_t. @@ -497,7 +498,7 @@ static inline void write_seqcount_t_begin(seqcount_t *s) /** * write_seqcount_end() - end a seqcount_t write side critical section - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * The write section must've been opened with write_seqcount_begin(). */ @@ -517,7 +518,7 @@ static inline void write_seqcount_t_end(seqcount_t *s) /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. It is one wmb cheaper, because it can collapse @@ -571,7 +572,7 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s) /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations - * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. From 5cdd25572a29e46f932d3e6eedbd07429de66431 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 4 Sep 2020 17:32:28 +0200 Subject: [PATCH 57/67] seqlock: Use unique prefix for seqcount_t property accessors At seqlock.h, the following set of functions: - __seqcount_ptr() - __seqcount_preemptible() - __seqcount_assert() act as plain seqcount_t "property" accessors. Meanwhile, the following group: - __seqcount_ptr() - __seqcount_lock_preemptible() - __seqcount_assert_lock_held() act as the equivalent set, but in the generic form, taking either seqcount_t or any of the seqcount_LOCKNAME_t variants. This is quite confusing, especially the first member where it is called exactly the same in both groups. Differentiate the first group by using "__seqprop" as prefix, and also use that same prefix for all of seqcount_LOCKNAME_t property accessors. While at it, constify the property accessors first parameter when appropriate. References: 55f3560df975 ("seqlock: Extend seqcount API with associated locks") Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200904153231.11994-3-a.darwish@linutronix.de --- include/linux/seqlock.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 820ace2f5911..0b4a22f33ac3 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -157,7 +157,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) */ /* - * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers + * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers + * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t + * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above lockname @@ -177,19 +179,19 @@ seqcount_##lockname##_init(seqcount_##lockname##_t *s, locktype *lock) \ } \ \ static __always_inline seqcount_t * \ -__seqcount_##lockname##_ptr(seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline bool \ -__seqcount_##lockname##_preemptible(seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ return preemptible; \ } \ \ static __always_inline void \ -__seqcount_##lockname##_assert(seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(lockmember)); \ } @@ -198,17 +200,17 @@ __seqcount_##lockname##_assert(seqcount_##lockname##_t *s) \ * __seqprop() for seqcount_t */ -static inline seqcount_t *__seqcount_ptr(seqcount_t *s) +static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } -static inline bool __seqcount_preemptible(seqcount_t *s) +static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } -static inline void __seqcount_assert(seqcount_t *s) +static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } @@ -237,10 +239,10 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ - seqcount_##lockname##_t: __seqcount_##lockname##_##prop((void *)(s)) + seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) #define __seqprop(s, prop) _Generic(*(s), \ - seqcount_t: __seqcount_##prop((void *)(s)), \ + seqcount_t: __seqprop_##prop((void *)(s)), \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ From 52ac39e5db5148f70392edb654ad882ac8da88a8 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 4 Sep 2020 17:32:29 +0200 Subject: [PATCH 58/67] seqlock: seqcount_t: Implement all read APIs as statement expressions The sequence counters read APIs are implemented as CPP macros, so they can take either seqcount_t or any of the seqcount_LOCKNAME_t variants. Such macros then get *directly* transformed to internal C functions that only take plain seqcount_t. Further commits need access to seqcount_LOCKNAME_t inside of the actual read APIs code. Thus transform all of the seqcount read APIs to pure GCC statement expressions instead. This will not break type-safety: all of the transformed APIs resolve to a _Generic() selection that does not have a "default" case. 
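
For readers unfamiliar with _Generic(), a reduced, stand-alone
illustration (not the actual seqlock.h code) of why the missing
"default" arm preserves type-safety:

	#define type_name(x)				\
		_Generic((x),				\
			 int:           "int",		\
			 unsigned long: "unsigned long")

	const char *a = type_name(1);	/* selects "int"		*/
	const char *b = type_name(1UL);	/* selects "unsigned long"	*/
	/* type_name(1.0) fails to build: no matching association and	*/
	/* no "default" arm to silently fall back on.			*/
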
This will also not affect the transformed APIs readability: previously added kernel-doc above all of seqlock.h functions makes the expectations quite clear for call-site developers. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200904153231.11994-4-a.darwish@linutronix.de --- include/linux/seqlock.h | 94 ++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 0b4a22f33ac3..f3b78277e26b 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -184,6 +184,12 @@ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ return &s->seqcount; \ } \ \ +static __always_inline unsigned \ +__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ +{ \ + return READ_ONCE(s->seqcount.sequence); \ +} \ + \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ @@ -205,6 +211,11 @@ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) return s; } +static inline unsigned __seqprop_sequence(const seqcount_t *s) +{ + return READ_ONCE(s->sequence); +} + static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; @@ -250,6 +261,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base) __seqprop_case((s), ww_mutex, prop)) #define __seqcount_ptr(s) __seqprop(s, ptr) +#define __seqcount_sequence(s) __seqprop(s, sequence) #define __seqcount_lock_preemptible(s) __seqprop(s, preemptible) #define __seqcount_assert_lock_held(s) __seqprop(s, assert) @@ -268,21 +280,15 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base) * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ - __read_seqcount_t_begin(__seqcount_ptr(s)) - -static inline unsigned __read_seqcount_t_begin(const seqcount_t *s) -{ - unsigned ret; - -repeat: - ret = READ_ONCE(s->sequence); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); - return ret; -} +({ \ + unsigned seq; \ + \ + while ((seq = __seqcount_sequence(s)) & 1) \ + cpu_relax(); \ + \ + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ + seq; \ +}) /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep @@ -291,14 +297,12 @@ repeat: * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount_begin(s) \ - raw_read_seqcount_t_begin(__seqcount_ptr(s)) - -static inline unsigned raw_read_seqcount_t_begin(const seqcount_t *s) -{ - unsigned ret = __read_seqcount_t_begin(s); - smp_rmb(); - return ret; -} +({ \ + unsigned seq = __read_seqcount_begin(s); \ + \ + smp_rmb(); \ + seq; \ +}) /** * read_seqcount_begin() - begin a seqcount_t read critical section @@ -307,13 +311,10 @@ static inline unsigned raw_read_seqcount_t_begin(const seqcount_t *s) * Return: count to be passed to read_seqcount_retry() */ #define read_seqcount_begin(s) \ - read_seqcount_t_begin(__seqcount_ptr(s)) - -static inline unsigned read_seqcount_t_begin(const seqcount_t *s) -{ - seqcount_lockdep_reader_access(s); - return raw_read_seqcount_t_begin(s); -} +({ \ + seqcount_lockdep_reader_access(__seqcount_ptr(s)); \ + raw_read_seqcount_begin(s); \ +}) /** * raw_read_seqcount() - read the raw seqcount_t counter value @@ -327,15 +328,13 @@ static inline unsigned read_seqcount_t_begin(const seqcount_t *s) * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount(s) \ - 
raw_read_seqcount_t(__seqcount_ptr(s)) - -static inline unsigned raw_read_seqcount_t(const seqcount_t *s) -{ - unsigned ret = READ_ONCE(s->sequence); - smp_rmb(); - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); - return ret; -} +({ \ + unsigned seq = __seqcount_sequence(s); \ + \ + smp_rmb(); \ + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ + seq; \ +}) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o @@ -355,16 +354,13 @@ static inline unsigned raw_read_seqcount_t(const seqcount_t *s) * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ - raw_seqcount_t_begin(__seqcount_ptr(s)) - -static inline unsigned raw_seqcount_t_begin(const seqcount_t *s) -{ - /* - * If the counter is odd, let read_seqcount_retry() fail - * by decrementing the counter. - */ - return raw_read_seqcount_t(s) & ~1; -} +({ \ + /* \ + * If the counter is odd, let read_seqcount_retry() fail \ + * by decrementing the counter. \ + */ \ + raw_read_seqcount(s) & ~1; \ +}) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier From 8117ab508f9c476e0a10b9db7f4818f784cf3176 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 4 Sep 2020 17:32:30 +0200 Subject: [PATCH 59/67] seqlock: seqcount_LOCKNAME_t: Introduce PREEMPT_RT support Preemption must be disabled before entering a sequence counter write side critical section. Otherwise the read side section can preempt the write side section and spin for the entire scheduler tick. If that reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. Disabling preemption cannot be done for PREEMPT_RT though: it can lead to higher latencies, and the write side sections will not be able to acquire locks which become sleeping locks (e.g. spinlock_t). To remain preemptible, while avoiding a possible livelock caused by the reader preempting the writer, use a different technique: let the reader detect if a seqcount_LOCKNAME_t writer is in progress. If that's the case, acquire then release the associated LOCKNAME writer serialization lock. This will allow any possibly-preempted writer to make progress until the end of its writer serialization lock critical section. Implement this lock-unlock technique for all seqcount_LOCKNAME_t with an associated (PREEMPT_RT) sleeping lock. References: 55f3560df975 ("seqlock: Extend seqcount API with associated locks") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200519214547.352050-1-a.darwish@linutronix.de --- include/linux/seqlock.h | 61 ++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f3b78277e26b..2bc95104ab1b 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -131,7 +132,23 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * See Documentation/locking/seqlock.rst */ -#ifdef CONFIG_LOCKDEP +/* + * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot + * disable preemption. It can lead to higher latencies, and the write side + * sections will not be able to acquire locks which become sleeping locks + * (e.g. spinlock_t). + * + * To remain preemptible while avoiding a possible livelock caused by the + * reader preempting the writer, use a different technique: let the reader + * detect if a seqcount_LOCKNAME_t writer is in progress. 
If that is the + * case, acquire then release the associated LOCKNAME writer serialization + * lock. This will allow any possibly-preempted writer to make progress + * until the end of its writer serialization lock critical section. + * + * This lock-unlock technique must be implemented for all of PREEMPT_RT + * sleeping locks. See Documentation/locking/locktypes.rst + */ +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) #define __SEQ_LOCK(expr) expr #else #define __SEQ_LOCK(expr) @@ -162,10 +179,12 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type - * @preemptible: preemptibility of above lockname + * @preemptible: preemptibility of above locktype * @lockmember: argument for lockdep_assert_held() + * @lockbase: associated lock release function (prefix only) + * @lock_acquire: associated lock acquisition function (full call) */ -#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember) \ +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ typedef struct seqcount_##lockname { \ seqcount_t seqcount; \ __SEQ_LOCK(locktype *lock); \ @@ -187,13 +206,33 @@ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ - return READ_ONCE(s->seqcount.sequence); \ + unsigned seq = READ_ONCE(s->seqcount.sequence); \ + \ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ + return seq; \ + \ + if (preemptible && unlikely(seq & 1)) { \ + __SEQ_LOCK(lock_acquire); \ + __SEQ_LOCK(lockbase##_unlock(s->lock)); \ + \ + /* \ + * Re-read the sequence counter since the (possibly \ + * preempted) writer made progress. \ + */ \ + seq = READ_ONCE(s->seqcount.sequence); \ + } \ + \ + return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ - return preemptible; \ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ + return preemptible; \ + \ + /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ + return false; \ } \ \ static __always_inline void \ @@ -226,11 +265,13 @@ static inline void __seqprop_assert(const seqcount_t *s) lockdep_assert_preemption_disabled(); } -SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock) -SEQCOUNT_LOCKNAME(spinlock, spinlock_t, false, s->lock) -SEQCOUNT_LOCKNAME(rwlock, rwlock_t, false, s->lock) -SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock) -SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base) +#define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) + +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) +SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL)) /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t From 1909760f5fc3f123e47b4e24e0ccdc0fc8f3f106 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 4 Sep 2020 17:32:31 +0200 Subject: [PATCH 60/67] seqlock: PREEMPT_RT: Do not starve seqlock_t writers On PREEMPT_RT, seqlock_t is transformed to a sleeping lock that do not disable preemption. 
A seqlock_t reader can thus preempt its write side section and spin for the entire scheduler tick. If that reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. To break this livelock possibility on PREEMPT_RT, implement seqlock_t in terms of "seqcount_spinlock_t" instead of plain "seqcount_t". Besides its pure annotational value, this will leverage the existing seqcount_LOCKNAME_t PREEMPT_RT anti-livelock mechanisms, without adding any extra code. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200904153231.11994-6-a.darwish@linutronix.de --- include/linux/seqlock.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 2bc95104ab1b..f73c7eb68f27 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -790,13 +790,17 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s) * - Documentation/locking/seqlock.rst */ typedef struct { - struct seqcount seqcount; + /* + * Make sure that readers don't starve writers on PREEMPT_RT: use + * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). + */ + seqcount_spinlock_t seqcount; spinlock_t lock; } seqlock_t; #define __SEQLOCK_UNLOCKED(lockname) \ { \ - .seqcount = SEQCNT_ZERO(lockname), \ + .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } @@ -806,8 +810,8 @@ typedef struct { */ #define seqlock_init(sl) \ do { \ - seqcount_init(&(sl)->seqcount); \ spin_lock_init(&(sl)->lock); \ + seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** @@ -854,6 +858,12 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) return read_seqcount_retry(&sl->seqcount, start); } +/* + * For all seqlock_t write side functions, use write_seqcount_*t*_begin() + * instead of the generic write_seqcount_begin(). This way, no redundant + * lockdep_assert_held() checks are added.
+ */ + /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t @@ -870,7 +880,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_t_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount.seqcount); } /** @@ -882,7 +892,7 @@ static inline void write_seqlock(seqlock_t *sl) */ static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } @@ -896,7 +906,7 @@ static inline void write_sequnlock(seqlock_t *sl) static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_t_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount.seqcount); } /** @@ -909,7 +919,7 @@ static inline void write_seqlock_bh(seqlock_t *sl) */ static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } @@ -923,7 +933,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_t_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount.seqcount); } /** @@ -935,7 +945,7 @@ static inline void write_seqlock_irq(seqlock_t *sl) */ static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } @@ -944,7 +954,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_t_begin(&sl->seqcount); + write_seqcount_t_begin(&sl->seqcount.seqcount); return flags; } @@ -973,7 +983,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_t_end(&sl->seqcount); + write_seqcount_t_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } From 267580db047ef428a70bef8287ca62c5a450c139 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Tue, 15 Sep 2020 16:30:28 +0200 Subject: [PATCH 61/67] seqlock: Unbreak lockdep seqcount_LOCKNAME_init() needs to be a macro due to the lockdep annotation in seqcount_init(). Since a macro cannot define another macro, we need to effectively revert commit: e4e9ab3f9f91 ("seqlock: Fold seqcount_LOCKNAME_init() definition"). 
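The constraint is easy to model in isolation: seqcount_init() must expand a fresh static lock class key at every call site so lockdep can tell the resulting seqcounts apart, and only a macro wrapper preserves that property; an inline function generated inside SEQCOUNT_LOCKNAME() would hand every user the same key. A minimal standalone sketch, with all demo_* names purely illustrative:

#include <stdio.h>

struct demo_key { int dummy; };	/* stands in for struct lock_class_key */
struct demo_seqcount { const struct demo_key *key; unsigned sequence; };

/* Models seqcount_init(): a distinct static key per expansion site. */
#define demo_seqcount_init(s)					\
	do {							\
		static struct demo_key __key;			\
		(s)->key = &__key;				\
		(s)->sequence = 0;				\
	} while (0)

/*
 * Models seqcount_LOCKNAME_init(): it has to stay a macro as well, since a
 * C function body would expand demo_seqcount_init() exactly once and all
 * callers would then share that single key.
 */
#define demo_seqcount_locked_init(s)				\
	do {							\
		demo_seqcount_init(s);				\
		/* the __SEQ_LOCK(lock) assignment would go here */	\
	} while (0)

int main(void)
{
	struct demo_seqcount a, b;

	demo_seqcount_locked_init(&a);
	demo_seqcount_locked_init(&b);

	/* Two call sites, two distinct keys, i.e. two lockdep classes. */
	printf("distinct keys: %s\n", a.key != b.key ? "yes" : "no");
	return 0;
}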
Fixes: e4e9ab3f9f91 ("seqlock: Fold seqcount_LOCKNAME_init() definition") Reported-by: Qian Cai Debugged-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qian Cai Link: https://lkml.kernel.org/r/20200915143028.GB2674@hirez.programming.kicks-ass.net --- include/linux/seqlock.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f73c7eb68f27..76e44e6c0100 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -173,6 +173,19 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * @lock: Pointer to the associated lock */ +#define seqcount_LOCKNAME_init(s, _lock, lockname) \ + do { \ + seqcount_##lockname##_t *____s = (s); \ + seqcount_init(&____s->seqcount); \ + __SEQ_LOCK(____s->lock = (_lock)); \ + } while (0) + +#define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) +#define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) +#define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock); +#define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex); +#define seqcount_ww_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, ww_mutex); + /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t @@ -190,13 +203,6 @@ typedef struct seqcount_##lockname { \ __SEQ_LOCK(locktype *lock); \ } seqcount_##lockname##_t; \ \ -static __always_inline void \ -seqcount_##lockname##_init(seqcount_##lockname##_t *s, locktype *lock) \ -{ \ - seqcount_init(&s->seqcount); \ - __SEQ_LOCK(s->lock = lock); \ -} \ - \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ @@ -284,8 +290,8 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu __SEQ_LOCK(.lock = (assoc_lock)) \ } -#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) +#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) From 6d1823ccc480866e571ab1206665d693aeb600cf Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 17 Sep 2020 16:01:50 +0800 Subject: [PATCH 62/67] lockdep: Optimize the memory usage of circular queue Qian Cai reported a BFS_EQUEUEFULL warning [1] after read recursive deadlock detection merged into tip tree recently. Unlike the previous lockep graph searching, which iterate every lock class (every node in the graph) exactly once, the graph searching for read recurisve deadlock detection needs to iterate every lock dependency (every edge in the graph) once, as a result, the maximum memory cost of the circular queue changes from O(V), where V is the number of lock classes (nodes or vertices) in the graph, to O(E), where E is the number of lock dependencies (edges), because every lock class or dependency gets enqueued once in the BFS. Therefore we hit the BFS_EQUEUEFULL case. 
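The effect on queue occupancy can be illustrated with a small standalone sketch, run against a star-shaped dependency graph: once with every edge of a dependency list enqueued, and once with only the list head enqueued while the remaining siblings are walked in place (the remedy described in the next paragraph and implemented by the patch below). Visited marking and the match logic are omitted for brevity, and none of the names model the real lockdep structures:

#include <stdio.h>

#define NNODES	64
#define MAXQ	4096

/* Adjacency: node 0 points at every other node (one long dependency list). */
static int first_edge[NNODES];		/* first outgoing edge of a node, -1 if none */
static int edge_to[NNODES * NNODES];	/* target node of each edge */
static int edge_next[NNODES * NNODES];	/* next sibling edge in the same list, -1 at end */

static int queue[MAXQ], q_head, q_tail, q_max;

static void q_reset(void) { q_head = q_tail = q_max = 0; }

static void q_push(int e)
{
	queue[q_tail++] = e;
	if (q_tail - q_head > q_max)
		q_max = q_tail - q_head;
}

static int q_pop(void) { return q_head < q_tail ? queue[q_head++] : -1; }

static void build_star(void)
{
	int i, e = 0;

	for (i = 0; i < NNODES; i++)
		first_edge[i] = -1;
	for (i = 1; i < NNODES; i++) {
		edge_to[e] = i;
		edge_next[e] = first_edge[0];
		first_edge[0] = e;
		e++;
	}
}

/* Enqueue every edge of each expanded dependency list: occupancy ~O(E). */
static int bfs_all_edges(void)
{
	int e, f;

	q_reset();
	for (e = first_edge[0]; e != -1; e = edge_next[e])
		q_push(e);
	while ((e = q_pop()) != -1)
		for (f = first_edge[edge_to[e]]; f != -1; f = edge_next[f])
			q_push(f);
	return q_max;
}

/* Enqueue only the list head; walk the remaining siblings in place: ~O(V). */
static int bfs_first_edge_only(void)
{
	int e = -1;

	q_reset();
	if (first_edge[0] != -1)
		q_push(first_edge[0]);
	for (;;) {
		/* Prefer the sibling of the edge we just processed. */
		if (e != -1 && edge_next[e] != -1)
			e = edge_next[e];
		else if ((e = q_pop()) == -1)
			break;
		if (first_edge[edge_to[e]] != -1)
			q_push(first_edge[edge_to[e]]);
	}
	return q_max;
}

int main(void)
{
	build_star();
	printf("max queue depth, all edges enqueued: %d\n", bfs_all_edges());
	printf("max queue depth, first edge only:    %d\n", bfs_first_edge_only());
	return 0;
}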
However, actually we don't need to enqueue all dependencies for the BFS, because every time we enqueue a dependency, we almostly enqueue all other dependencies in the same dependency list ("almostly" is because we currently check before enqueue, so if a dependency doesn't pass the check stage we won't enqueue it, however, we can always do in reverse ordering), based on this, we can only enqueue the first dependency from a dependency list and every time we want to fetch a new dependency to work, we can either: 1) fetch the dependency next to the current dependency in the dependency list or 2) if the dependency in 1) doesn't exist, fetch the dependency from the queue. With this approach, the "max bfs queue depth" for a x86_64_defconfig + lockdep and selftest config kernel can get descreased from: max bfs queue depth: 201 to (after apply this patch) max bfs queue depth: 61 While I'm at it, clean up the code logic a little (e.g. directly return other than set a "ret" value and goto the "exit" label). [1]: https://lore.kernel.org/lkml/17343f6f7f2438fc376125384133c5ba70c2a681.camel@redhat.com/ Reported-by: Qian Cai Reported-by: syzbot+62ebe501c1ce9a91f68c@syzkaller.appspotmail.com Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200917080210.108095-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 101 +++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 40 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index cccf4bc759c6..9560a4e55277 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1606,6 +1606,15 @@ static inline void bfs_init_rootb(struct lock_list *lock, lock->only_xr = (hlock->read != 0); } +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + /* * Breadth-First Search to find a strong path in the dependency graph. * @@ -1639,36 +1648,25 @@ static enum bfs_result __bfs(struct lock_list *source_entry, struct lock_list **target_entry, int offset) { - struct lock_list *entry; - struct lock_list *lock; - struct list_head *head; struct circular_queue *cq = &lock_cq; - enum bfs_result ret = BFS_RNOMATCH; + struct lock_list *lock = NULL; + struct lock_list *entry; + struct list_head *head; + unsigned int cq_depth; + bool first; lockdep_assert_locked(); - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = BFS_RMATCH; - goto exit; - } - - head = get_dep_list(source_entry, offset); - if (list_empty(head)) - goto exit; - __cq_init(cq); __cq_enqueue(cq, source_entry); - while ((lock = __cq_dequeue(cq))) { - bool prev_only_xr; - - if (!lock->class) { - ret = BFS_EINVALIDNODE; - goto exit; - } + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; /* + * Step 1: check whether we already finish on this one. 
+ * * If we have visited all the dependencies from this @lock to * others (iow, if we have visited all lock_list entries in * @lock->class->locks_{after,before}) we skip, otherwise go @@ -1680,13 +1678,13 @@ static enum bfs_result __bfs(struct lock_list *source_entry, else mark_lock_accessed(lock); - head = get_dep_list(lock, offset); - - prev_only_xr = lock->only_xr; - - list_for_each_entry_rcu(entry, head, entry) { - unsigned int cq_depth; - u8 dep = entry->dep; + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; /* * Mask out all -(S*)-> if we only have *R in previous @@ -1701,26 +1699,49 @@ static enum bfs_result __bfs(struct lock_list *source_entry, continue; /* If there are only -(*R)-> left, set that for the next step */ - entry->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); + } + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + */ + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencis in the search + * + */ + first = true; + head = get_dep_list(lock, offset); + list_for_each_entry_rcu(entry, head, entry) { visit_lock_entry(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = BFS_RMATCH; - goto exit; - } - if (__cq_enqueue(cq, entry)) { - ret = BFS_EQUEUEFULL; - goto exit; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + cq_depth = __cq_get_elem_count(cq); if (max_bfs_queue_depth < cq_depth) max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } static inline enum bfs_result From 24a1877286822293684ef3f7bada4ea48a6e129e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 24 Sep 2020 17:48:51 +0200 Subject: [PATCH 63/67] locking/seqlock: Tweak DEFINE_SEQLOCK() kernel doc ctags creates a warning: |ctags: Warning: include/linux/seqlock.h:738: null expansion of name pattern "\2" The DEFINE_SEQLOCK() macro is passed to ctags and being told to expect an argument. Add a dummy argument to keep ctags quiet. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200924154851.skmswuyj322yuz4g@linutronix.de --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 76e44e6c0100..ac5b07f558b0 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -821,7 +821,7 @@ typedef struct { } while (0) /** - * DEFINE_SEQLOCK() - Define a statically allocated seqlock_t + * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ From d89d5f855f84ccf3f7e648813b4bb95c780bd7cd Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 1 Oct 2020 22:20:28 +0200 Subject: [PATCH 64/67] locking/atomics: Check atomic-arch-fallback.h too The sha1sum of include/linux/atomic-arch-fallback.h isn't checked by check-atomics.sh. 
It's not clear why it's skipped so let's check it too. Signed-off-by: Paul Bolle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Link: https://lkml.kernel.org/r/20201001202028.1048418-1-pebolle@tiscali.nl --- scripts/atomic/check-atomics.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/atomic/check-atomics.sh b/scripts/atomic/check-atomics.sh index 8378c63a1e09..82748d42ecc5 100755 --- a/scripts/atomic/check-atomics.sh +++ b/scripts/atomic/check-atomics.sh @@ -16,6 +16,7 @@ fi cat < Date: Wed, 30 Sep 2020 11:49:37 +0200 Subject: [PATCH 65/67] lockdep: Fix usage_traceoverflow Basically print_lock_class_header()'s for loop is out of sync with the the size of of ->usage_traces[]. Also clean things up a bit while at it, to avoid such mishaps in the future. Fixes: 23870f122768 ("locking/lockdep: Fix "USED" <- "IN-NMI" inversions") Reported-by: Qian Cai Debugged-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Qian Cai Link: https://lkml.kernel.org/r/20200930094937.GE2651@hirez.programming.kicks-ass.net --- include/linux/lockdep_types.h | 8 +++++-- kernel/locking/lockdep.c | 34 ++++++++++++++---------------- kernel/locking/lockdep_internals.h | 7 ++++-- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index bb35b449f533..9a1fd49df17f 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -35,8 +35,12 @@ enum lockdep_wait_type { /* * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( + * + * XXX_LOCK_USAGE_STATES is the number of lines in lockdep_states.h, for each + * of those we generates 4 states, Additionally we report on USED and USED_READ. */ -#define XXX_LOCK_USAGE_STATES (1+2*4) +#define XXX_LOCK_USAGE_STATES 2 +#define LOCK_TRACE_STATES (XXX_LOCK_USAGE_STATES*4 + 2) /* * NR_LOCKDEP_CACHING_CLASSES ... 
Number of classes @@ -106,7 +110,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; + const struct lock_trace *usage_traces[LOCK_TRACE_STATES]; /* * Generation counter, when doing certain classes of graph walking, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2facbbd146ec..a430fbb01eb1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -585,6 +585,8 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -1939,7 +1941,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) #endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; @@ -3969,7 +3971,7 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int old_mask, new_mask, ret = 1; + unsigned int new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); @@ -3996,30 +3998,26 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (unlikely(hlock_class(this)->usage_mask & new_mask)) goto unlock; - old_mask = hlock_class(this)->usage_mask; hlock_class(this)->usage_mask |= new_mask; - /* - * Save one usage_traces[] entry and map both LOCK_USED and - * LOCK_USED_READ onto the same entry. - */ - if (new_bit == LOCK_USED || new_bit == LOCK_USED_READ) { - if (old_mask & (LOCKF_USED | LOCKF_USED_READ)) - goto unlock; - new_bit = LOCK_USED; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; } - if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) - return 0; - switch (new_bit) { - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: + case 0 ... 
LOCK_USED-1: ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; + break; + + case LOCK_USED: + debug_atomic_dec(nr_unused_locks); + break; + + default: + break; } unlock: diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index b0be1560ed17..de49f9e1c11b 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -20,9 +20,12 @@ enum lock_usage_bit { #undef LOCKDEP_STATE LOCK_USED, LOCK_USED_READ, - LOCK_USAGE_STATES + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + #define LOCK_USAGE_READ_MASK 1 #define LOCK_USAGE_DIR_MASK 2 #define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) @@ -121,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); From 4d004099a668c41522242aa146a38cc4eb59cb1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Oct 2020 11:04:21 +0200 Subject: [PATCH 66/67] lockdep: Fix lockdep recursion Steve reported that lockdep_assert*irq*(), when nested inside lockdep itself, will trigger a false-positive. One example is the stack-trace code, as called from inside lockdep, triggering tracing, which in turn calls RCU, which then uses lockdep_assert_irqs_disabled(). Fixes: a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 13 ++++-- kernel/locking/lockdep.c | 99 ++++++++++++++++++++++++---------------- 2 files changed, 67 insertions(+), 45 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6a584b3e5c74..b1227be47496 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -534,6 +534,7 @@ do { \ DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); +DECLARE_PER_CPU(unsigned int, lockdep_recursion); /* * The below lockdep_assert_*() macros use raw_cpu_read() to access the above @@ -543,25 +544,27 @@ DECLARE_PER_CPU(int, hardirq_context); * read the value from our previous CPU. 
*/ +#define __lockdep_enabled (debug_locks && !raw_cpu_read(lockdep_recursion)) + #define lockdep_assert_irqs_enabled() \ do { \ - WARN_ON_ONCE(debug_locks && !raw_cpu_read(hardirqs_enabled)); \ + WARN_ON_ONCE(__lockdep_enabled && !raw_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ - WARN_ON_ONCE(debug_locks && raw_cpu_read(hardirqs_enabled)); \ + WARN_ON_ONCE(__lockdep_enabled && raw_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ - WARN_ON_ONCE(debug_locks && !raw_cpu_read(hardirq_context)); \ + WARN_ON_ONCE(__lockdep_enabled && !raw_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ - debug_locks && \ + __lockdep_enabled && \ (preempt_count() != 0 || \ !raw_cpu_read(hardirqs_enabled))); \ } while (0) @@ -569,7 +572,7 @@ do { \ #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ - debug_locks && \ + __lockdep_enabled && \ (preempt_count() == 0 && \ raw_cpu_read(hardirqs_enabled))); \ } while (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a430fbb01eb1..85d15f0362dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (raw_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. 
@@ -93,7 +110,7 @@ static inline void lockdep_lock(void) arch_spin_lock(&__lock); __owner = current; - current->lockdep_recursion++; + __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) @@ -101,7 +118,7 @@ static inline void lockdep_unlock(void) if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - current->lockdep_recursion--; + __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); } @@ -393,10 +410,15 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +static __always_inline void lockdep_recursion_inc(void) +{ + __this_cpu_inc(lockdep_recursion); +} + static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) - current->lockdep_recursion = 0; + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); } void lockdep_set_selftest_task(struct task_struct *task) @@ -3659,7 +3681,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -3695,7 +3717,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) current->hardirq_chain_key = current->curr_chain_key; - current->lockdep_recursion++; + lockdep_recursion_inc(); __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } @@ -3728,7 +3750,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -3781,7 +3803,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (in_nmi()) { if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) return; - } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -3814,7 +3836,7 @@ void lockdep_softirqs_on(unsigned long ip) { struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3829,7 +3851,7 @@ void lockdep_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion++; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ @@ -3852,7 +3874,7 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -4233,11 +4255,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -4920,11 +4942,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); @@ -4937,11 +4959,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long 
ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); @@ -4979,7 +5001,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock static bool lockdep_nmi(void) { - if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + if (raw_cpu_read(lockdep_recursion)) return false; if (!in_nmi()) @@ -5000,7 +5022,10 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) { + if (!debug_locks) + return; + + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { struct held_lock hlock; @@ -5023,7 +5048,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5037,13 +5062,13 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) trace_lock_release(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5056,13 +5081,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) unsigned long flags; int ret = 0; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5077,13 +5102,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5096,13 +5121,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5113,13 +5138,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5249,15 +5274,12 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if 
(unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5270,15 +5292,12 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) trace_lock_contended(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquired(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); From baffd723e44dc3d7f84f0b8f1fe1ece00ddd2710 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Oct 2020 09:56:57 +0200 Subject: [PATCH 67/67] lockdep: Revert "lockdep: Use raw_cpu_*() for per-cpu variables" The thinking in commit: fddf9055a60d ("lockdep: Use raw_cpu_*() for per-cpu variables") is flawed. While it is true that when we're migratable both CPUs will have a 0 value, it doesn't hold that when we do get migrated in the middle of a raw_cpu_op(), the old CPU will still have 0 by the time we get around to reading it on the new CPU. Luckily, the reason for that commit (s390 using preempt_disable() instead of preempt_disable_notrace() in their percpu code), has since been fixed by commit: 1196f12a2c96 ("s390: don't trace preemption in percpu macros") An audit of arch/*/include/asm/percpu*.h shows there are no other architectures affected by this particular issue. Fixes: fddf9055a60d ("lockdep: Use raw_cpu_*() for per-cpu variables") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20201005095958.GJ2651@hirez.programming.kicks-ass.net --- include/linux/lockdep.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b1227be47496..1130f271de66 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -512,19 +512,19 @@ static inline void print_irqtrace_events(struct task_struct *curr) #define lock_map_release(l) lock_release(l, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING -# define might_lock(lock) \ +# define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) -# define might_lock_read(lock) \ +# define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) -# define might_lock_nested(lock, subclass) \ +# define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ @@ -536,29 +536,21 @@ DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); -/* - * The below lockdep_assert_*() macros use raw_cpu_read() to access the above - * per-cpu variables. This is required because this_cpu_read() will potentially - * call into preempt/irq-disable and that obviously isn't right. This is also - * correct because when IRQs are enabled, it doesn't matter if we accidentally - * read the value from our previous CPU. 
- */ - -#define __lockdep_enabled (debug_locks && !raw_cpu_read(lockdep_recursion)) +#define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ - WARN_ON_ONCE(__lockdep_enabled && !raw_cpu_read(hardirqs_enabled)); \ + WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ - WARN_ON_ONCE(__lockdep_enabled && raw_cpu_read(hardirqs_enabled)); \ + WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ - WARN_ON_ONCE(__lockdep_enabled && !raw_cpu_read(hardirq_context)); \ + WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_preemption_enabled() \ @@ -566,7 +558,7 @@ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ - !raw_cpu_read(hardirqs_enabled))); \ + !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ @@ -574,7 +566,7 @@ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ - raw_cpu_read(hardirqs_enabled))); \ + this_cpu_read(hardirqs_enabled))); \ } while (0) #else