From 364f6afc4f5537b79cf454eb35cae92920676075 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:40 -0700 Subject: [PATCH 01/10] locking/lockdep: Make it clear that what lock_class::key points at is not modified This patch does not change the behavior of the lockdep code. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-2-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_internals.h | 3 ++- kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0b0d7259276d..cdb3c2f06092 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -97,7 +97,7 @@ struct lock_class { */ struct list_head locks_after, locks_before; - struct lockdep_subclass_key *key; + const struct lockdep_subclass_key *key; unsigned int subclass; unsigned int dep_gen_id; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..af6627866191 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -511,7 +511,7 @@ static const char *usage_str[] = }; #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..2e518369add4 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -116,7 +116,8 @@ extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index bda006f8a88b..ed9842425cac 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -399,7 +399,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; From a2970421640bd9b6a78f2685d7750a791abdfd4e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:41 -0700 Subject: [PATCH 02/10] stacktrace: Constify 'entries' arguments Make it clear to humans and to the compiler that the stack trace ('entries') arguments are not modified. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-3-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/stacktrace.h | 4 ++-- kernel/stacktrace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index f0cfd12cb45e..83bd8cb475d7 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -9,9 +9,9 @@ struct task_struct; struct pt_regs; #ifdef CONFIG_STACKTRACE -void stack_trace_print(unsigned long *trace, unsigned int nr_entries, +void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, int spaces); -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces); unsigned int stack_trace_save(unsigned long *store, unsigned int size, unsigned int skipnr); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@ * @nr_entries: Number of entries in the storage array * @spaces: Number of leading spaces to print */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print); * * Return: Number of bytes printed. */ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int generated, i, total = 0; From 12593b7467f9130b64a6d4b6a26ed4ec217b6784 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:42 -0700 Subject: [PATCH 03/10] locking/lockdep: Reduce space occupied by stack traces Although commit 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") unregisters dynamic lockdep keys when a workqueue is destroyed, a side effect of that commit is that all stack traces associated with the lockdep key are leaked when a workqueue is destroyed. Fix this by storing each unique stack trace once. Other changes in this patch are: - Use NULL instead of { .nr_entries = 0 } to represent 'no trace'. - Store a pointer to a stack trace in struct lock_class and struct lock_list instead of storing 'nr_entries' and 'offset'. This patch avoids that the following program triggers the "BUG: MAX_STACK_TRACE_ENTRIES too low!" complaint: #include #include int main() { for (;;) { int fd = open("/dev/infiniband/rdma_cm", O_RDWR); close(fd); } } Suggested-by: Peter Zijlstra Reported-by: Eric Biggers Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Link: https://lkml.kernel.org/r/20190722182443.216015-4-bvanassche@acm.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 9 +- kernel/locking/lockdep.c | 128 ++++++++++++++++++++--------- kernel/locking/lockdep_internals.h | 2 + 3 files changed, 95 insertions(+), 44 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index cdb3c2f06092..b8a835fd611b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,10 +66,7 @@ struct lock_class_key { extern struct lock_class_key __lockdep_no_validate__; -struct lock_trace { - unsigned int nr_entries; - unsigned int offset; -}; +struct lock_trace; #define LOCKSTAT_POINTS 4 @@ -105,7 +102,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct lock_trace usage_traces[XXX_LOCK_USAGE_STATES]; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; /* * Generation counter, when doing certain classes of graph walking, @@ -193,7 +190,7 @@ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; - struct lock_trace trace; + const struct lock_trace *trace; int distance; /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index af6627866191..1a96869cb2f0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,72 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; #ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. */ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; -static int save_trace(struct lock_trace *trace) +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} + +static struct lock_trace *save_trace(void) +{ + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); + + return trace; } #endif @@ -1235,7 +1274,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct lock_trace *trace) + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -1249,7 +1288,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1509,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) { - unsigned long *entries = stack_trace + trace->offset; - - stack_trace_print(entries, trace->nr_entries, spaces); + stack_trace_print(trace->entries, trace->nr_entries, spaces); } /* @@ -1489,7 +1527,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_lock_trace(&target->trace, 6); + print_lock_trace(target->trace, 6); } static void @@ -1592,7 +1630,8 @@ static noinline void print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; - if (!save_trace(&this->trace)) + this->trace = save_trace(); + if (!this->trace) return; depth = get_lock_depth(target); @@ -1715,7 +1754,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry, */ static noinline int check_noncircular(struct held_lock *src, struct held_lock *target, - struct lock_trace *trace) + struct lock_trace **const trace) { int ret; struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1768,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); if (unlikely(!ret)) { - if (!trace->nr_entries) { + if (!*trace) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it * should not do bad things. */ - save_trace(trace); + *trace = save_trace(); } print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1898,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_lock_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); @@ -1884,7 +1923,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_lock_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1995,14 +2034,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2050,15 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) + prev_root->trace = save_trace(); + if (!prev_root->trace) return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) + next_root->trace = save_trace(); + if (!next_root->trace) return; print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2410,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct lock_trace *trace) + struct held_lock *next, int distance, + struct lock_trace **const trace) { struct lock_list *entry; int ret; @@ -2444,8 +2486,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return ret; #endif - if (!trace->nr_entries && !save_trace(trace)) - return 0; + if (!*trace) { + *trace = save_trace(); + if (!*trace) + return 0; + } /* * Ok, all validations passed, add the new lock @@ -2453,14 +2498,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; @@ -2476,7 +2521,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { - struct lock_trace trace = { .nr_entries = 0 }; + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; struct held_lock *hlock; @@ -3015,7 +3060,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3141,8 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) + root->trace = save_trace(); + if (!root->trace) return; print_shortest_lock_dependencies(other, root); @@ -3580,7 +3626,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; switch (new_bit) { @@ -5157,6 +5203,12 @@ void __init lockdep_init(void) ) / 1024 ); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + printk(" memory used for stack traces: %zu kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + printk(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2e518369add4..93a008bf77db 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else #define MAX_LOCKDEP_ENTRIES 32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#define STACK_TRACE_HASH_SIZE 16384 #endif #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) From 8c779229d0f4fe83ead90bdcbbf08b02989aa200 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:43 -0700 Subject: [PATCH 04/10] locking/lockdep: Report more stack trace statistics Report the number of stack traces and the number of stack trace hash chains. These two numbers are useful because these allow to estimate the number of stack trace hash collisions. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-5-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 29 +++++++++++++++++++++++++++++ kernel/locking/lockdep_internals.h | 4 ++++ kernel/locking/lockdep_proc.c | 6 ++++++ 3 files changed, 39 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1a96869cb2f0..3c3902c40a0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -516,6 +516,35 @@ static struct lock_trace *save_trace(void) return trace; } + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ + struct lock_trace *trace; + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { + hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { + c++; + } + } + + return c; +} + +/* Return the number of stack hash chains that have at least one stack trace. */ +u64 lockdep_stack_hash_count(void) +{ + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) + if (!hlist_empty(&stack_trace_hash[i])) + c++; + + return c; +} #endif unsigned int nr_hardirq_chains; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 93a008bf77db..18d85aebbb57 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -140,6 +140,10 @@ extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif #else static inline unsigned long lockdep_count_forward_deps(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ed9842425cac..dadb7b7fba37 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -285,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_process_chains); seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + seq_printf(m, " number of stack traces: %llu\n", + lockdep_stack_trace_count()); + seq_printf(m, " number of stack hash chains: %llu\n", + lockdep_stack_hash_count()); +#endif seq_printf(m, " combined max dependencies: %11u\n", (nr_hardirq_chains + 1) * (nr_softirq_chains + 1) * From 91d2a812dfb98b3b4dad661529c33bc38d303461 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 25 Jun 2019 10:39:13 -0400 Subject: [PATCH 05/10] locking/rwsem: Make handoff writer optimistically spin on owner When the handoff bit is set by a writer, no other tasks other than the setting writer itself is allowed to acquire the lock. If the to-be-handoff'ed writer goes to sleep, there will be a wakeup latency period where the lock is free, but no one can acquire it. That is less than ideal. To reduce that latency, the handoff writer will now optimistically spin on the owner if it happens to be a on-cpu writer. It will spin until it releases the lock and the to-be-handoff'ed writer can then acquire the lock immediately without any delay. Of course, if the owner is not a on-cpu writer, the to-be-handoff'ed writer will have to sleep anyway. The optimistic spinning code is also modified to not stop spinning when the handoff bit is set. This will prevent an occasional setting of handoff bit from causing a bunch of optimistic spinners from entering into the wait queue causing significant reduction in throughput. On a 1-socket 22-core 44-thread Skylake system, the AIM7 shared_memory workload was run with 7000 users. The throughput (jobs/min) of the following kernels were as follows: 1) 5.2-rc6 - 8,092,486 2) 5.2-rc6 + tip's rwsem patches - 7,567,568 3) 5.2-rc6 + tip's rwsem patches + this patch - 7,954,545 Using perf-record(1), the %cpu time used by rwsem_down_write_slowpath(), rwsem_down_write_failed() and their callees for the 3 kernels were 1.70%, 5.46% and 2.08% respectively. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: x86@kernel.org Cc: Ingo Molnar Cc: Will Deacon Cc: huang ying Cc: Tim Chen Cc: Linus Torvalds Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Davidlohr Bueso Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20190625143913.24154-1-longman@redhat.com --- kernel/locking/rwsem.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bd0f0d05724c..354238a08b7a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -724,11 +724,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) rcu_read_lock(); for (;;) { - if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { - state = OWNER_NONSPINNABLE; - break; - } - + /* + * When a waiting writer set the handoff flag, it may spin + * on the owner as well. Once that writer acquires the lock, + * we can spin on it. So we don't need to quit even when the + * handoff bit is set. + */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { state = rwsem_owner_state(new, new_flags, nonspinnable); @@ -974,6 +975,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, { return false; } + +static inline int +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + return 0; +} +#define OWNER_NULL 1 #endif /* @@ -1206,6 +1214,18 @@ wait: raw_spin_unlock_irq(&sem->wait_lock); + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock + * transfer. If the previous owner is a on-cpu writer and it + * has just released the lock, OWNER_NULL will be returned. + * In this case, we attempt to acquire the lock again + * without sleeping. + */ + if ((wstate == WRITER_HANDOFF) && + (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + goto trylock_again; + /* Block until there are no active lockers. */ for (;;) { if (signal_pending_state(state, current)) @@ -1240,7 +1260,7 @@ wait: break; } } - +trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); From fce45cd41101f1a9620267146b21f09b3454d8db Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 28 Jul 2019 21:47:35 -0700 Subject: [PATCH 06/10] locking/rwsem: Check for operations on an uninitialized rwsem Currently rwsems is the only locking primitive that lacks this debug feature. Add it under CONFIG_DEBUG_RWSEMS and do the magic checking in the locking fastpath (trylock) operation such that we cover all cases. The unlocking part is pretty straightforward. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: mingo@kernel.org Cc: Davidlohr Bueso Link: https://lkml.kernel.org/r/20190729044735.9632-1-dave@stgolabs.net --- include/linux/rwsem.h | 10 ++++++++++ kernel/locking/rwsem.c | 22 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9d9c663987d8..00d6054687dd 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -45,6 +45,9 @@ struct rw_semaphore { #endif raw_spinlock_t wait_lock; struct list_head wait_list; +#ifdef CONFIG_DEBUG_RWSEMS + void *magic; +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -73,6 +76,12 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif +#ifdef CONFIG_DEBUG_RWSEMS +# define __DEBUG_RWSEM_INITIALIZER(lockname) , .magic = &lockname +#else +# define __DEBUG_RWSEM_INITIALIZER(lockname) +#endif + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED #else @@ -85,6 +94,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ + __DEBUG_RWSEM_INITIALIZER(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 354238a08b7a..eef04551eae7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,8 +105,9 @@ #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ + (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ @@ -329,6 +330,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +#ifdef CONFIG_DEBUG_RWSEMS + sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); @@ -1358,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + /* * Optimize for the case when the rwsem is not locked at all. */ - long tmp = RWSEM_UNLOCKED_VALUE; - + tmp = RWSEM_UNLOCKED_VALUE; do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { @@ -1403,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem) static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; + long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + + tmp = RWSEM_UNLOCKED_VALUE; if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); @@ -1420,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1438,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); /* * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) From 24a376d65177009a4dd8d846543c5dc69f5c4ced Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Aug 2019 15:30:28 +0200 Subject: [PATCH 07/10] locking/qspinlock,x86: Clarify virt_spin_lock_key Add a few comments to clarify how this is supposed to work. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Juergen Gross --- arch/x86/include/asm/qspinlock.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index bd5ac6cc37db..444d6fd9a6d8 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -63,10 +63,25 @@ static inline bool vcpu_is_preempted(long cpu) #endif #ifdef CONFIG_PARAVIRT +/* + * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * + * Native (and PV wanting native due to vCPU pinning) should disable this key. + * It is done in this backwards fashion to only have a single direction change, + * which removes ordering between native_pv_spin_init() and HV setup. + */ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); void native_pv_lock_init(void) __init; +/* + * Shortcut for the queued_spin_lock_slowpath() function that allows + * virt to hijack it. + * + * Returns: + * true - lock has been negotiated, all done; + * false - queued_spin_lock_slowpath() will do its thing. + */ #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { From 5f35d5a66b3ec62cb5ec4ec2ad9aebe2ac325673 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Wed, 31 Jul 2019 20:35:03 +0530 Subject: [PATCH 08/10] locking/mutex: Make __mutex_owner static to mutex.c __mutex_owner() should only be used by the mutex api's. So, to put this restiction let's move the __mutex_owner() function definition from linux/mutex.h to mutex.c file. There exist functions that uses __mutex_owner() like mutex_is_locked() and mutex_trylock_recursive(), So to keep legacy thing intact move them as well and export them. Move mutex_waiter structure also to keep it private to the file. Signed-off-by: Mukesh Ojha Signed-off-by: Peter Zijlstra (Intel) Cc: mingo@redhat.com Cc: will@kernel.org Link: https://lkml.kernel.org/r/1564585504-3543-1-git-send-email-mojha@codeaurora.org --- include/linux/mutex.h | 38 +++----------------------------------- kernel/locking/mutex.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.h | 2 ++ 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index dcd03fee6e01..eb8c62aba263 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -65,29 +65,6 @@ struct mutex { #endif }; -/* - * Internal helper function; C doesn't allow us to hide it :/ - * - * DO NOT USE (outside of mutex code). - */ -static inline struct task_struct *__mutex_owner(struct mutex *lock) -{ - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); -} - -/* - * This is the control structure for tasks blocked on mutex, - * which resides on the blocked task's kernel stack: - */ -struct mutex_waiter { - struct list_head list; - struct task_struct *task; - struct ww_acquire_ctx *ww_ctx; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -}; - #ifdef CONFIG_DEBUG_MUTEXES #define __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -144,10 +121,7 @@ extern void __mutex_init(struct mutex *lock, const char *name, * * Returns true if the mutex is locked, false if unlocked. */ -static inline bool mutex_is_locked(struct mutex *lock) -{ - return __mutex_owner(lock) != NULL; -} +extern bool mutex_is_locked(struct mutex *lock); /* * See kernel/locking/mutex.c for detailed documentation of these APIs. @@ -220,13 +194,7 @@ enum mutex_trylock_recursive_enum { * - MUTEX_TRYLOCK_SUCCESS - lock acquired, * - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. */ -static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum -mutex_trylock_recursive(struct mutex *lock) -{ - if (unlikely(__mutex_owner(lock) == current)) - return MUTEX_TRYLOCK_RECURSIVE; - - return mutex_trylock(lock); -} +extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock); #endif /* __LINUX_MUTEX_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e069734363c..ac4929f1e085 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -36,6 +36,19 @@ # include "mutex.h" #endif +/* + * This is the control structure for tasks blocked on mutex, + * which resides on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -65,11 +78,37 @@ EXPORT_SYMBOL(__mutex_init); #define MUTEX_FLAGS 0x07 +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex code). + */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); +} + static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); } +bool mutex_is_locked(struct mutex *lock) +{ + return __mutex_owner(lock) != NULL; +} +EXPORT_SYMBOL(mutex_is_locked); + +__must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ + if (unlikely(__mutex_owner(lock) == current)) + return MUTEX_TRYLOCK_RECURSIVE; + + return mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock_recursive); + static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 1c2287d3fa71..7cde5c6d414e 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -19,6 +19,8 @@ #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) +struct mutex_waiter; + static inline void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { From a037d269221c0ae15f47046757afcbd1a7177bbf Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Wed, 31 Jul 2019 20:35:04 +0530 Subject: [PATCH 09/10] locking/mutex: Use mutex flags macro instead of hard code Use the mutex flag macro instead of hard code value inside __mutex_owner(). Signed-off-by: Mukesh Ojha Signed-off-by: Peter Zijlstra (Intel) Cc: mingo@redhat.com Cc: will@kernel.org Link: https://lkml.kernel.org/r/1564585504-3543-2-git-send-email-mojha@codeaurora.org --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ac4929f1e085..b4bcb0236d7a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(__mutex_init); */ static inline struct task_struct *__mutex_owner(struct mutex *lock) { - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } static inline struct task_struct *__owner_task(unsigned long owner) From e57d143091f1c0b1a98140a4d2e63e113afb62c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Aug 2019 08:47:14 +0200 Subject: [PATCH 10/10] mutex: Fix up mutex_waiter usage The patch moving bits into mutex.c was a little too much; by also moving struct mutex_waiter a few less common CONFIGs would no longer build. Fixes: 5f35d5a66b3e ("locking/mutex: Make __mutex_owner static to mutex.c") Signed-off-by: Peter Zijlstra (Intel) --- include/linux/mutex.h | 13 +++++++++++++ kernel/locking/mutex.c | 13 ------------- kernel/locking/mutex.h | 2 -- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index eb8c62aba263..aca8f36dfac9 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -65,6 +65,19 @@ struct mutex { #endif }; +/* + * This is the control structure for tasks blocked on mutex, + * which resides on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; + #ifdef CONFIG_DEBUG_MUTEXES #define __DEBUG_MUTEX_INITIALIZER(lockname) \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b4bcb0236d7a..468a9b8422e3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -36,19 +36,6 @@ # include "mutex.h" #endif -/* - * This is the control structure for tasks blocked on mutex, - * which resides on the blocked task's kernel stack: - */ -struct mutex_waiter { - struct list_head list; - struct task_struct *task; - struct ww_acquire_ctx *ww_ctx; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -}; - void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 7cde5c6d414e..1c2287d3fa71 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -19,8 +19,6 @@ #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) -struct mutex_waiter; - static inline void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) {