diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..2cf7bbcaed4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -82,7 +82,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi = ATOMIC_INIT(0); +static atomic_t nmi_running = ATOMIC_INIT(0); static int mod_code_status; /* holds return value of text write */ static int mod_code_write; /* set when NMI should do the write */ static void *mod_code_ip; /* holds the IP to write to */ @@ -115,8 +115,8 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { - atomic_inc(&in_nmi); - /* Must have in_nmi seen before reading write flag */ + atomic_inc(&nmi_running); + /* Must have nmi_running seen before reading write flag */ smp_mb(); if (mod_code_write) { ftrace_mod_code(); @@ -126,19 +126,19 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { - /* Finish all executions before clearing in_nmi */ + /* Finish all executions before clearing nmi_running */ smp_wmb(); - atomic_dec(&in_nmi); + atomic_dec(&nmi_running); } static void wait_for_nmi(void) { - if (!atomic_read(&in_nmi)) + if (!atomic_read(&nmi_running)) return; do { cpu_relax(); - } while(atomic_read(&in_nmi)); + } while (atomic_read(&nmi_running)); nmi_wait_count++; } @@ -367,25 +367,6 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, old_offset, new_offset); } -#else /* CONFIG_DYNAMIC_FTRACE */ - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t in_nmi; - -void ftrace_nmi_enter(void) -{ - atomic_inc(&in_nmi); -} - -void ftrace_nmi_exit(void) -{ - atomic_dec(&in_nmi); -} - #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ @@ -475,7 +456,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (unlikely(atomic_read(&in_nmi))) + if (unlikely(in_nmi())) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7840e718c6c7..5e302d636fc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** - * ftrace_make_nop - convert code into top + * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization * @rec: the mcount call site record * @addr: the address that the call site should be calling diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..f3cf86e1465b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,12 @@ #error PREEMPT_ACTIVE is too low! #endif +#define NMI_OFFSET (PREEMPT_ACTIVE << 1) + +#if NMI_OFFSET >= 0x80000000 +#error PREEMPT_ACTIVE too high! +#endif + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) @@ -73,6 +79,11 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_OFFSET) + #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() # define PREEMPT_CHECK_OFFSET 1 @@ -167,6 +178,8 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET); \ lockdep_off(); \ rcu_nmi_enter(); \ __irq_enter(); \ @@ -177,6 +190,8 @@ extern void irq_exit(void); __irq_exit(); \ rcu_nmi_exit(); \ lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET); \ ftrace_nmi_exit(); \ } while (0) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3110d92e7d81..3c103d636da3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,9 +121,18 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +/* + * The below functions are fine to use outside the tracing facility. + */ +#ifdef CONFIG_RING_BUFFER void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +#endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780b..1796e018fbff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aee76b3eeed2..53ba3a6d16d0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,9 +4,11 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include +#include #include #include #include @@ -982,6 +984,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +998,30 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1123,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ef4dbac95568..03fbd4c20bc2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1519,7 +1519,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f2742fb1575a..b9838f4a6929 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,7 +45,6 @@ enum trace_type { */ struct trace_entry { unsigned char type; - unsigned char cpu; unsigned char flags; unsigned char preempt_count; int pid; @@ -625,12 +624,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. * - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -651,7 +650,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca4bbcfb9e2c..e3e7db61c067 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -158,7 +158,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", entry->cpu) && + if (trace_seq_printf(seq, "%4d ", iter->cpu) && seq_print_ip_sym(seq, it->to, symflags) && trace_seq_printf(seq, "\t <- ") && seq_print_ip_sym(seq, it->from, symflags) && @@ -193,7 +193,8 @@ void trace_hw_branch(u64 from, u64 to) if (!event) goto out; entry = ring_buffer_event_data(event); - entry->ent.cpu = cpu; + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; trace_buffer_unlock_commit(tr, event, 0, 0); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b6e99af79214..9fc815031b09 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter) unsigned long secs = (unsigned long)t; return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, entry->cpu, secs, usec_rem); + comm, entry->pid, iter->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) @@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter) char *comm = trace_find_cmdline(entry->pid); ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, entry->cpu, entry->flags, + entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, ns2usecs(iter->ts), abs_usecs / USEC_PER_MSEC, @@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs / USEC_PER_MSEC, rel_usecs % USEC_PER_MSEC); } else { - ret = lat_print_generic(s, entry, entry->cpu); + ret = lat_print_generic(s, entry, iter->cpu); if (ret) ret = lat_print_timestamp(s, abs_usecs, rel_usecs); }