ftrace: Optimize the function tracer list loop

There are lots of places that perform:

       op = rcu_dereference_raw(ftrace_control_list);
       while (op != &ftrace_list_end) {

Add a helper macro to do this, and also optimize for a single
entity. That is, gcc will optimize a loop for either no iterations
or more than one iteration. But usually only a single callback
is registered to the function tracer, thus the optimized case
should be a single pass. To do this we now do:

	op = rcu_dereference_raw(list);
	do {
		[...]
	} while (likely(op = rcu_dereference_raw((op)->next)) &&
	       unlikely((op) != &ftrace_list_end));

An op is always registered (ftrace_list_end when no callbacks are
registered), thus when a single callback is registered, the linked
list looks like:

 top => callback => ftrace_list_end => NULL.

The likely(op = op->next) still must be performed due to the race
of removing the callback, where the first op assignment could
equal ftrace_list_end. In that case, the op->next would be NULL.
But this is unlikely (only happens in a race condition when
removing the callback).

But it is very likely that the next op would be ftrace_list_end,
unless more than one callback has been registered. This tells
gcc what the most common case is and makes the fast path with
the least amount of branches.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
This commit is contained in:
Steven Rostedt 2012-11-02 17:03:03 -04:00 committed by Steven Rostedt
parent 9640388b63
commit 0a016409e4
1 changed files with 26 additions and 22 deletions

View File

@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
#endif #endif
/*
* do_for_each_ftrace_op - open a loop over a list of ftrace_ops
* @op: lvalue that is assigned each entry in turn
* @list: head pointer of the list to walk
*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
* can use rcu_dereference_raw() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
* mechanism. The rcu_dereference_raw() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*
* This macro expands to an assignment of the list head to @op followed by
* a bare "do", so it is deliberately unbalanced: the caller supplies the
* "{ ... }" body and must close it with while_for_each_ftrace_op(op).
* Because it is a do/while loop, the body runs once for the first entry
* before any end-of-list test is made.
*
* NOTE(review): the paragraph above names ftrace_global_list, but the macro
* walks whichever list is passed in @list - confirm that the removal
* reasoning holds for every list this is used on.
*/
#define do_for_each_ftrace_op(op, list) \
op = rcu_dereference_raw(list); \
do
/*
* while_for_each_ftrace_op - close a loop opened by do_for_each_ftrace_op()
* @op: the same iterator that was passed to do_for_each_ftrace_op()
*
* Optimized for just a single item in the list (as that is the normal case).
*
* The condition first advances @op to (op)->next and then keeps looping
* only while the new @op is both non-NULL and not &ftrace_list_end.
* The NULL test comes first and is marked likely(); the "not yet at
* ftrace_list_end" test is marked unlikely(), i.e. the expected path is
* that the loop stops after a single pass.
*
* This expands to a bare "while (...)" with no trailing semicolon; the
* caller writes "} while_for_each_ftrace_op(op);".
*/
#define while_for_each_ftrace_op(op) \
while (likely(op = rcu_dereference_raw((op)->next)) && \
unlikely((op) != &ftrace_list_end))
/** /**
* ftrace_nr_registered_ops - return number of ops registered * ftrace_nr_registered_ops - return number of ops registered
* *
@ -132,15 +152,6 @@ int ftrace_nr_registered_ops(void)
return cnt; return cnt;
} }
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
* can use rcu_dereference_raw() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
* mechanism. The rcu_dereference_raw() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
static void static void
ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs) struct ftrace_ops *op, struct pt_regs *regs)
@ -149,11 +160,9 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
return; return;
trace_recursion_set(TRACE_GLOBAL_BIT); trace_recursion_set(TRACE_GLOBAL_BIT);
op = rcu_dereference_raw(ftrace_global_list); /*see above*/ do_for_each_ftrace_op(op, ftrace_global_list) {
while (op != &ftrace_list_end) {
op->func(ip, parent_ip, op, regs); op->func(ip, parent_ip, op, regs);
op = rcu_dereference_raw(op->next); /*see above*/ } while_for_each_ftrace_op(op);
};
trace_recursion_clear(TRACE_GLOBAL_BIT); trace_recursion_clear(TRACE_GLOBAL_BIT);
} }
@ -4104,14 +4113,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
*/ */
preempt_disable_notrace(); preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT); trace_recursion_set(TRACE_CONTROL_BIT);
op = rcu_dereference_raw(ftrace_control_list); do_for_each_ftrace_op(op, ftrace_control_list) {
while (op != &ftrace_list_end) {
if (!ftrace_function_local_disabled(op) && if (!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip)) ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs); op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
op = rcu_dereference_raw(op->next);
};
trace_recursion_clear(TRACE_CONTROL_BIT); trace_recursion_clear(TRACE_CONTROL_BIT);
preempt_enable_notrace(); preempt_enable_notrace();
} }
@ -4139,12 +4145,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched(). * they must be freed after a synchronize_sched().
*/ */
preempt_disable_notrace(); preempt_disable_notrace();
op = rcu_dereference_raw(ftrace_ops_list); do_for_each_ftrace_op(op, ftrace_ops_list) {
while (op != &ftrace_list_end) {
if (ftrace_ops_test(op, ip)) if (ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs); op->func(ip, parent_ip, op, regs);
op = rcu_dereference_raw(op->next); } while_for_each_ftrace_op(op);
};
preempt_enable_notrace(); preempt_enable_notrace();
trace_recursion_clear(TRACE_INTERNAL_BIT); trace_recursion_clear(TRACE_INTERNAL_BIT);
} }