sched/fair: Remove double_lock_balance() from load_balance()
Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for load_balance(). The advantage is (obviously) not holding two rq->lock's at the same time and thereby increasing parallelism. Further note that if there was no task to migrate we will not have acquired the second rq->lock at all. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. Signed-off-by: Kirill Tkhai <ktkhai@parallels.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Mike Galbraith <umgwanakikbuti@gmail.com> Cc: Kirill Tkhai <tkhai@yandex.ru> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Nicolas Pitre <nicolas.pitre@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
e5673f2805
commit
163122b7fc
|
@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|||
return;
|
||||
|
||||
/*
|
||||
* This is possible from callers such as move_task(), in which we
|
||||
* This is possible from callers such as attach_tasks(), in which we
|
||||
* unconditionally check_prempt_curr() after an enqueue (which may have
|
||||
* lead to a throttle). This both saves work and prevents false
|
||||
* next-buddy nomination below.
|
||||
|
@ -5117,20 +5117,9 @@ struct lb_env {
|
|||
unsigned int loop_max;
|
||||
|
||||
enum fbq_type fbq_type;
|
||||
struct list_head tasks;
|
||||
};
|
||||
|
||||
/*
|
||||
* move_task - move a task from one runqueue to another runqueue.
|
||||
* Both runqueues must be locked.
|
||||
*/
|
||||
static void move_task(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
deactivate_task(env->src_rq, p, 0);
|
||||
set_task_cpu(p, env->dst_cpu);
|
||||
activate_task(env->dst_rq, p, 0);
|
||||
check_preempt_curr(env->dst_rq, p, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Is this task likely cache-hot:
|
||||
*/
|
||||
|
@ -5345,6 +5334,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* detach_task() -- detach the task for the migration specified in env
|
||||
*/
|
||||
static void detach_task(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
lockdep_assert_held(&env->src_rq->lock);
|
||||
|
||||
deactivate_task(env->src_rq, p, 0);
|
||||
p->on_rq = TASK_ON_RQ_MIGRATING;
|
||||
set_task_cpu(p, env->dst_cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
|
||||
* part of active balancing operations within "domain".
|
||||
|
@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
|
|||
if (!can_migrate_task(p, env))
|
||||
continue;
|
||||
|
||||
deactivate_task(env->src_rq, p, 0);
|
||||
p->on_rq = TASK_ON_RQ_MIGRATING;
|
||||
set_task_cpu(p, env->dst_cpu);
|
||||
detach_task(p, env);
|
||||
|
||||
/*
|
||||
* Right now, this is only the second place where
|
||||
* lb_gained[env->idle] is updated (other is move_tasks)
|
||||
* lb_gained[env->idle] is updated (other is detach_tasks)
|
||||
* so we can safely collect stats here rather than
|
||||
* inside move_tasks().
|
||||
* inside detach_tasks().
|
||||
*/
|
||||
schedstat_inc(env->sd, lb_gained[env->idle]);
|
||||
return p;
|
||||
|
@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_one_task() -- attaches the task returned from detach_one_task() to
|
||||
* its new rq.
|
||||
*/
|
||||
static void attach_one_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
raw_spin_lock(&rq->lock);
|
||||
BUG_ON(task_rq(p) != rq);
|
||||
p->on_rq = TASK_ON_RQ_QUEUED;
|
||||
activate_task(rq, p, 0);
|
||||
check_preempt_curr(rq, p, 0);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
}
|
||||
|
||||
static const unsigned int sched_nr_migrate_break = 32;
|
||||
|
||||
/*
|
||||
* move_tasks tries to move up to imbalance weighted load from busiest to
|
||||
* this_rq, as part of a balancing operation within domain "sd".
|
||||
* Returns 1 if successful and 0 otherwise.
|
||||
* detach_tasks() -- tries to detach up to imbalance weighted load from
|
||||
* busiest_rq, as part of a balancing operation within domain "sd".
|
||||
*
|
||||
* Called with both runqueues locked.
|
||||
* Returns number of detached tasks if successful and 0 otherwise.
|
||||
*/
|
||||
static int move_tasks(struct lb_env *env)
|
||||
static int detach_tasks(struct lb_env *env)
|
||||
{
|
||||
struct list_head *tasks = &env->src_rq->cfs_tasks;
|
||||
struct task_struct *p;
|
||||
unsigned long load;
|
||||
int pulled = 0;
|
||||
int detached = 0;
|
||||
|
||||
lockdep_assert_held(&env->src_rq->lock);
|
||||
|
||||
if (env->imbalance <= 0)
|
||||
return 0;
|
||||
|
@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
|
|||
if ((load / 2) > env->imbalance)
|
||||
goto next;
|
||||
|
||||
move_task(p, env);
|
||||
pulled++;
|
||||
detach_task(p, env);
|
||||
list_add(&p->se.group_node, &env->tasks);
|
||||
|
||||
detached++;
|
||||
env->imbalance -= load;
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
/*
|
||||
* NEWIDLE balancing is a source of latency, so preemptible
|
||||
* kernels will stop after the first task is pulled to minimize
|
||||
* kernels will stop after the first task is detached to minimize
|
||||
* the critical section.
|
||||
*/
|
||||
if (env->idle == CPU_NEWLY_IDLE)
|
||||
|
@ -5463,13 +5451,58 @@ next:
|
|||
}
|
||||
|
||||
/*
|
||||
* Right now, this is one of only two places move_task() is called,
|
||||
* so we can safely collect move_task() stats here rather than
|
||||
* inside move_task().
|
||||
* Right now, this is one of only two places we collect this stat
|
||||
* so we can safely collect detach_one_task() stats here rather
|
||||
* than inside detach_one_task().
|
||||
*/
|
||||
schedstat_add(env->sd, lb_gained[env->idle], pulled);
|
||||
schedstat_add(env->sd, lb_gained[env->idle], detached);
|
||||
|
||||
return pulled;
|
||||
return detached;
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_task() -- attach the task detached by detach_task() to its new rq.
|
||||
*/
|
||||
static void attach_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
lockdep_assert_held(&rq->lock);
|
||||
|
||||
BUG_ON(task_rq(p) != rq);
|
||||
p->on_rq = TASK_ON_RQ_QUEUED;
|
||||
activate_task(rq, p, 0);
|
||||
check_preempt_curr(rq, p, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_one_task() -- attaches the task returned from detach_one_task() to
|
||||
* its new rq.
|
||||
*/
|
||||
static void attach_one_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
raw_spin_lock(&rq->lock);
|
||||
attach_task(rq, p);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
|
||||
* new rq.
|
||||
*/
|
||||
static void attach_tasks(struct lb_env *env)
|
||||
{
|
||||
struct list_head *tasks = &env->tasks;
|
||||
struct task_struct *p;
|
||||
|
||||
raw_spin_lock(&env->dst_rq->lock);
|
||||
|
||||
while (!list_empty(tasks)) {
|
||||
p = list_first_entry(tasks, struct task_struct, se.group_node);
|
||||
list_del_init(&p->se.group_node);
|
||||
|
||||
attach_task(env->dst_rq, p);
|
||||
}
|
||||
|
||||
raw_spin_unlock(&env->dst_rq->lock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
|
|||
.loop_break = sched_nr_migrate_break,
|
||||
.cpus = cpus,
|
||||
.fbq_type = all,
|
||||
.tasks = LIST_HEAD_INIT(env.tasks),
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -6652,16 +6686,29 @@ redo:
|
|||
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
|
||||
|
||||
more_balance:
|
||||
local_irq_save(flags);
|
||||
double_rq_lock(env.dst_rq, busiest);
|
||||
raw_spin_lock_irqsave(&busiest->lock, flags);
|
||||
|
||||
/*
|
||||
* cur_ld_moved - load moved in current iteration
|
||||
* ld_moved - cumulative load moved across iterations
|
||||
*/
|
||||
cur_ld_moved = move_tasks(&env);
|
||||
ld_moved += cur_ld_moved;
|
||||
double_rq_unlock(env.dst_rq, busiest);
|
||||
cur_ld_moved = detach_tasks(&env);
|
||||
|
||||
/*
|
||||
* We've detached some tasks from busiest_rq. Every
|
||||
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
|
||||
* unlock busiest->lock, and we are able to be sure
|
||||
* that nobody can manipulate the tasks in parallel.
|
||||
* See task_rq_lock() family for the details.
|
||||
*/
|
||||
|
||||
raw_spin_unlock(&busiest->lock);
|
||||
|
||||
if (cur_ld_moved) {
|
||||
attach_tasks(&env);
|
||||
ld_moved += cur_ld_moved;
|
||||
}
|
||||
|
||||
local_irq_restore(flags);
|
||||
|
||||
/*
|
||||
|
@ -6797,7 +6844,7 @@ more_balance:
|
|||
* If we've begun active balancing, start to back off. This
|
||||
* case may not be covered by the all_pinned logic if there
|
||||
* is only 1 task on the busy runqueue (because we don't call
|
||||
* move_tasks).
|
||||
* detach_tasks).
|
||||
*/
|
||||
if (sd->balance_interval < sd->max_interval)
|
||||
sd->balance_interval *= 2;
|
||||
|
|
Loading…
Reference in New Issue