sched/numa: Evaluate move once per node
task_numa_compare() helps choose the best CPU to move or swap the selected task. To achieve this, task_numa_compare() is called for every CPU in the node. Currently it evaluates if the task can be moved/swapped for each of the CPUs. However, the move evaluation is mostly independent of the CPU. Evaluating the move logic once per node provides scope for simplifying task_numa_compare().

Running SPECjbb2005 on a 4 node machine and comparing bops/JVM

JVMS  LAST_PATCH  WITH_PATCH  %CHANGE
16    25705.2     25058.2     -2.51
1     74433       72950       -1.99

Running SPECjbb2005 on a 16 node machine and comparing bops/JVM

JVMS  LAST_PATCH  WITH_PATCH  %CHANGE
8     96589.6     105930      9.670
1     181830      178624      -1.76

(numbers from v1 based on v4.17-rc5)

Testcase   Time:       Min       Max       Avg    StdDev
numa01.sh  Real:    440.65    941.32    758.98    189.17
numa01.sh  Sys:     183.48    320.07    258.42     50.09
numa01.sh  User:  37384.65  71818.14  60302.51  13798.96
numa02.sh  Real:     61.24     65.35     62.49      1.49
numa02.sh  Sys:      16.83     24.18     21.40      2.60
numa02.sh  User:   5219.59   5356.34   5264.03     49.07
numa03.sh  Real:    822.04    912.40    873.55     37.35
numa03.sh  Sys:     118.80    140.94    132.90      7.60
numa03.sh  User:  62485.19  70025.01  67208.33   2967.10
numa04.sh  Real:    690.66    872.12    778.49     65.44
numa04.sh  Sys:     459.26    563.03    494.03     42.39
numa04.sh  User:  51116.44  70527.20  58849.44   8461.28
numa05.sh  Real:    418.37    562.28    525.77     54.27
numa05.sh  Sys:     299.45    481.00    392.49     64.27
numa05.sh  User:  34115.09  41324.02  39105.30   2627.68

Testcase   Time:       Min       Max       Avg    StdDev  %Change
numa01.sh  Real:    516.14    892.41    739.84    151.32   2.587%
numa01.sh  Sys:     153.16    192.99    177.70     14.58   45.42%
numa01.sh  User:  39821.04  69528.92  57193.87  10989.48   5.435%
numa02.sh  Real:     60.91     62.35     61.58      0.63   1.477%
numa02.sh  Sys:      16.47     26.16     21.20      3.85   0.943%
numa02.sh  User:   5227.58   5309.61   5265.17     31.04   -0.02%
numa03.sh  Real:    739.07    917.73    795.75     64.45   9.776%
numa03.sh  Sys:      94.46    136.08    109.48     14.58   21.39%
numa03.sh  User:  57478.56  72014.09  61764.48   5343.69   8.813%
numa04.sh  Real:    442.61    715.43    530.31     96.12   46.79%
numa04.sh  Sys:     224.90    348.63    285.61     48.83   72.97%
numa04.sh  User:  35836.84  47522.47  40235.41   3985.26   46.26%
numa05.sh  Real:    386.13    489.17    434.94     43.59   20.88%
numa05.sh  Sys:     144.29    438.56    278.80    105.78   40.77%
numa05.sh  User:  33255.86  36890.82  34879.31   1641.98   12.11%

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1529514181-9842-3-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
6e30396767
commit
305c1fac32
|
@ -1580,9 +1580,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
|||
* be exchanged with the source task
|
||||
*/
|
||||
static void task_numa_compare(struct task_numa_env *env,
|
||||
long taskimp, long groupimp)
|
||||
long taskimp, long groupimp, bool maymove)
|
||||
{
|
||||
struct rq *src_rq = cpu_rq(env->src_cpu);
|
||||
struct rq *dst_rq = cpu_rq(env->dst_cpu);
|
||||
struct task_struct *cur;
|
||||
long src_load, dst_load;
|
||||
|
@ -1603,97 +1602,73 @@ static void task_numa_compare(struct task_numa_env *env,
|
|||
if (cur == env->p)
|
||||
goto unlock;
|
||||
|
||||
if (!cur) {
|
||||
if (maymove || imp > env->best_imp)
|
||||
goto assign;
|
||||
else
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* "imp" is the fault differential for the source task between the
|
||||
* source and destination node. Calculate the total differential for
|
||||
* the source task and potential destination task. The more negative
|
||||
* the value is, the more rmeote accesses that would be expected to
|
||||
* the value is, the more remote accesses that would be expected to
|
||||
* be incurred if the tasks were swapped.
|
||||
*/
|
||||
if (cur) {
|
||||
/* Skip this swap candidate if cannot move to the source CPU: */
|
||||
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* If dst and source tasks are in the same NUMA group, or not
|
||||
* in any group then look only at task weights.
|
||||
*/
|
||||
if (cur->numa_group == env->p->numa_group) {
|
||||
imp = taskimp + task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
/*
|
||||
* Add some hysteresis to prevent swapping the
|
||||
* tasks within a group over tiny differences.
|
||||
*/
|
||||
if (cur->numa_group)
|
||||
imp -= imp/16;
|
||||
} else {
|
||||
/*
|
||||
* Compare the group weights. If a task is all by
|
||||
* itself (not part of a group), use the task weight
|
||||
* instead.
|
||||
*/
|
||||
if (cur->numa_group)
|
||||
imp += group_weight(cur, env->src_nid, dist) -
|
||||
group_weight(cur, env->dst_nid, dist);
|
||||
else
|
||||
imp += task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
}
|
||||
}
|
||||
|
||||
if (imp <= env->best_imp && moveimp <= env->best_imp)
|
||||
/* Skip this swap candidate if cannot move to the source cpu */
|
||||
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
|
||||
goto unlock;
|
||||
|
||||
if (!cur) {
|
||||
/* Is there capacity at our destination? */
|
||||
if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
|
||||
!env->dst_stats.has_free_capacity)
|
||||
goto unlock;
|
||||
|
||||
goto balance;
|
||||
}
|
||||
|
||||
/* Balance doesn't matter much if we're running a task per CPU: */
|
||||
if (imp > env->best_imp && src_rq->nr_running == 1 &&
|
||||
dst_rq->nr_running == 1)
|
||||
goto assign;
|
||||
|
||||
/*
|
||||
* In the overloaded case, try and keep the load balanced.
|
||||
* If dst and source tasks are in the same NUMA group, or not
|
||||
* in any group then look only at task weights.
|
||||
*/
|
||||
balance:
|
||||
load = task_h_load(env->p);
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
if (moveimp > imp && moveimp > env->best_imp) {
|
||||
if (cur->numa_group == env->p->numa_group) {
|
||||
imp = taskimp + task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
/*
|
||||
* If the improvement from just moving env->p direction is
|
||||
* better than swapping tasks around, check if a move is
|
||||
* possible. Store a slightly smaller score than moveimp,
|
||||
* so an actually idle CPU will win.
|
||||
* Add some hysteresis to prevent swapping the
|
||||
* tasks within a group over tiny differences.
|
||||
*/
|
||||
if (!load_too_imbalanced(src_load, dst_load, env)) {
|
||||
imp = moveimp - 1;
|
||||
cur = NULL;
|
||||
goto assign;
|
||||
}
|
||||
if (cur->numa_group)
|
||||
imp -= imp / 16;
|
||||
} else {
|
||||
/*
|
||||
* Compare the group weights. If a task is all by itself
|
||||
* (not part of a group), use the task weight instead.
|
||||
*/
|
||||
if (cur->numa_group && env->p->numa_group)
|
||||
imp += group_weight(cur, env->src_nid, dist) -
|
||||
group_weight(cur, env->dst_nid, dist);
|
||||
else
|
||||
imp += task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
}
|
||||
|
||||
if (imp <= env->best_imp)
|
||||
goto unlock;
|
||||
|
||||
if (cur) {
|
||||
load = task_h_load(cur);
|
||||
dst_load -= load;
|
||||
src_load += load;
|
||||
if (maymove && moveimp > imp && moveimp > env->best_imp) {
|
||||
imp = moveimp - 1;
|
||||
cur = NULL;
|
||||
goto assign;
|
||||
}
|
||||
|
||||
/*
|
||||
* In the overloaded case, try and keep the load balanced.
|
||||
*/
|
||||
load = task_h_load(env->p) - task_h_load(cur);
|
||||
if (!load)
|
||||
goto assign;
|
||||
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
if (load_too_imbalanced(src_load, dst_load, env))
|
||||
goto unlock;
|
||||
|
||||
assign:
|
||||
/*
|
||||
* One idle CPU per node is evaluated for a task numa move.
|
||||
* Call select_idle_sibling to maybe find a better one.
|
||||
|
@ -1709,7 +1684,6 @@ balance:
|
|||
local_irq_enable();
|
||||
}
|
||||
|
||||
assign:
|
||||
task_numa_assign(env, cur, imp);
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
|
@ -1718,15 +1692,27 @@ unlock:
|
|||
static void task_numa_find_cpu(struct task_numa_env *env,
|
||||
long taskimp, long groupimp)
|
||||
{
|
||||
long src_load, dst_load, load;
|
||||
bool maymove = false;
|
||||
int cpu;
|
||||
|
||||
load = task_h_load(env->p);
|
||||
dst_load = env->dst_stats.load + load;
|
||||
src_load = env->src_stats.load - load;
|
||||
|
||||
/*
|
||||
* If the improvement from just moving env->p direction is better
|
||||
* than swapping tasks around, check if a move is possible.
|
||||
*/
|
||||
maymove = !load_too_imbalanced(src_load, dst_load, env);
|
||||
|
||||
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
|
||||
/* Skip this CPU if the source task cannot migrate */
|
||||
if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
|
||||
continue;
|
||||
|
||||
env->dst_cpu = cpu;
|
||||
task_numa_compare(env, taskimp, groupimp);
|
||||
task_numa_compare(env, taskimp, groupimp, maymove);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue