@@ -371,408 +371,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
	spin_unlock_irqrestore(&ioapic_lock, flags);
}

#if defined(CONFIG_IRQBALANCE)
# include <asm/processor.h>	/* kernel_thread() */
# include <linux/kernel_stat.h>	/* kstat */
# include <linux/slab.h>	/* kmalloc() */
# include <linux/timer.h>

#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
#define BALANCED_IRQ_LESS_DELTA		(HZ)

static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
static int physical_balance __read_mostly;
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;

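/*
 * Per-CPU bookkeeping for the balancer: the previous kstat count and
 * the per-interval delta for every IRQ, plus the CPU's accumulated
 * IRQ load.  The two arrays are sized to nr_irqs in balanced_irq_init().
 */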
static struct irq_cpu_info {
	unsigned long *last_irq;
	unsigned long *irq_delta;
	unsigned long irq;
} irq_cpu_data[NR_CPUS];

#define CPU_IRQ(cpu)		(irq_cpu_data[cpu].irq)
#define LAST_CPU_IRQ(cpu, irq)	(irq_cpu_data[cpu].last_irq[irq])
#define IRQ_DELTA(cpu, irq)	(irq_cpu_data[cpu].irq_delta[irq])

#define IDLE_ENOUGH(cpu,now) \
	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))

#define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)

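/*
 * A physical package is represented by the lowest numbered CPU in its
 * sibling map; that slot of irq_cpu_data[] accumulates the package-wide
 * load in do_irq_balance() below.
 */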
#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))

static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL;

static cpumask_t *balance_irq_affinity;

static void __init irq_affinity_init_work(void *data)
{
	struct dyn_array *da = data;

	int i;
	struct balance_irq_affinity *affinity;

	affinity = *da->name;

	for (i = 0; i < *da->nr; i++)
		memcpy(&affinity[i], &balance_irq_affinity_init,
			sizeof(struct balance_irq_affinity));

}

DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work);

void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
{
	balance_irq_affinity[irq] = mask;
}

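/*
 * Walk the CPUs from curr_cpu in the given direction (wrapping at
 * NR_CPUS) and return the first online, allowed CPU that has been idle
 * long enough; once the scan wraps back around to curr_cpu, settle for
 * any online, allowed CPU.
 */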
static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
			unsigned long now, int direction)
{
	int search_idle = 1;
	int cpu = curr_cpu;

	goto inside;

	do {
		if (unlikely(cpu == curr_cpu))
			search_idle = 0;
inside:
		if (direction == 1) {
			cpu++;
			if (cpu >= NR_CPUS)
				cpu = 0;
		} else {
			cpu--;
			if (cpu == -1)
				cpu = NR_CPUS-1;
		}
	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
			(search_idle && !IDLE_ENOUGH(cpu, now)));

	return cpu;
}

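/*
 * Queue one IRQ for migration to the next suitable CPU after 'cpu';
 * the actual move is performed later by the pending-IRQ machinery.
 */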
static inline void balance_irq(int cpu, int irq)
{
	unsigned long now = jiffies;
	cpumask_t allowed_mask;
	unsigned int new_cpu;

	if (irqbalance_disabled)
		return;

	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
	new_cpu = move(cpu, allowed_mask, now, 1);
	if (cpu != new_cpu)
		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
}

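/*
 * Fallback for when a few heavy sources dominate: rotate every
 * significant IRQ one step among the online CPUs and shorten the
 * balancing interval.
 */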
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
{
	int i, j;
	struct irq_desc *desc;

	for_each_online_cpu(i) {
		for (j = 0; j < nr_irqs; j++) {
			desc = irq_to_desc(j);
			if (!desc->action)
				continue;
			/* Is it a significant load? */
			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
						useful_load_threshold)
				continue;
			balance_irq(i, j);
		}
	}
	balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
	return;
}

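/*
 * One balancing pass: snapshot per-CPU, per-IRQ activity deltas from
 * kstat, find the most and least loaded packages, then pick the IRQ on
 * the heaviest package whose load best fills the gap and mark it
 * pending for the least loaded sibling.  Falls back to rotation when
 * no single move is worthwhile.
 */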
static void do_irq_balance(void)
{
	int i, j;
	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
	unsigned long move_this_load = 0;
	int max_loaded = 0, min_loaded = 0;
	int load;
	unsigned long useful_load_threshold = balanced_irq_interval + 10;
	int selected_irq;
	int tmp_loaded, first_attempt = 1;
	unsigned long tmp_cpu_irq;
	unsigned long imbalance = 0;
	cpumask_t allowed_mask, target_cpu_mask, tmp;
	struct irq_desc *desc;

	for_each_possible_cpu(i) {
		int package_index;
		CPU_IRQ(i) = 0;
		if (!cpu_online(i))
			continue;
		package_index = CPU_TO_PACKAGEINDEX(i);
		for (j = 0; j < nr_irqs; j++) {
			unsigned long value_now, delta;
			/* Is this an active IRQ or is balancing disabled? */
			desc = irq_to_desc(j);
			if (!desc->action || irq_balancing_disabled(j))
				continue;
			if (package_index == i)
				IRQ_DELTA(package_index, j) = 0;
			/* Determine the total count per processor per IRQ */
			value_now = (unsigned long) kstat_irqs_cpu(j, i);

			/* Determine the activity per processor per IRQ */
			delta = value_now - LAST_CPU_IRQ(i, j);

			/* Update last_cpu_irq[][] for the next time */
			LAST_CPU_IRQ(i, j) = value_now;

			/* Ignore IRQs whose rate is less than the clock */
			if (delta < useful_load_threshold)
				continue;
			/* update the load for the processor or package total */
			IRQ_DELTA(package_index, j) += delta;

			/* Keep track of the higher numbered sibling as well */
			if (i != package_index)
				CPU_IRQ(i) += delta;
			/*
			 * We have sibling A and sibling B in the package
			 *
			 * cpu_irq[A] = load for cpu A + load for cpu B
			 * cpu_irq[B] = load for cpu B
			 */
			CPU_IRQ(package_index) += delta;
		}
	}
	/* Find the least loaded processor package */
	for_each_online_cpu(i) {
		if (i != CPU_TO_PACKAGEINDEX(i))
			continue;
		if (min_cpu_irq > CPU_IRQ(i)) {
			min_cpu_irq = CPU_IRQ(i);
			min_loaded = i;
		}
	}
	max_cpu_irq = ULONG_MAX;

tryanothercpu:
	/*
	 * Look for the heaviest loaded processor;
	 * we may come back here to get the next heaviest one.
	 * Skip processors with trivial loads.
	 */
	tmp_cpu_irq = 0;
	tmp_loaded = -1;
	for_each_online_cpu(i) {
		if (i != CPU_TO_PACKAGEINDEX(i))
			continue;
		if (max_cpu_irq <= CPU_IRQ(i))
			continue;
		if (tmp_cpu_irq < CPU_IRQ(i)) {
			tmp_cpu_irq = CPU_IRQ(i);
			tmp_loaded = i;
		}
	}

	if (tmp_loaded == -1) {
		/*
		 * A small number of heavy interrupt sources can load
		 * some of the cpus too much.  We use Ingo's original
		 * approach to rotate them around.
		 */
		if (!first_attempt && imbalance >= useful_load_threshold) {
			rotate_irqs_among_cpus(useful_load_threshold);
			return;
		}
		goto not_worth_the_effort;
	}

	first_attempt = 0;		/* heaviest search */
	max_cpu_irq = tmp_cpu_irq;	/* load */
	max_loaded = tmp_loaded;	/* processor */
	imbalance = (max_cpu_irq - min_cpu_irq) / 2;

	/*
	 * If the imbalance is less than 1/8th (approx 12%) of the max
	 * load, we are observing diminishing returns - quit.
	 */
	if (imbalance < (max_cpu_irq >> 3))
		goto not_worth_the_effort;

tryanotherirq:
	/* If we select an IRQ to move that can't go where we want,
	 * see if there is another one to try.
	 */
	move_this_load = 0;
	selected_irq = -1;
	for (j = 0; j < nr_irqs; j++) {
		/* Is this an active IRQ? */
		desc = irq_to_desc(j);
		if (!desc->action)
			continue;
		if (imbalance <= IRQ_DELTA(max_loaded, j))
			continue;
		/* Try to find the IRQ that is closest to the imbalance
		 * without going over.
		 */
		if (move_this_load < IRQ_DELTA(max_loaded, j)) {
			move_this_load = IRQ_DELTA(max_loaded, j);
			selected_irq = j;
		}
	}
	if (selected_irq == -1)
		goto tryanothercpu;

	imbalance = move_this_load;

	/* For the physical_balance case, we accumulated both load
	 * values in one of the siblings' cpu_irq[], to use the same
	 * code for physical and logical processors as much as possible.
	 *
	 * NOTE: the cpu_irq[] array holds the sum of the load for
	 * sibling A and sibling B in the slot for the lowest numbered
	 * sibling (A), _AND_ the load for sibling B in the slot for
	 * the higher numbered sibling.
	 *
	 * We seek the least loaded sibling by making the comparison
	 * (A+B)/2 vs B
	 */
	load = CPU_IRQ(min_loaded) >> 1;
	for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
		if (load > CPU_IRQ(j)) {
			/* This won't change cpu_sibling_map[min_loaded] */
			load = CPU_IRQ(j);
			min_loaded = j;
		}
	}

	cpus_and(allowed_mask,
		cpu_online_map,
		balance_irq_affinity[selected_irq]);
	target_cpu_mask = cpumask_of_cpu(min_loaded);
	cpus_and(tmp, target_cpu_mask, allowed_mask);

	if (!cpus_empty(tmp)) {
		/* mark for change destination */
		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));

		/* Since we made a change, come back sooner to
		 * check for more variation.
		 */
		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
		return;
	}
	goto tryanotherirq;

not_worth_the_effort:
	/*
	 * If we did not find an IRQ to move, adjust the time interval
	 * upward.
	 */
	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
	return;
}

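/*
 * Main loop of the "kirqd" kernel thread: start with every IRQ on
 * CPU 0, then run a balancing pass every balanced_irq_interval jiffies.
 */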
static int balanced_irq(void *unused)
{
	int i;
	unsigned long prev_balance_time = jiffies;
	long time_remaining = balanced_irq_interval;
	struct irq_desc *desc;

	/* push everything to CPU 0 to give us a starting point. */
	for (i = 0 ; i < nr_irqs ; i++) {
		desc = irq_to_desc(i);
		desc->pending_mask = cpumask_of_cpu(0);
		set_pending_irq(i, cpumask_of_cpu(0));
	}

	set_freezable();
	for ( ; ; ) {
		time_remaining = schedule_timeout_interruptible(time_remaining);
		try_to_freeze();
		if (time_after(jiffies,
				prev_balance_time+balanced_irq_interval)) {
			preempt_disable();
			do_irq_balance();
			prev_balance_time = jiffies;
			time_remaining = balanced_irq_interval;
			preempt_enable();
		}
	}
	return 0;
}

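/*
 * Late initcall: decide whether balancing is wanted at all (SMP, not
 * disabled by the subarchitecture or command line), allocate the
 * per-CPU tracking arrays and spawn the kirqd thread.
 */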
static int __init balanced_irq_init(void)
{
	int i;
	struct cpuinfo_x86 *c;
	cpumask_t tmp;

	cpus_shift_right(tmp, cpu_online_map, 2);
	c = &boot_cpu_data;
	/* When not overwritten by the command line ask subarchitecture. */
	if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
		irqbalance_disabled = NO_BALANCE_IRQ;
	if (irqbalance_disabled)
		return 0;

	/* disable irqbalance completely if there is only one processor online */
	if (num_online_cpus() < 2) {
		irqbalance_disabled = 1;
		return 0;
	}
	/*
	 * Enable physical balance only if more than 1 physical processor
	 * is present
	 */
	if (smp_num_siblings > 1 && !cpus_empty(tmp))
		physical_balance = 1;

	for_each_online_cpu(i) {
		irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL);
		irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL);
		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
			printk(KERN_ERR "balanced_irq_init: out of memory\n");
			goto failed;
		}
	}

	printk(KERN_INFO "Starting balanced_irq\n");
	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
		return 0;
	printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq\n");
failed:
	for_each_possible_cpu(i) {
		kfree(irq_cpu_data[i].irq_delta);
		irq_cpu_data[i].irq_delta = NULL;
		kfree(irq_cpu_data[i].last_irq);
		irq_cpu_data[i].last_irq = NULL;
	}
	return 0;
}

int __devinit irqbalance_disable(char *str)
{
	irqbalance_disabled = 1;
	return 1;
}

__setup("noirqbalance", irqbalance_disable);

late_initcall(balanced_irq_init);
#endif /* CONFIG_IRQBALANCE */
#endif /* CONFIG_SMP */

#ifndef CONFIG_SMP