From 151766fce8bee0e3e6076c8b829f9fcc0a2412ae Mon Sep 17 00:00:00 2001
From: Feng Tang
Date: Sat, 28 Apr 2012 14:30:40 +0800
Subject: [PATCH 001/612] Revert "x86/platform: Add a wallclock_init func to
 x86_platforms ops"

This reverts commit cf8ff6b6ab0e99dd3058852f4ec76a6140abadec.

It turns out that commit is a functional duplicate of commit 6b617e22
"x86/platform: Add a wallclock_init func to x86_init.timers ops".
Let's revert it; sorry for the noise.

Signed-off-by: Feng Tang
Cc: Ingo Molnar
Cc: H. Peter Anvin
Cc: Jacob Pan
Cc: Alan Cox
Cc: Dirk Brandewie
Signed-off-by: Thomas Gleixner
---
 arch/x86/include/asm/x86_init.h | 2 --
 arch/x86/kernel/setup.c         | 2 --
 arch/x86/kernel/x86_init.c      | 2 --
 3 files changed, 6 deletions(-)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c090af10ac7d..42d2ae18dab2 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -156,7 +156,6 @@ struct x86_cpuinit_ops {
 /**
  * struct x86_platform_ops - platform specific runtime functions
  * @calibrate_tsc:		calibrate TSC
- * @wallclock_init:		init the wallclock device
  * @get_wallclock:		get time from HW clock like RTC etc.
 * @set_wallclock:		set time back to HW clock
 * @is_untracked_pat_range	exclude from PAT logic
@@ -167,7 +166,6 @@ struct x86_cpuinit_ops {
  */
 struct x86_platform_ops {
	unsigned long (*calibrate_tsc)(void);
-	void (*wallclock_init)(void);
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long nowtime);
	void (*iommu_shutdown)(void);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 366c688d619e..58a07b10812c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1027,8 +1027,6 @@ void __init setup_arch(char **cmdline_p)
 
	x86_init.timers.wallclock_init();
 
-	x86_platform.wallclock_init();
-
	mcheck_init();
 
	arch_init_ideal_nops();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 35c5e543f550..9f3167e891ef 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { }
 void __init x86_init_pgd_noop(pgd_t *unused) { }
 int __init iommu_init_noop(void) { return 0; }
 void iommu_shutdown_noop(void) { }
-void wallclock_init_noop(void) { }
 
 /*
  * The platform setup functions are preset with the default functions
@@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; };
 
 struct x86_platform_ops x86_platform = {
	.calibrate_tsc			= native_calibrate_tsc,
-	.wallclock_init			= wallclock_init_noop,
	.get_wallclock			= mach_get_cmos_time,
	.set_wallclock			= mach_set_rtc_mmss,
	.iommu_shutdown			= iommu_shutdown_noop,

From f841d792e38f75f5e25b0b66f7b5d235d180a735 Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Fri, 30 Mar 2012 23:11:35 +0800
Subject: [PATCH 002/612] x86: Return IRQ_SET_MASK_OK_NOCOPY from irq affinity
 functions

The interrupt chip irq_set_affinity() functions copy the affinity mask
to irq_data->affinity but return 0, i.e. IRQ_SET_MASK_OK.
IRQ_SET_MASK_OK causes the core code to do another, redundant copy.
Return IRQ_SET_MASK_OK_NOCOPY to avoid this.

Signed-off-by: Jiang Liu
Cc: Suresh Siddha
Cc: Yinghai Lu
Cc: Naga Chumbalkar
Cc: Jacob Pan
Cc: Cliff Wickman
Cc: Jiang Liu
Cc: Keping Chen
Link: http://lkml.kernel.org/r/1333120296-13563-4-git-send-email-jiang.liu@huawei.com
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/apic/io_apic.c | 9 +++++----
 arch/x86/platform/uv/uv_irq.c  | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac96561d1a99..bce2001b2644 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2270,6 +2270,7 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
		/* Only the high 8 bits are valid. */
		dest = SET_APIC_LOGICAL_ID(dest);
		__target_IO_APIC_irq(irq, dest, data->chip_data);
+		ret = IRQ_SET_MASK_OK_NOCOPY;
	}
	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
	return ret;
@@ -3092,7 +3093,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 
	__write_msi_msg(data->msi_desc, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 #endif /* CONFIG_SMP */
@@ -3214,7 +3215,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
 
	dmar_msi_write(irq, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 #endif /* CONFIG_SMP */
@@ -3267,7 +3268,7 @@ static int hpet_msi_set_affinity(struct irq_data *data,
 
	hpet_msi_write(data->handler_data, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 #endif /* CONFIG_SMP */
@@ -3340,7 +3341,7 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
		return -1;
 
	target_ht_irq(data->irq, dest, cfg->vector);
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 #endif
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index f25c2765a5c9..a22c41656b52 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -222,7 +222,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
	if (cfg->move_in_progress)
		send_cleanup_vector(cfg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 /*
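A word on the return-value contract this patch exploits: IRQ_SET_MASK_OK tells
the generic irq core that it still has to copy the new mask into irq_data,
while IRQ_SET_MASK_OK_NOCOPY says the callback already did so itself. A
minimal sketch of the resulting callback shape, against the 3.5-era API
(my_hw_route_irq() is a hypothetical chip-specific helper, not a function
from this series):

    #include <linux/irq.h>
    #include <linux/cpumask.h>

    /* Hypothetical hardware-programming step; fills in the APIC dest. */
    static int my_hw_route_irq(struct irq_data *data,
                               const struct cpumask *mask, unsigned int *dest)
    {
            *dest = 0;      /* illustrative only */
            return 0;
    }

    static int my_set_affinity(struct irq_data *data,
                               const struct cpumask *mask, bool force)
    {
            unsigned int dest;

            if (my_hw_route_irq(data, mask, &dest))
                    return -EINVAL;

            /* The mask is copied here, in the callback itself ... */
            cpumask_copy(data->affinity, mask);

            /*
             * ... so returning IRQ_SET_MASK_OK would only make the
             * core repeat the copy.  NOCOPY skips that second copy.
             */
            return IRQ_SET_MASK_OK_NOCOPY;
    }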
From 4ee0444bebf3d0399c93841d98826ade193e6330 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:15:56 +0800
Subject: [PATCH 003/612] hexagon: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: Richard Kuo
Cc: linux-hexagon@vger.kernel.org
Link: http://lkml.kernel.org/r/1338275765-3217-2-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/hexagon/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index f7264621e58d..149fbefc1a4d 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -180,9 +180,7 @@ void __cpuinit start_secondary(void)
 
	notify_cpu_starting(cpu);
 
-	ipi_call_lock();
	set_cpu_online(cpu, true);
-	ipi_call_unlock();
 
	local_irq_enable();
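The same argument is repeated verbatim for each architecture below, so it is
worth grounding once. A compact paraphrase of the generic IPI path the
changelog relies on (call_function_lock and queue_call_data() are
illustrative stand-ins, not the kernel's actual identifiers):

    #include <linux/cpumask.h>
    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(call_function_lock);

    static void queue_call_data(const struct cpumask *targets)
    {
            /* stand-in for queueing call data and sending the IPIs */
    }

    static void sketch_call_function_many(const struct cpumask *mask)
    {
            struct cpumask targets;

            /* The destination set is fixed against cpu_online_mask
             * here, before the lock is ever taken ... */
            cpumask_and(&targets, mask, cpu_online_mask);

            /* ... so wrapping set_cpu_online() in the same lock on the
             * CPU bring-up side cannot change which CPUs get the IPI. */
            raw_spin_lock(&call_function_lock);
            queue_call_data(&targets);
            raw_spin_unlock(&call_function_lock);
    }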
From 4cbd19fcf9d49f28766415b96eb6201bc263d817 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:15:57 +0800
Subject: [PATCH 004/612] mn10300: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: David Howells
Cc: Koichi Yasutake
Cc: linux-am33-list@redhat.com
Link: http://lkml.kernel.org/r/1338275765-3217-3-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/mn10300/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 090d35d36973..e62c223e4c45 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -876,9 +876,7 @@ static void __init smp_online(void)
 
	notify_cpu_starting(cpu);
 
-	ipi_call_lock();
	set_cpu_online(cpu, true);
-	ipi_call_unlock();
 
	local_irq_enable();
 }

From ac4216d6922f90c459d531e051c832dadde3a8f7 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:15:58 +0800
Subject: [PATCH 005/612] parisc: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: James E.J. Bottomley
Cc: Helge Deller
Cc: linux-parisc@vger.kernel.org
Link: http://lkml.kernel.org/r/1338275765-3217-4-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/parisc/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index a47828d31fe6..6266730efd61 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -300,9 +300,7 @@ smp_cpu_init(int cpunum)
 
	notify_cpu_starting(cpunum);
 
-	ipi_call_lock();
	set_cpu_online(cpunum, true);
-	ipi_call_unlock();
 
	/* Initialise the idle task for this CPU */
	atomic_inc(&init_mm.mm_count);

From 46ce7fbfdbe0be6aaf0e816331aac08a74a00e5a Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:15:59 +0800
Subject: [PATCH 006/612] s390: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Cc: linux390@de.ibm.com
Cc: linux-s390@vger.kernel.org
Link: http://lkml.kernel.org/r/1338275765-3217-5-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/s390/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 15cca26ccb6c..8dca9c248ac7 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -717,9 +717,7 @@ static void __cpuinit smp_start_secondary(void *cpuvoid)
	init_cpu_vtimer();
	pfault_init();
	notify_cpu_starting(smp_processor_id());
-	ipi_call_lock();
	set_cpu_online(smp_processor_id(), true);
-	ipi_call_unlock();
	local_irq_enable();
	/* cpu_idle will call schedule for us */
	cpu_idle();

From 8efdfc3a4ed009c978dab6609d15fb958e7cff12 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:16:00 +0800
Subject: [PATCH 007/612] tile: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: Chris Metcalf
Link: http://lkml.kernel.org/r/1338275765-3217-6-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/tile/kernel/smpboot.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 84873fbe8f27..e686c5ac90be 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -198,17 +198,7 @@ void __cpuinit online_secondary(void)
 
	notify_cpu_starting(smp_processor_id());
 
-	/*
-	 * We need to hold call_lock, so there is no inconsistency
-	 * between the time smp_call_function() determines number of
-	 * IPI recipients, and the time when the determination is made
-	 * for which cpus receive the IPI. Holding this
-	 * lock helps us to not include this cpu in a currently in progress
-	 * smp_call_function().
-	 */
-	ipi_call_lock();
	set_cpu_online(smp_processor_id(), 1);
-	ipi_call_unlock();
	__get_cpu_var(cpu_state) = CPU_ONLINE;
 
	/* Set up tile-specific state for this cpu. */

From 3b6f70fd7dd4e19fc674ec99e389bf0da5589525 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:16:01 +0800
Subject: [PATCH 008/612] x86/smp: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: Konrad Rzeszutek Wilk
Cc: Jeremy Fitzhardinge
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Link: http://lkml.kernel.org/r/1338275765-3217-7-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/smpboot.c | 9 ---------
 arch/x86/xen/smp.c        | 2 --
 2 files changed, 11 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f56f96da77f5..b2fd28ff84b5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,22 +255,13 @@ notrace static void __cpuinit start_secondary(void *unused)
	check_tsc_sync_target();
 
	/*
-	 * We need to hold call_lock, so there is no inconsistency
-	 * between the time smp_call_function() determines number of
-	 * IPI recipients, and the time when the determination is made
-	 * for which cpus receive the IPI. Holding this
-	 * lock helps us to not include this cpu in a currently in progress
-	 * smp_call_function().
-	 *
	 * We need to hold vector_lock so there the set of online cpus
	 * does not change while we are assigning vectors to cpus.  Holding
	 * this lock ensures we don't half assign or remove an irq from a cpu.
	 */
-	ipi_call_lock();
	lock_vector_lock();
	set_cpu_online(smp_processor_id(), true);
	unlock_vector_lock();
-	ipi_call_unlock();
	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
	x86_platform.nmi_init();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index afb250d22a6b..f58dca7a6e52 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
 
	notify_cpu_starting(cpu);
 
-	ipi_call_lock();
	set_cpu_online(cpu, true);
-	ipi_call_unlock();
 
	this_cpu_write(cpu_state, CPU_ONLINE);
 

From 459165e25030c0023cb54f73c14261a3d2f4a244 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:16:02 +0800
Subject: [PATCH 009/612] ia64: SMP: Remove call to ipi_call_lock_irq()/ipi_call_unlock_irq()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: Tony Luck
Cc: Fenghua Yu
Cc: linux-ia64@vger.kernel.org
Link: http://lkml.kernel.org/r/1338275765-3217-8-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/ia64/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1113b8aba07f..963d2db53bfa 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -382,7 +382,6 @@ smp_callin (void)
	set_numa_node(cpu_to_node_map[cpuid]);
	set_numa_mem(local_memory_node(cpu_to_node_map[cpuid]));
 
-	ipi_call_lock_irq();
	spin_lock(&vector_lock);
	/* Setup the per cpu irq handling data structures */
	__setup_vector_irq(cpuid);
@@ -390,7 +389,6 @@ smp_callin (void)
	set_cpu_online(cpuid, true);
	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
	spin_unlock(&vector_lock);
-	ipi_call_unlock_irq();
 
	smp_setup_percpu_timer();

From bc6833009583bd5b096ef7aa2bb006854a5a2dce Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 16:27:33 +0800
Subject: [PATCH 010/612] sparc: SMP: Remove call to ipi_call_lock_irq()/ipi_call_unlock_irq()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

Delay enabling irqs until after set_cpu_online().

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: sparclinux@vger.kernel.org
Link: http://lkml.kernel.org/r/20120529082732.GA4250@zhy
Acked-by: "David S. Miller"
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/sparc/kernel/smp_64.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f591598d92f6..781bcb10b8bd 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -103,8 +103,6 @@ void __cpuinit smp_callin(void)
	if (cheetah_pcache_forced_on)
		cheetah_enable_pcache();
 
-	local_irq_enable();
-
	callin_flag = 1;
	__asm__ __volatile__("membar #Sync\n\t"
			     "flush  %%g6" : : : "memory");
@@ -124,9 +122,8 @@ void __cpuinit smp_callin(void)
	while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
		rmb();
 
-	ipi_call_lock_irq();
	set_cpu_online(cpuid, true);
-	ipi_call_unlock_irq();
+	local_irq_enable();
 
	/* idle thread is expected to have preempt disabled */
	preempt_disable();
@@ -1308,9 +1305,7 @@ int __cpu_disable(void)
	mdelay(1);
	local_irq_disable();
 
-	ipi_call_lock();
	set_cpu_online(cpu, false);
-	ipi_call_unlock();
 
	cpu_map_rebuild();

From 72ec61a9e8347375e37c05e23511173d8451c3d4 Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:16:04 +0800
Subject: [PATCH 011/612] powerpc: SMP: Remove call to ipi_call_lock()/ipi_call_unlock()

ipi_call_lock()/ipi_call_unlock() lock and unlock call_function.lock,
respectively. That lock protects only the call_function data structure
itself and is completely unrelated to cpu_online_mask.

The mask to which the IPIs are sent is calculated before
call_function.lock is taken in smp_call_function_many(), so the locking
around set_cpu_online() is pointless and can be removed.

[ tglx: Massaged changelog ]

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: peterz@infradead.org
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1338275765-3217-10-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 arch/powerpc/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e4cb34322de4..e1417c42155c 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -571,7 +571,6 @@ void __devinit start_secondary(void *unused)
	if (system_state == SYSTEM_RUNNING)
		vdso_data->processorCount++;
 #endif
-	ipi_call_lock();
	notify_cpu_starting(cpu);
	set_cpu_online(cpu, true);
	/* Update sibling maps */
@@ -601,7 +600,6 @@ void __devinit start_secondary(void *unused)
		of_node_put(np);
	}
	of_node_put(l2_cache);
-	ipi_call_unlock();
 
	local_irq_enable();
From 8feb8e896d77439146d2e2ab3d0ab55bb5baf5fc Mon Sep 17 00:00:00 2001
From: Yong Zhang
Date: Tue, 29 May 2012 15:16:05 +0800
Subject: [PATCH 012/612] smp: Remove ipi_call_lock[_irq]()/ipi_call_unlock[_irq]()

There are no users of these APIs anymore; just remove them.

Signed-off-by: Yong Zhang
Cc: ralf@linux-mips.org
Cc: sshtylyov@mvista.com
Cc: david.daney@cavium.com
Cc: nikunj@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: axboe@kernel.dk
Cc: Andrew Morton
Link: http://lkml.kernel.org/r/1338275765-3217-11-git-send-email-yong.zhang0@gmail.com
Acked-by: Srivatsa S. Bhat
Acked-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
---
 include/linux/smp.h |  4 ----
 kernel/smp.c        | 20 --------------------
 2 files changed, 24 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 717fb746c9a8..a34d4f15430d 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -90,10 +90,6 @@ void kick_all_cpus_sync(void);
 void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
-void ipi_call_lock(void);
-void ipi_call_unlock(void);
-void ipi_call_lock_irq(void);
-void ipi_call_unlock_irq(void);
 #else
 static inline void call_function_init(void) { }
 #endif
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
-
-void ipi_call_lock(void)
-{
-	raw_spin_lock(&call_function.lock);
-}
-
-void ipi_call_unlock(void)
-{
-	raw_spin_unlock(&call_function.lock);
-}
-
-void ipi_call_lock_irq(void)
-{
-	raw_spin_lock_irq(&call_function.lock);
-}
-
-void ipi_call_unlock_irq(void)
-{
-	raw_spin_unlock_irq(&call_function.lock);
-}
 #endif /* USE_GENERIC_SMP_HELPERS */
 
 /* Setup configured maximum number of CPUs to activate */

From 43cc7e86f3200b094e2960b732623aeec00b482d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 15 May 2012 17:26:16 +0200
Subject: [PATCH 013/612] smp: Remove num_booting_cpus()

No users.

Signed-off-by: Thomas Gleixner
Cc: Srivatsa S. Bhat
Cc: Rusty Russell
---
 arch/m32r/include/asm/smp.h | 5 -----
 arch/x86/include/asm/smp.h  | 5 -----
 include/linux/smp.h         | 1 -
 3 files changed, 11 deletions(-)

diff --git a/arch/m32r/include/asm/smp.h b/arch/m32r/include/asm/smp.h
index cf7829a61551..c689b828dfe2 100644
--- a/arch/m32r/include/asm/smp.h
+++ b/arch/m32r/include/asm/smp.h
@@ -79,11 +79,6 @@ static __inline__ int cpu_number_map(int cpu)
	return cpu;
 }
 
-static __inline__ unsigned int num_booting_cpus(void)
-{
-	return cpumask_weight(&cpu_callout_map);
-}
-
 extern void smp_send_timer(void);
 extern unsigned long send_IPI_mask_phys(const cpumask_t*, int, int);
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f48394513c37..2ffa95dc2333 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
 void smp_store_cpu_info(int id);
 #define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
 
-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
-static inline int num_booting_cpus(void)
-{
-	return cpumask_weight(cpu_callout_mask);
-}
 #else /* !CONFIG_SMP */
 #define wbinvd_on_cpu(cpu)     wbinvd()
 static inline int wbinvd_on_all_cpus(void)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a34d4f15430d..dd6f06be3c9f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -177,7 +177,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
			} while (0)
 
 static inline void smp_send_reschedule(int cpu) { }
-#define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_many(mask, func, info, wait) \
			(up_smp_call_function(func, info))
From 7db971b235480849aa5b9209b67b62e987b3181b Mon Sep 17 00:00:00 2001
From: Ido Yariv
Date: Sun, 3 Jun 2012 01:11:34 +0300
Subject: [PATCH 014/612] x86/platform: Introduce APIC post-initialization
 callback

Some subarchitectures (such as vSMP) need to slightly adjust the
underlying APIC structure. Add an APIC post-initialization callback
to 'struct x86_platform_ops' for this purpose and use it for adjusting
the APIC structure on vSMP systems.

Signed-off-by: Ido Yariv
Acked-by: Shai Fultheim
Link: http://lkml.kernel.org/r/1338675095-27260-1-git-send-email-ido@wizery.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/x86_init.h |  2 ++
 arch/x86/kernel/apic/probe_32.c |  3 +++
 arch/x86/kernel/apic/probe_64.c | 11 ++---------
 arch/x86/kernel/vsmp_64.c       | 13 +++++++++++++
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c090af10ac7d..c377d9ccb696 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -164,6 +164,7 @@ struct x86_cpuinit_ops {
 * @i8042_detect			pre-detect if i8042 controller exists
 * @save_sched_clock_state:	save state for sched_clock() on suspend
 * @restore_sched_clock_state:	restore state for sched_clock() on resume
+ * @apic_post_init:		adjust apic if needed
 */
 struct x86_platform_ops {
	unsigned long (*calibrate_tsc)(void);
@@ -177,6 +178,7 @@ struct x86_platform_ops {
	int (*i8042_detect)(void);
	void (*save_sched_clock_state)(void);
	void (*restore_sched_clock_state)(void);
+	void (*apic_post_init)(void);
 };
 
 struct pci_dev;
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1b291da09e60..8616d5198e16 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -208,6 +208,9 @@ void __init default_setup_apic_routing(void)
 
	if (apic->setup_apic_routing)
		apic->setup_apic_routing();
+
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
 }
 
 void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe986698929..1793dba7a741 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
 #include
 #include
 
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
-{
-	return hard_smp_processor_id() >> index_msb;
-}
-
 /*
 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
 */
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
		}
	}
 
-	if (is_vsmp_box()) {
-		/* need to update phys_pkg_id */
-		apic->phys_pkg_id = apicid_phys_pkg_id;
-	}
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
 }
 
 /* Same for both flat and physical. */
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 8eeb55a551b4..59eea855f451 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -187,12 +187,25 @@ static void __init vsmp_cap_cpus(void)
 #endif
 }
 
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
+static void vsmp_apic_post_init(void)
+{
+	/* need to update phys_pkg_id */
+	apic->phys_pkg_id = apicid_phys_pkg_id;
+}
+
 void __init vsmp_init(void)
 {
	detect_vsmp_box();
	if (!is_vsmp_box())
		return;
 
+	x86_platform.apic_post_init = vsmp_apic_post_init;
+
	vsmp_cap_cpus();
 
	set_vsmp_pv_ops();
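The registration pattern is easiest to see end to end. A sketch modelled on
the vSMP hunks above (the my_* names are illustrative, not from the patch):

    #include <asm/apic.h>
    #include <asm/smp.h>
    #include <asm/x86_init.h>

    static int my_phys_pkg_id(int initial_apic_id, int index_msb)
    {
            return hard_smp_processor_id() >> index_msb;
    }

    /* Invoked from default_setup_apic_routing() once the 'apic'
     * driver has been chosen, so the override sticks to the final
     * structure rather than a candidate that may be replaced. */
    static void my_apic_post_init(void)
    {
            apic->phys_pkg_id = my_phys_pkg_id;
    }

    static void __init my_platform_setup(void)
    {
            x86_platform.apic_post_init = my_apic_post_init;
    }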
From c767a54ba0657e52e6edaa97cbe0b0a8bf1c1655 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Mon, 21 May 2012 19:50:07 -0700
Subject: [PATCH 015/612] x86/debug: Add KERN_<LEVEL> to bare printks, convert
 printks to pr_<level>

Use a more current logging style:

 - Bare printks should have a KERN_<LEVEL> for consistency's sake
 - Add pr_fmt where appropriate
 - Neaten some macro definitions
 - Convert some Ok output to OK
 - Use "%s: ", __func__ in pr_fmt for summit
 - Convert some printks to pr_<level>

Message output is not identical in all cases.

Signed-off-by: Joe Perches
Cc: levinsasha928@gmail.com
Link: http://lkml.kernel.org/r/1337655007.24226.10.camel@joe2Laptop
[ merged two similar patches, tidied up the changelog ]
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/floppy.h          |  2 +-
 arch/x86/include/asm/pci_x86.h         |  8 ++-
 arch/x86/include/asm/pgtable-2level.h  |  4 +-
 arch/x86/include/asm/pgtable-3level.h  |  6 +-
 arch/x86/include/asm/pgtable_64.h      |  8 +--
 arch/x86/kernel/alternative.c          | 17 +++--
 arch/x86/kernel/amd_nb.c               | 10 +--
 arch/x86/kernel/apic/io_apic.c         | 38 +++++-----
 arch/x86/kernel/apic/summit_32.c       | 22 +++---
 arch/x86/kernel/apm_32.c               | 29 ++++----
 arch/x86/kernel/cpu/bugs.c             | 20 +++---
 arch/x86/kernel/cpu/mcheck/mce.c       | 22 +++---
 arch/x86/kernel/cpu/perf_event_intel.c | 14 ++--
 arch/x86/kernel/dumpstack.c            |  4 +-
 arch/x86/kernel/dumpstack_32.c         | 24 +++----
 arch/x86/kernel/dumpstack_64.c         | 20 +++---
 arch/x86/kernel/irq.c                  |  4 +-
 arch/x86/kernel/module.c               | 32 +++++----
 arch/x86/kernel/pci-calgary_64.c       | 34 +++++----
 arch/x86/kernel/process.c              | 34 +++++----
 arch/x86/kernel/process_64.c           |  8 +--
 arch/x86/kernel/reboot.c               | 14 ++--
 arch/x86/kernel/signal.c               |  5 +-
 arch/x86/kernel/smpboot.c              | 97 ++++++++++++--------------
 arch/x86/kernel/traps.c                | 19 ++---
 arch/x86/kernel/tsc.c                  | 50 +++++++------
 arch/x86/kernel/vm86_32.c              |  6 +-
 arch/x86/kernel/vsyscall_64.c          | 17 ++---
 arch/x86/kernel/xsave.c                | 12 ++--
 29 files changed, 301 insertions(+), 279 deletions(-)
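Most of the churn below follows from one mechanism: <linux/printk.h> only
supplies a default pr_fmt() when the file has not defined its own, so a
definition placed before the includes prefixes every pr_<level>() call in
that file. A minimal sketch (the "apm: " prefix mirrors the apm_32.c hunk;
KBUILD_MODNAME is the variant used in amd_nb.c, mce.c and module.c):

    /* must come before the first include of <linux/printk.h> */
    #define pr_fmt(fmt) "apm: " fmt

    #include <linux/printk.h>

    static void report_disabled(void)
    {
            pr_notice("disabled on user request.\n");
            /* console output: "apm: disabled on user request." */
    }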
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5eac..d3d74698dce9 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
		virtual_dma_residue += virtual_dma_count;
		virtual_dma_count = 0;
 #ifdef TRACE_FLPY_INT
-		printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+		printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
		       virtual_dma_count, virtual_dma_residue, calls, bytes,
		       dma_wait);
		calls = 0;
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b3a531746026..5ad24a89b19b 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -7,9 +7,13 @@
 #undef DEBUG
 
 #ifdef DEBUG
-#define DBG(x...) printk(x)
+#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
 #else
-#define DBG(x...)
+#define DBG(fmt, ...)				\
+do {						\
+	if (0)					\
+		printk(fmt, ##__VA_ARGS__);	\
+} while (0)
 #endif
 
 #define PCI_PROBE_BIOS		0x0001
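The odd-looking do { if (0) printk(...); } while (0) replacement for an
empty DBG() is deliberate: the arguments stay visible to the compiler, so
format-string mismatches and otherwise-unused variables are still diagnosed
even when debugging is compiled out, while the optimizer removes the call;
the do/while keeps the macro a single statement. A self-contained sketch of
the same pattern (MYDBG/MYDRV_DEBUG are illustrative names):

    #include <linux/printk.h>

    #undef MYDRV_DEBUG

    #ifdef MYDRV_DEBUG
    #define MYDBG(fmt, ...) printk(KERN_DEBUG fmt, ##__VA_ARGS__)
    #else
    #define MYDBG(fmt, ...)                                \
    do {                                                   \
            if (0)                                         \
                    printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
    } while (0)
    #endif

    static void scan_bus(int bus)
    {
            /* type-checked either way; emits nothing unless MYDRV_DEBUG */
            MYDBG("scanning bus %d\n", bus);
    }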
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 98391db840c6..f2b489cf1602 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -2,9 +2,9 @@
 #define _ASM_X86_PGTABLE_2LEVEL_H
 
 #define pte_ERROR(e) \
-	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
+	pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+	pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
 
 /*
  * Certain architectures need to do special things when PTEs
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 43876f16caf1..f824cfbaa9d4 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -9,13 +9,13 @@
  */
 
 #define pte_ERROR(e) \
-	printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
+	pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
	       __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
 #define pmd_ERROR(e) \
-	printk("%s:%d: bad pmd %p(%016Lx).\n", \
+	pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
	       __FILE__, __LINE__, &(e), pmd_val(e))
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %p(%016Lx).\n", \
+	pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
	       __FILE__, __LINE__, &(e), pgd_val(e))
 
 /* Rules for using set_pte: the pte being assigned *must* be
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..8251be02301e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];
 extern void paging_init(void);
 
 #define pte_ERROR(e)					\
-	printk("%s:%d: bad pte %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pte %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pte_val(e))
 #define pmd_ERROR(e)					\
-	printk("%s:%d: bad pmd %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pmd %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pmd_val(e))
 #define pud_ERROR(e)					\
-	printk("%s:%d: bad pud %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pud %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pud_val(e))
 #define pgd_ERROR(e)					\
-	printk("%s:%d: bad pgd %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pgd %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pgd_val(e))
 
 struct mm_struct;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f0759..1729d720299a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
 #include
 #include
 #include
@@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, args...) if (debug_alternative) \
-	printk(KERN_DEBUG fmt, args)
+#define DPRINTK(fmt, ...)				\
+do {							\
+	if (debug_alternative)				\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+} while (0)
 
 /*
  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
@@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)
	 * If this still occurs then you should see a hang
	 * or crash shortly after this line:
	 */
-	printk("lockdep: fixing up alternatives.\n");
+	pr_info("lockdep: fixing up alternatives\n");
 #endif
 
	if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
@@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)
	if (smp == smp_mode) {
		/* nothing */
	} else if (smp) {
-		printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+		pr_info("switching to SMP code\n");
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
	} else {
-		printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+		pr_info("switching to UP code\n");
		set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
@@ -546,7 +551,7 @@ void __init alternative_instructions(void)
 #ifdef CONFIG_SMP
	if (smp_alt_once) {
		if (1 == num_possible_cpus()) {
-			printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+			pr_info("switching to UP code\n");
			set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
			set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591cc..f29f6dd6bc08 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
 * Shared support code for AMD K8 northbridges and derivates.
 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
 */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -258,7 +261,7 @@ void amd_flush_garts(void)
	}
	spin_unlock_irqrestore(&gart_lock, flags);
	if (!flushed)
-		printk("nothing to flush?\n");
+		pr_notice("nothing to flush?\n");
 }
 EXPORT_SYMBOL_GPL(amd_flush_garts);
 
@@ -269,11 +272,10 @@ static __init int init_amd_nbs(void)
 
	err = amd_cache_northbridges();
 
	if (err < 0)
-		printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+		pr_notice("Cannot enumerate AMD northbridges\n");
 
	if (amd_cache_gart() < 0)
-		printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
-		       "GART support disabled.\n");
+		pr_notice("Cannot initialize GART flush words, GART support disabled\n");
 
	return err;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac96561d1a99..5155d6f806f5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 
	entry = alloc_irq_pin_list(node);
	if (!entry) {
-		printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
-				node, apic, pin);
+		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+		       node, apic, pin);
		return -ENOMEM;
	}
	entry->apic = apic;
@@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
	ioapic_mask_entry(apic, pin);
	entry = ioapic_read_entry(apic, pin);
	if (entry.irr)
-		printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n",
+		pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
		       mpc_ioapic_id(apic), pin);
 }
 
@@ -895,7 +895,7 @@ static int irq_polarity(int idx)
		}
		case 2: /* reserved */
		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
			polarity = 1;
			break;
		}
@@ -906,7 +906,7 @@ static int irq_polarity(int idx)
		}
		default: /* invalid */
		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
			polarity = 1;
			break;
		}
@@ -948,7 +948,7 @@ static int irq_trigger(int idx)
		}
		default:
		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
			trigger = 1;
			break;
		}
@@ -962,7 +962,7 @@ static int irq_trigger(int idx)
		}
		case 2: /* reserved */
		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
			trigger = 1;
			break;
		}
@@ -973,7 +973,7 @@ static int irq_trigger(int idx)
		}
		default: /* invalid */
		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
			trigger = 0;
			break;
		}
@@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)
	 * Debugging check, we are in big trouble if this message pops up!
	 */
	if (mp_irqs[idx].dstirq != pin)
-		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+		pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
 
	if (test_bit(bus, mp_bus_not_pci)) {
		irq = mp_irqs[idx].srcbusirq;
@@ -1521,7 +1521,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
		reg_03.raw = io_apic_read(ioapic_idx, 3);
	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	printk("\n");
	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1578,7 +1577,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
			i,
			ir_entry->index
		);
-		printk("%1d %1d %1d %1d %1d "
+		pr_cont("%1d %1d %1d %1d %1d "
			"%1d %1d %X %02X\n",
			ir_entry->format,
			ir_entry->mask,
@@ -1598,7 +1597,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
			i,
			entry.dest
		);
-		printk("%1d %1d %1d %1d %1d "
+		pr_cont("%1d %1d %1d %1d %1d "
			"%1d %1d %02X\n",
			entry.mask,
			entry.trigger,
@@ -1651,8 +1650,8 @@ __apicdebuginit(void) print_IO_APICs(void)
			continue;
		printk(KERN_DEBUG "IRQ%d ", irq);
		for_each_irq_pin(entry, cfg->irq_2_pin)
-			printk("-> %d:%d", entry->apic, entry->pin);
-		printk("\n");
+			pr_cont("-> %d:%d", entry->apic, entry->pin);
+		pr_cont("\n");
	}
 
	printk(KERN_INFO ".................................... done.\n");
@@ -1665,9 +1664,9 @@ __apicdebuginit(void) print_APIC_field(int base)
	printk(KERN_DEBUG);
 
	for (i = 0; i < 8; i++)
-		printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+		pr_cont("%08x", apic_read(base + i*0x10));
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1769,7 +1768,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
		}
	}
-	printk("\n");
+	pr_cont("\n");
 }
 
 __apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -2065,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
		reg_00.raw = io_apic_read(ioapic_idx, 0);
		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
		if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
-			printk("could not set ID!\n");
+			pr_cont("could not set ID!\n");
		else
			apic_printk(APIC_VERBOSE, " ok.\n");
 }
@@ -3563,7 +3562,7 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
		/* Sanity check */
		if (reg_00.bits.ID != apic_id) {
-			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+			pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
+			       ioapic);
			return -1;
		}
	}
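The KERN_CONT-to-pr_cont() conversions in the io_apic and dumpstack hunks
all serve the same idiom: one printk() opens a record at a real log level,
and pr_cont() appends fragments to that same line instead of starting new,
prefix-bearing records. A sketch of the dump-style loop being converted
(dump_words() is an illustrative name):

    #include <linux/printk.h>

    static void dump_words(const unsigned long *stack, int n)
    {
            int i;

            printk(KERN_DEBUG "Stack:");
            for (i = 0; i < n; i++)
                    pr_cont(" %08lx", stack[i]);    /* same output line */
            pr_cont("\n");                          /* close the record */
    }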
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 659897c00755..e97d542ccdd0 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,6 +26,8 @@
 *
 */
 
+#define pr_fmt(fmt) "summit: %s: " fmt, __func__
+
 #include
 #include
 #include
@@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)
 
 static void summit_setup_apic_routing(void)
 {
-	printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
-						nr_ioapics);
+	pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
+		nr_ioapics);
 }
 
 static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -275,7 +277,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
-			printk("%s: Not a valid mask!\n", __func__);
+			pr_err("Not a valid mask!\n");
			return BAD_APICID;
		}
		apicid |= new_apicid;
@@ -355,7 +357,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
		}
	}
	if (i == rio_table_hdr->num_rio_dev) {
-		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
+		pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
		return last_bus;
	}
 
@@ -366,7 +368,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
		}
	}
	if (i == rio_table_hdr->num_scal_dev) {
-		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
+		pr_err("Couldn't find owner Twister for Cyclone!\n");
		return last_bus;
	}
 
@@ -396,7 +398,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
		num_buses = 9;
		break;
	default:
-		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
+		pr_info("Unsupported Winnipeg type!\n");
		return last_bus;
	}
 
@@ -411,13 +413,15 @@ static int build_detail_arrays(void)
	int i, scal_detail_size, rio_detail_size;
 
	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
-		printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+		pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
+			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
		return 0;
	}
 
	switch (rio_table_hdr->version) {
	default:
-		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
+		pr_warn("Invalid Rio Grande Table Version: %d\n",
+			rio_table_hdr->version);
		return 0;
	case 2:
		scal_detail_size = 11;
@@ -462,7 +466,7 @@ void setup_summit(void)
		offset = *((unsigned short *)(ptr + offset));
	}
	if (!rio_table_hdr) {
-		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
+		pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
		return;
	}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 07b0c0db466c..d65464e43503 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
 *	   http://www.microsoft.com/whdc/archive/amp_12.mspx]
 */
 
+#define pr_fmt(fmt) "apm: " fmt
+
 #include
 #include
 
@@ -485,11 +487,11 @@ static void apm_error(char *str, int err)
		if (error_table[i].key == err)
			break;
	if (i < ERROR_COUNT)
-		printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+		pr_notice("%s: %s\n", str, error_table[i].msg);
	else if (err < 0)
-		printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+		pr_notice("%s: linux error code %i\n", str, err);
	else
-		printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+		pr_notice("%s: unknown error code %#2.2x\n",
		       str, err);
 }
 
@@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
			static int notified;
 
			if (notified++ == 0)
-				printk(KERN_ERR "apm: an event queue overflowed\n");
+				pr_err("an event queue overflowed\n");
			if (++as->event_tail >= APM_MAX_EVENTS)
				as->event_tail = 0;
		}
@@ -1447,7 +1449,7 @@ static void apm_mainloop(void)
 static int check_apm_user(struct apm_user *as, const char *func)
 {
	if (as == NULL || as->magic != APM_BIOS_MAGIC) {
-		printk(KERN_ERR "apm: %s passed bad filp\n", func);
+		pr_err("%s passed bad filp\n", func);
		return 1;
	}
	return 0;
@@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)
		     as1 = as1->next)
			;
		if (as1 == NULL)
-			printk(KERN_ERR "apm: filp not in user list\n");
+			pr_err("filp not in user list\n");
		else
			as1->next = as->next;
	}
@@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)
	struct apm_user *as;
 
	as = kmalloc(sizeof(*as), GFP_KERNEL);
-	if (as == NULL) {
-		printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
-		       sizeof(*as));
+	if (as == NULL)
		return -ENOMEM;
-	}
+
	as->magic = APM_BIOS_MAGIC;
	as->event_tail = as->event_head = 0;
	as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2313,16 @@ static int __init apm_init(void)
	}
 
	if (apm_info.disabled) {
-		printk(KERN_NOTICE "apm: disabled on user request.\n");
+		pr_notice("disabled on user request.\n");
		return -ENODEV;
	}
	if ((num_online_cpus() > 1) && !power_off && !smp) {
-		printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+		pr_notice("disabled - APM is not SMP safe.\n");
		apm_info.disabled = 1;
		return -ENODEV;
	}
	if (!acpi_disabled) {
-		printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+		pr_notice("overridden by ACPI.\n");
		apm_info.disabled = 1;
		return -ENODEV;
	}
@@ -2356,8 +2356,7 @@ static int __init apm_init(void)
 
	kapmd_task = kthread_create(apm, NULL, "kapmd");
	if (IS_ERR(kapmd_task)) {
-		printk(KERN_ERR "apm: disabled - Unable to start kernel "
-		       "thread.\n");
+		pr_err("disabled - Unable to start kernel thread\n");
		err = PTR_ERR(kapmd_task);
		kapmd_task = NULL;
		remove_proc_entry("apm", NULL);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62ba..c97bb7b5a9f8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
 
	if (!boot_cpu_data.hard_math) {
 #ifndef CONFIG_MATH_EMULATION
-		printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
-		printk(KERN_EMERG "Giving up.\n");
+		pr_emerg("No coprocessor found and no math emulation present\n");
+		pr_emerg("Giving up\n");
		for (;;) ;
 #endif
		return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
	boot_cpu_data.fdiv_bug = fdiv_bug;
	if (boot_cpu_data.fdiv_bug)
-		printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
+		pr_warn("Hmm, FPU with FDIV bug\n");
 }
 
 static void __init check_hlt(void)
@@ -94,16 +94,16 @@ static void __init check_hlt(void)
	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
		return;
 
-	printk(KERN_INFO "Checking 'hlt' instruction... ");
+	pr_info("Checking 'hlt' instruction... ");
	if (!boot_cpu_data.hlt_works_ok) {
-		printk("disabled\n");
+		pr_cont("disabled\n");
		return;
	}
	halt();
	halt();
	halt();
	halt();
-	printk(KERN_CONT "OK.\n");
+	pr_cont("OK\n");
 }
 
 /*
@@ -116,7 +116,7 @@ static void __init check_popad(void)
 #ifndef CONFIG_X86_POPAD_OK
	int res, inp = (int) &res;
 
-	printk(KERN_INFO "Checking for popad bug... ");
+	pr_info("Checking for popad bug... ");
	__asm__ __volatile__(
	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
	  : "=&a" (res)
@@ -127,9 +127,9 @@ static void __init check_popad(void)
	 * CPU hard. Too bad.
	 */
	if (res != 12345678)
-		printk(KERN_CONT "Buggy.\n");
+		pr_cont("Buggy\n");
	else
-		printk(KERN_CONT "OK.\n");
+		pr_cont("OK\n");
 #endif
 }
 
@@ -161,7 +161,7 @@ void __init check_bugs(void)
 {
	identify_boot_cpu();
 #ifndef CONFIG_SMP
-	printk(KERN_INFO "CPU: ");
+	pr_info("CPU: ");
	print_cpu_info(&boot_cpu_data);
 #endif
	check_config();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0a687fd185e6..5623b4b5d511 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -210,7 +213,7 @@ static void drain_mcelog_buffer(void)
				cpu_relax();
 
				if (!m->finished && retries >= 4) {
-					pr_err("MCE: skipping error being logged currently!\n");
+					pr_err("skipping error being logged currently!\n");
					break;
				}
			}
@@ -1167,8 +1170,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 {
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
-	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
-		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+	       pfn);
 
	return 0;
 }
@@ -1358,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 
	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
-		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+		pr_info("CPU supports %d MCE banks\n", b);
 
	if (b > MAX_NR_BANKS) {
-		printk(KERN_WARNING
-		       "MCE: Using only %u machine check banks out of %u\n",
+		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}
@@ -1419,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void)
 static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
-		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}
 
@@ -1574,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void)
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
 }
 
@@ -1893,8 +1896,7 @@ static int __init mcheck_enable(char *str)
			get_option(&str, &monarch_timeout);
		}
	} else {
-		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
-		       str);
+		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 166546ec6aef..9e3f5d6e3d20 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,6 +5,8 @@
 *  among events on a single PMU.
 */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -1000,7 +1002,7 @@ static void intel_pmu_reset(void)
 
	local_irq_save(flags);
 
-	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+	pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
 
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
@@ -1638,14 +1640,14 @@ static __init void intel_clovertown_quirk(void)
	 * But taken together it might just make sense to not enable PEBS on
	 * these chips.
	 */
-	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	pr_warn("PEBS disabled due to CPU errata\n");
	x86_pmu.pebs = 0;
	x86_pmu.pebs_constraints = NULL;
 }
 
 static __init void intel_sandybridge_quirk(void)
 {
-	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	pr_warn("PEBS disabled due to CPU errata\n");
	x86_pmu.pebs = 0;
	x86_pmu.pebs_constraints = NULL;
 }
@@ -1667,8 +1669,8 @@ static __init void intel_arch_events_quirk(void)
	/* disable event that reported as not presend by cpuid */
	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
-		printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
-		       intel_arch_events_map[bit].name);
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			intel_arch_events_map[bit].name);
	}
 }
 
@@ -1687,7 +1689,7 @@ static __init void intel_nehalem_quirk(void)
		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
		ebx.split.no_branch_misses_retired = 0;
		x86_pmu.events_maskl = ebx.full;
-		printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+		pr_info("CPU erratum AAJ80 worked around\n");
	}
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 571246d81edf..87d3b5d663cd 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pB\n", (void *) address,
-			reliable ? "" : "? ", (void *) address);
+	pr_cont(" [<%p>] %s%pB\n",
+		(void *)address, reliable ? "" : "? ", (void *)address);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e0b1d783daab..3a8aced11ae9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
		if (kstack_end(stack))
			break;
		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %08lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %08lx", *stack++);
		touch_nmi_watchdog();
	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -89,9 +89,9 @@ void show_regs(struct pt_regs *regs)
	print_modules();
	__show_regs(regs, !user_mode_vm(regs));
 
-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
-		TASK_COMM_LEN, current->comm, task_pid_nr(current),
-		current_thread_info(), current, task_thread_info(current));
+	pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+		 TASK_COMM_LEN, current->comm, task_pid_nr(current),
+		 current_thread_info(), current, task_thread_info(current));
	/*
	 * When in-kernel, we also print out the stack and code at the
	 * time of the fault..
	 */
@@ -102,10 +102,10 @@ void show_regs(struct pt_regs *regs)
		unsigned char c;
		u8 *ip;
 
-		printk(KERN_EMERG "Stack:\n");
+		pr_emerg("Stack:\n");
		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
-		printk(KERN_EMERG "Code: ");
+		pr_emerg("Code:");
 
		ip = (u8 *)regs->ip - code_prologue;
		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +116,16 @@ void show_regs(struct pt_regs *regs)
		for (i = 0; i < code_len; i++, ip++) {
			if (ip < (u8 *)PAGE_OFFSET ||
					probe_kernel_address(ip, c)) {
-				printk(KERN_CONT " Bad EIP value.");
+				pr_cont(" Bad EIP value.");
				break;
			}
			if (ip == (u8 *)regs->ip)
-				printk(KERN_CONT "<%02x> ", c);
+				pr_cont(" <%02x>", c);
			else
-				printk(KERN_CONT "%02x ", c);
+				pr_cont(" %02x", c);
		}
	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 791b76122aa8..c582e9c5bd1a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
		if (stack >= irq_stack && stack <= irq_stack_end) {
			if (stack == irq_stack_end) {
				stack = (unsigned long *) (irq_stack_end[-1]);
-				printk(KERN_CONT " <EOI> ");
+				pr_cont(" <EOI> ");
			}
		} else {
		if (((long) stack & (THREAD_SIZE-1)) == 0)
			break;
		}
		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %016lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %016lx", *stack++);
		touch_nmi_watchdog();
	}
	preempt_enable();
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -256,8 +256,8 @@ void show_regs(struct pt_regs *regs)
	printk("CPU %d ", cpu);
	print_modules();
	__show_regs(regs, 1);
-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
-		cur->comm, cur->pid, task_thread_info(cur), cur);
+	printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n",
+	       cur->comm, cur->pid, task_thread_info(cur), cur);
 
	/*
	 * When in-kernel, we also print out the stack and code at the
@@ -284,16 +284,16 @@ void show_regs(struct pt_regs *regs)
			for (i = 0; i < code_len; i++, ip++) {
				if (ip < (u8 *)PAGE_OFFSET ||
						probe_kernel_address(ip, c)) {
-					printk(KERN_CONT " Bad RIP value.");
+					pr_cont(" Bad RIP value.");
					break;
				}
				if (ip == (u8 *)regs->ip)
-					printk(KERN_CONT "<%02x> ", c);
+					pr_cont("<%02x> ", c);
				else
-					printk(KERN_CONT "%02x ", c);
+					pr_cont("%02x ", c);
			}
		}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3dafc6003b7c..1f5f1d5d2a02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -294,9 +294,9 @@ void fixup_irqs(void)
		raw_spin_unlock(&desc->lock);
 
		if (break_affinity && set_affinity)
-			printk("Broke affinity for irq %i\n", irq);
+			pr_notice("Broke affinity for irq %i\n", irq);
		else if (!set_affinity)
-			printk("Cannot set affinity for irq %i\n", irq);
+			pr_notice("Cannot set affinity for irq %i\n", irq);
	}
 
	/*
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f21fd94ac897..202494d2ec6e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -30,9 +33,14 @@
 #include
 
 #if 0
-#define DEBUGP printk
+#define DEBUGP(fmt, ...)				\
+	printk(KERN_DEBUG fmt, ##__VA_ARGS__)
 #else
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt, ...)				\
+do {							\
+	if (0)						\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+} while (0)
 #endif
 
 void *module_alloc(unsigned long size)
@@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
	Elf32_Sym *sym;
	uint32_t *location;
 
-	DEBUGP("Applying relocate section %u to %u\n", relsec,
-	       sechdrs[relsec].sh_info);
+	DEBUGP("Applying relocate section %u to %u\n",
+	       relsec, sechdrs[relsec].sh_info);
	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
		/* This is where to make the change */
		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -77,7 +85,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
			*location += sym->st_value - (uint32_t)location;
			break;
		default:
-			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+			pr_err("%s: Unknown relocation: %u\n",
			       me->name, ELF32_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
@@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
	void *loc;
	u64 val;
 
-	DEBUGP("Applying relocate section %u to %u\n", relsec,
-	       sechdrs[relsec].sh_info);
+	DEBUGP("Applying relocate section %u to %u\n",
+	       relsec, sechdrs[relsec].sh_info);
	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
		/* This is where to make the change */
		loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
		       + ELF64_R_SYM(rel[i].r_info);
 
		DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
-		       (int)ELF64_R_TYPE(rel[i].r_info),
-		       sym->st_value, rel[i].r_addend, (u64)loc);
+		       (int)ELF64_R_TYPE(rel[i].r_info),
+		       sym->st_value, rel[i].r_addend, (u64)loc);
 
		val = sym->st_value + rel[i].r_addend;
 
@@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 #endif
			break;
		default:
-			printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
+			pr_err("%s: Unknown rela relocation: %llu\n",
			       me->name, ELF64_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
@@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
	return 0;
 
 overflow:
-	printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
+	pr_err("overflow in relocation type %d val %Lx\n",
	       (int)ELF64_R_TYPE(rel[i].r_info), val);
-	printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+	pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
	       me->name);
	return -ENOEXEC;
 }
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index b72838bae64a..299d49302e7d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
 
+#define pr_fmt(fmt) "Calgary: " fmt
+
 #include
 #include
 #include
@@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
	offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
				  npages, 0, boundary_size, 0);
	if (offset == ~0UL) {
-		printk(KERN_WARNING "Calgary: IOMMU full.\n");
+		pr_warn("IOMMU full\n");
		spin_unlock_irqrestore(&tbl->it_lock, flags);
		if (panic_on_overflow)
			panic("Calgary: fix the allocator.\n");
@@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
	entry = iommu_range_alloc(dev, tbl, npages);
 
	if (unlikely(entry == DMA_ERROR_CODE)) {
-		printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
-		       "iommu %p\n", npages, tbl);
+		pr_warn("failed to allocate %u pages in iommu %p\n",
			npages, tbl);
		return DMA_ERROR_CODE;
	}
 
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
		i++;
	} while ((val & 0xff)
!= 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("PCI bus not quiesced, continuing anyway\n"); /* invalidate TCE cache */ target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin: i++; } while ((val64 & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); /* 3. poll Page Migration DEBUG for SoftStopFault */ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin: if (++count < 100) goto begin; else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); return; /* pray for the best */ } } @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl) plssr = be32_to_cpu(readl(target)); /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); } static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) target = calgary_reg(bbar, phboff | 0x800); mck = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); + pr_emerg(""); for (i = 0; i < ARRAY_SIZE(errregs); i++) { /* err regs are at 0x810 - 0x870 */ erroff = (0x810 + (i * 0x10)); target = calgary_reg(bbar, phboff | erroff); errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); } - printk("\n"); + pr_cont("\n"); /* root complex status */ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e54e59..ef6a8456f719 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -145,16 +147,14 @@ void show_regs_common(void) /* Board Name is optional */ board = dmi_get_system_info(DMI_BOARD_NAME); - printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk(KERN_CONT " %s %s", vendor, product); - if (board) - printk(KERN_CONT "/%s", board); - printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + vendor, product, + board ? "/" : "", + board ? 
board : ""); } void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void) amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - printk(KERN_INFO "System has AMD C1E enabled\n"); + pr_info("System has AMD C1E enabled\n"); } } @@ -659,8 +659,7 @@ static void amd_e400_idle(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); - printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", - cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); } #endif if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) /* * One CPU supports mwait => All CPUs supports mwait */ - printk(KERN_INFO "using mwait in idle threads.\n"); + pr_info("using mwait in idle threads\n"); pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pr_info("using AMD E400 aware idle routine\n"); pm_idle = amd_e400_idle; } else pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str) return -EINVAL; if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); + pr_info("using polling idle threads\n"); pm_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf099..85151f3e36e2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); + pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); BUG(); } } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..ab3f06260712 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -152,7 +154,8 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; - printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "BIOS", d->ident); } return 0; } @@ -207,8 +210,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "PCI", d->ident); } return 0; } @@ -217,7 +220,8 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; - printk(KERN_INFO "%s series board detected. 
Selecting KBD-method for reboot.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + "KBD", d->ident); } return 0; } @@ -668,7 +672,7 @@ static void __machine_emergency_restart(int emergency) static void native_machine_restart(char *__unused) { - printk("machine restart\n"); + pr_notice("machine restart\n"); if (!reboot_force) machine_shutdown(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737053aa..b280908a376e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f4..456d64806c8f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /* * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 @@ -39,6 +39,8 @@ * Glauber Costa : i386 and x86_64 integration */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void) * boards) */ - pr_debug("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC()\n"); if (apic->smp_callin_clear_local_apic) apic->smp_callin_clear_local_apic(); setup_local_APIC(); @@ -420,17 +422,16 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - pr_debug("Before bogomips.\n"); + pr_debug("Before bogomips\n"); for_each_possible_cpu(cpu) if (cpumask_test_cpu(cpu, cpu_callout_mask)) bogosum += cpu_data(cpu).loops_per_jiffy; - printk(KERN_INFO - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - pr_debug("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1\n"); } void __inquire_remote_apic(int apicid) @@ -440,18 +441,17 @@ void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); + pr_info("Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); + pr_info("... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. 
*/ status = safe_apic_wait_icr_idle(); if (status) - printk(KERN_CONT - "a previous APIC delivery may have failed\n"); + pr_cont("a previous APIC delivery may have failed\n"); apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -464,10 +464,10 @@ void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk(KERN_CONT "%08x\n", status); + pr_cont("%08x\n", status); break; default: - printk(KERN_CONT "failed\n"); + pr_cont("failed\n"); } } } @@ -501,12 +501,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); } - pr_debug("NMI sent.\n"); + pr_debug("NMI sent\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -528,7 +528,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - pr_debug("Asserting INIT.\n"); + pr_debug("Asserting INIT\n"); /* * Turn INIT on target chip @@ -544,7 +544,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) mdelay(10); - pr_debug("Deasserting INIT.\n"); + pr_debug("Deasserting INIT\n"); /* Target chip */ /* Send IPI */ @@ -577,14 +577,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - pr_debug("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d\n", num_starts); for (j = 1; j <= num_starts; j++) { - pr_debug("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - pr_debug("After apic_write.\n"); + pr_debug("After apic_write\n"); /* * STARTUP IPI @@ -601,7 +601,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) */ udelay(300); - pr_debug("Startup point 1.\n"); + pr_debug("Startup point 1\n"); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -616,12 +616,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - pr_debug("After Startup.\n"); + pr_debug("After Startup\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -635,11 +635,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid) if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" Ok.\n"); + pr_cont(" OK\n"); current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -719,9 +719,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * allow APs to start initializing. 
*/ - pr_debug("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d\n", cpu); cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d.\n", cpu); + pr_debug("After Callout %d\n", cpu); /* * Wait 5s total for a response @@ -749,7 +749,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - pr_err("CPU%d: Not responding.\n", cpu); + pr_err("CPU%d: Not responding\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -794,7 +794,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { - printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -875,9 +875,8 @@ static int __init smp_sanity_check(unsigned max_cpus) unsigned int cpu; unsigned nr; - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_BIGSMP.\n"); + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); nr = 0; for_each_present_cpu(cpu) { @@ -898,8 +897,7 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING - "weird, boot CPU (#%d) not listed by the BIOS.\n", + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -911,11 +909,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!smp_found_config && !acpi_lapic) { preempt_enable(); - printk(KERN_NOTICE "SMP motherboard not detected.\n"); + pr_notice("SMP motherboard not detected\n"); disable_smp(); if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return -1; } @@ -924,9 +921,8 @@ static int __init smp_sanity_check(unsigned max_cpus) * CPU too, but we do it for the sake of robustness anyway. */ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - printk(KERN_NOTICE - "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); @@ -939,8 +935,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation." - "(tell your hw vendor)\n"); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } smpboot_clear_io_apic(); disable_ioapic_support(); @@ -953,7 +948,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated.\n"); + pr_info("SMP mode deactivated\n"); smpboot_clear_io_apic(); connect_bsp_APIC(); @@ -1005,7 +1000,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); + pr_info("SMP disabled\n"); disable_smp(); goto out; } @@ -1043,7 +1038,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. 
*/ - printk(KERN_INFO "CPU%d: ", 0); + pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); x86_init.timers.setup_percpu_clockev(); @@ -1093,7 +1088,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - pr_debug("Boot done.\n"); + pr_debug("Boot done\n"); nmi_selftest(); impress_friends(); @@ -1154,8 +1149,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - printk(KERN_WARNING - "%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1164,13 +1158,12 @@ __init void prefill_possible_map(void) if (!setup_max_cpus) #endif if (possible > i) { - printk(KERN_WARNING - "%d Processors exceeds max_cpus limit of %u\n", + pr_warn("%d Processors exceeds max_cpus limit of %u\n", possible, setup_max_cpus); possible = i; } - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d92f69c..b481341c9369 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@ /* * Handle hardware traps and faults. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -143,12 +146,11 @@ trap_signal: #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code) if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) conditional_sti(regs); #if 0 /* No need to warn about this any longer. 
*/ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147e3727..cfa5d4f7ca56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); tsc_disabled = 1; return 1; } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - printk("Fast TSC calibration failed\n"); + pr_err("Fast TSC calibration failed\n"); return 0; success: @@ -392,7 +393,7 @@ success: */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); - printk("Fast TSC calibration using PIT\n"); + pr_info("Fast TSC calibration using PIT\n"); return delta; } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void) * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { - printk(KERN_INFO - "TSC: PIT calibration matches %s. %d loops\n", - hpet ? "HPET" : "PMTIMER", i + 1); + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { - printk("TSC: No reference (HPET/PMTIMER) available\n"); + pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed.\n"); + pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ - printk(KERN_INFO "TSC: using %s reference calibration\n", - hpet ? "HPET" : "PMTIMER"); + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " - "Using PIT calibration\n"); + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void) * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; sched_clock_stable = 0; disable_sched_clock_irqtime(); - printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); + pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) goto out; tsc_khz = freq; - printk(KERN_INFO "Refined TSC clocksource calibration: " - "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); out: clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void) return; } - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); /* * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58ae71e8..54abcc0baf23 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { - printk("no vm86_info: BAD\n"); + pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); if (tmp) { - printk("vm86: could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0e1805..acdc125ad44e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@ * use the vDSO. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include
@@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - struct task_struct *tsk; - - if (!show_unhandled_signals || !__ratelimit(&rs)) + if (!show_unhandled_signals) return; - tsk = current; - - printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, tsk->comm, task_pid_nr(tsk), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149b2b0f..3d3e20709119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c
@@ -3,6 +3,9 @@ * * Author: Suresh Siddha */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include
@@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf) BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) - printk("save_i387_xstate: bad fpstate %p\n", buf); + pr_err("%s: bad fpstate %p\n", __func__, buf); if (!used_math()) return 0;
@@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pr_err("FP/SSE not shown under xsave features 0x%llx\n", pcntxt_mask); BUG(); }
@@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void) setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " - "cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); } /*
From 332afa656e76458ee9cf0f0d123016a0658539e4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 21 May 2012 16:58:01 -0700 Subject: [PATCH 016/612] x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain
Until now, irq_cfg domain is mostly static. Either all CPUs (used by flat mode) or one CPU (first CPU in the irq affinity mask) to which the irq is being migrated (this is used by the rest of the apic modes).
The upcoming x2apic cluster mode optimization patch allows the irq to be sent to any CPU in the x2apic cluster (if supported by the HW). So the irq_cfg domain changes on the fly (depending on which CPU in the x2apic cluster is online).
Instead of checking for any intersection between the new irq affinity mask and the current irq_cfg domain, check if the new irq affinity mask is a subset of the current irq_cfg domain. Otherwise proceed with updating the irq_cfg domain as well as assigning vectors on all the CPUs specified in the new mask.
This also cleans up a workaround in updating the irq_cfg domain for legacy irqs that are handled by the IO-APIC.
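For illustration, the subset test described above amounts to the following (a minimal sketch built on the generic cpumask API; the helper name is hypothetical and not part of this patch):

/*
 * Sketch only: decide whether an affinity change can keep the
 * currently assigned vector. "mask" is the requested affinity,
 * cfg->domain the set of CPUs the vector is already programmed on.
 */
static bool affinity_subset_of_domain(const struct cpumask *mask,
				      struct irq_cfg *cfg)
{
	cpumask_var_t tmp_mask;
	bool subset;

	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
		return false;

	/* Only online CPUs matter for the comparison */
	cpumask_and(tmp_mask, mask, cpu_online_mask);
	/* Subset => the current vector and domain stay untouched */
	subset = cpumask_subset(tmp_mask, cfg->domain);

	free_cpumask_var(tmp_mask);
	return subset;
}

When this test fails, __assign_irq_vector() goes on to rebuild cfg->domain and assign a vector covering all CPUs in the new mask, as the hunks below show.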
Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: gorcunov@openvz.org Cc: agordeev@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337644682-19854-1-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar
--- arch/x86/kernel/apic/io_apic.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a99..910a3118438b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c
@@ -1126,8 +1126,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) old_vector = cfg->vector; if (old_vector) { cpumask_and(tmp_mask, mask, cpu_online_mask); - cpumask_and(tmp_mask, cfg->domain, tmp_mask); - if (!cpumask_empty(tmp_mask)) { + if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); return 0; }
@@ -1141,6 +1140,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) apic->vector_allocation_domain(cpu, tmp_mask); + if (cpumask_subset(tmp_mask, cfg->domain)) { + free_cpumask_var(tmp_mask); + return 0; + } + vector = current_vector; offset = current_offset; next:
@@ -1346,13 +1350,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0 for legacy - * controllers like 8259. Now that IO-APIC can handle this irq, update - * the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) - apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) return;
From 0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 21 May 2012 16:58:02 -0700 Subject: [PATCH 017/612] x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination
If the HW implements round-robin interrupt delivery, this enables multiple CPUs (which are part of the user specified interrupt smp_affinity mask and belong to the same x2apic cluster) to service the interrupt. Also if the platform supports Power Aware Interrupt Routing, then this enables the interrupt to be routed to an idle cpu or a busy cpu depending on the perf/power bias tunable.
We are now grouping all the CPUs in a cluster into one vector domain. This will limit the total number of interrupt sources handled by Linux. Previously we supported "cpu-count * available-vectors-per-cpu" interrupt sources but this will now reduce to "cpu-count/16 * available-vectors-per-cpu".
Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: gorcunov@openvz.org Cc: agordeev@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337644682-19854-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar
--- arch/x86/include/asm/x2apic.h | 9 ----- arch/x86/kernel/apic/x2apic_cluster.c | 56 +++++++++++++++++++-------- arch/x86/kernel/apic/x2apic_phys.c | 9 +++++ 3 files changed, 48 insertions(+), 26 deletions(-)
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 92e54abf89e0..7a5a832a99b6 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h
@@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void) return 1; } -/* - * For now each logical cpu is in its own vector allocation domain.
- */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff0e1a7..90d999c7f2ea 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -98,34 +98,47 @@ static void x2apic_send_IPI_all(int vector) static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ int cpu = cpumask_first(cpumask); + u32 dest = 0; + int i; - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - else + if (cpu > nr_cpu_ids) return BAD_APICID; + + for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + + return dest; } static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask) { - int cpu; + u32 dest = 0; + u16 cluster; + int i; - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + dest = per_cpu(x86_cpu_to_logical_apicid, i); + cluster = x2apic_cluster(i); + break; } - return per_cpu(x86_cpu_to_logical_apicid, cpu); + if (!dest) + return BAD_APICID; + + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + if (cluster != x2apic_cluster(i)) + continue; + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + return dest; } static void init_x2apic_ldr(void) @@ -208,6 +221,15 @@ static int x2apic_cluster_probe(void) return 0; } +/* + * Each x2apic cluster is an allocation domain. + */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); +} + static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", @@ -225,7 +247,7 @@ static struct apic apic_x2apic_cluster = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982db275..93b25706f177 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -120,6 +120,15 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } +/* + * Each logical cpu is in its own vector allocation domain. 
+ */ +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + static struct apic apic_x2apic_phys = { .name = "physical x2apic", From 49d0c7a0a425a89190b7c3b1445faba9eb227bec Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:15 +0200 Subject: [PATCH 018/612] x86/apic: Trivial whitespace fixes Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120605112310.GA11443@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c46e8c8..de279b32ceb1 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -92,7 +92,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) } static void - flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 910a3118438b..74c569791e75 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1363,7 +1363,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, cfg->vector, irq, attr->trigger, attr->polarity, dest); if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); @@ -1466,7 +1466,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) * Set up the timer pin, possibly with the 8259A-master behind. 
*/ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) + unsigned int pin, int vector) { struct IO_APIC_route_entry entry; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 90d999c7f2ea..2919e45d30c3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) } static void - x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } From bf721d3a3bc7a731add45c8078b142b494ab413e Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:29 +0200 Subject: [PATCH 019/612] x86/apic: Factor out default target_cpus() operation Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120605112324.GA11449@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 5 +++++ arch/x86/include/asm/x2apic.h | 9 --------- arch/x86/kernel/apic/apic_flat_64.c | 14 ++------------ arch/x86/kernel/apic/apic_numachip.c | 7 +------ arch/x86/kernel/apic/bigsmp_32.c | 11 +---------- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +------ 8 files changed, 12 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed96..fc38195d6405 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -537,6 +537,11 @@ static inline const struct cpumask *default_target_cpus(void) #endif } +static inline const struct cpumask *online_target_cpus(void) +{ + return cpu_online_mask; +} + DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 7a5a832a99b6..f90f0a587c66 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -9,15 +9,6 @@ #include #include -/* - * Need to use more than cpu 0, because we need more vectors - * when MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - static int x2apic_apic_id_valid(int apicid) { return 1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index de279b32ceb1..61ac1afeff07 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,11 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static const struct cpumask *flat_target_cpus(void) -{ - return cpu_online_mask; -} - static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. 
Some cpus do not strictly honor the set of cpus @@ -186,7 +181,7 @@ static struct apic apic_flat = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = flat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, @@ -262,11 +257,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static const struct cpumask *physflat_target_cpus(void) -{ - return cpu_online_mask; -} - static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ -345,7 +335,7 @@ static struct apic apic_physflat = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = physflat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 6ec6d5d297c3..3255a60fcc95 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,11 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static const struct cpumask *numachip_target_cpus(void) -{ - return cpu_online_mask; -} - static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ -253,7 +248,7 @@ static struct apic apic_numachip __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = numachip_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 31fbdbfbf960..c288e81e00ff 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void) return 1; } -static const struct cpumask *bigsmp_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; @@ -205,7 +196,7 @@ static struct apic apic_bigsmp = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = bigsmp_target_cpus, + .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 2919e45d30c3..612622c47dfb 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -241,7 +241,7 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 93b25706f177..b1a8b39e3c3f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -140,7 +140,7 @@ static struct apic apic_x2apic_phys = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c 
index c6d03f7a4401..16efb92bfea5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,11 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static const struct cpumask *uv_target_cpus(void) -{ - return cpu_online_mask; -} - static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ -362,7 +357,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = uv_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, From 6398268d2bc454735f11e08705e858f9fdf5c750 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:44 +0200 Subject: [PATCH 020/612] x86/apic: Factor out default cpu_mask_to_apicid() operations Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120605112340.GA11454@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 13 ++++++--- arch/x86/kernel/apic/apic.c | 28 +++++++++++++++++++ arch/x86/kernel/apic/apic_flat_64.c | 40 +++------------------------- arch/x86/kernel/apic/apic_noop.c | 4 +-- arch/x86/kernel/apic/apic_numachip.c | 36 ++----------------------- arch/x86/kernel/apic/bigsmp_32.c | 30 ++------------------- arch/x86/kernel/apic/probe_32.c | 4 +-- arch/x86/kernel/apic/x2apic_phys.c | 36 ++----------------------- 8 files changed, 52 insertions(+), 139 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index fc38195d6405..bef571769e68 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -592,14 +592,14 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif static inline unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask) +flat_cpu_mask_to_apicid(const struct cpumask *cpumask) { return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; } static inline unsigned int -default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long mask2 = cpumask_bits(andmask)[0]; @@ -608,6 +608,13 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return (unsigned int)(mask1 & mask2 & mask3); } +extern unsigned int +default_cpu_mask_to_apicid(const struct cpumask *cpumask); + +extern unsigned int +default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask); + static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094af..96a2608252f1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,6 +2123,34 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } +unsigned int default_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. 
+ */ + cpu = cpumask_first(cpumask); + if (likely((unsigned)cpu < nr_cpu_ids)) + return per_cpu(x86_cpu_to_apicid, cpu); + + return BAD_APICID; +} + +unsigned int +default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + return per_cpu(x86_cpu_to_apicid, cpu); +} + /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 61ac1afeff07..55b97ce4fa19 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -205,8 +205,8 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, @@ -284,38 +284,6 @@ static void physflat_send_IPI_all(int vector) physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8) @@ -360,8 +328,8 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a6e4c6e06c08..7c3dd4fe0686 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -159,8 +159,8 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3255a60fcc95..dba4bf2ed566 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -152,38 +152,6 @@ static void numachip_send_IPI_self(int vector) __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int __init numachip_probe(void) { return apic == &apic_numachip; @@ -272,8 +240,8 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index c288e81e00ff..907aa3d112a6 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) return 1; } -/* As we are using single CPU as destination, pick only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu = cpumask_first(cpumask); - - if (cpu < nr_cpu_ids) - return cpu_physical_id(cpu); - return BAD_APICID; -} - -static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - return cpu_physical_id(cpu); - } - return BAD_APICID; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -220,8 +194,8 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, .send_IPI_mask_allbutself = NULL, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da09e60..71b6ac48ab26 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -123,8 +123,8 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b1a8b39e3c3f..f730269edef2 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - - return per_cpu(x86_cpu_to_apicid, cpu); -} - static void init_x2apic_ldr(void) { } @@ -164,8 +132,8 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, From fbd24153c48b8425b09c161a020483cd77da870e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 30 May 2012 18:40:03 -0600 Subject: [PATCH 021/612] x86/early_printk: Replace obsolete simple_strtoul() usage with kstrtoint() Change early_serial_init() to call kstrtoint()/kstrtouint() instead of the obsolete simple_strtoul(). Signed-off-by: Shuah Khan Cc: Joe Perches Link: http://lkml.kernel.org/r/1338424803.3569.5.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b9f18b49918..5e4771266f1a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -119,7 +119,7 @@ static __init void early_serial_init(char *s) unsigned char c; unsigned divisor; unsigned baud = DEFAULT_BAUD; - char *e; + ssize_t ret; if (*s == ',') ++s; @@ -127,14 +127,14 @@ if (*s) { unsigned port; if (!strncmp(s, "0x", 2)) { - early_serial_base = simple_strtoul(s, &e, 16); + ret = kstrtoint(s, 16, &early_serial_base); } else { static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) + ret = kstrtouint(s, 10, &port); + if (ret || port > 1) port = 0; early_serial_base = bases[port]; } @@ -149,8 +149,8 @@ outb(0x3, early_serial_base + MCR); /* DTR + RTS */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + ret = kstrtouint(s, 0, &baud); + if (ret || baud == 0) baud = DEFAULT_BAUD; } From ec44bc7acc3687ba6ae8154b4b5a845b70279237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: [PATCH 022/612] timers: Create detach_if_pending() and use it Most callers of detach_timer() have the same pattern around them: check whether the timer is pending and, if so, update base->next_timer. Create detach_if_pending() and replace the duplicated code.
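For illustration, the duplicated pattern being factored out looks roughly like this (a condensed sketch drawn from the diff that follows, not a verbatim excerpt):

    /* open-coded at several call sites: */
    if (timer_pending(timer)) {
            detach_timer(timer, clear_pending);
            if (timer->expires == base->next_timer &&
                !tbase_get_deferrable(timer->base))
                    base->next_timer = base->timer_jiffies;
            ret = 1;
    }

    /* after this patch, one helper captures the whole sequence: */
    ret = detach_if_pending(timer, base, clear_pending);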
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.131246037@linutronix.de --- kernel/timer.c | 56 +++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..0f70deb20151 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -654,8 +654,7 @@ void init_timer_deferrable_key(struct timer_list *timer, } EXPORT_SYMBOL(init_timer_deferrable_key); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -667,6 +666,19 @@ static inline void detach_timer(struct timer_list *timer, entry->prev = LIST_POISON2; } +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + return 1; +} + /* * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are @@ -712,16 +724,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; debug_activate(timer, expires); @@ -959,13 +964,7 @@ int del_timer(struct timer_list *timer) timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } + ret = detach_if_pending(timer, base, true); spin_unlock_irqrestore(&base->lock, flags); } @@ -990,19 +989,10 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); } -out: spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1178,7 +1168,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, 1); + detach_timer(timer, true); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1714,7 +1704,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); + detach_timer(timer, false); timer_set_base(timer, new_base); if (time_before(timer->expires, new_base->next_timer) && !tbase_get_deferrable(timer->base)) From facbb4a7efbd658046bf615f03cd97a1504785d8 Mon Sep 17 
00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: [PATCH 023/612] timers: Consolidate base->next_timer update Another bunch of mindlessly copied code. All callers of internal_add_timer() except the recascading code update base->next_timer. Move this into internal_add_timer() and let the cascading code call __internal_add_timer(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.189946224@linutronix.de --- kernel/timer.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 0f70deb20151..7207690b5353 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -330,7 +330,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -372,6 +373,17 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + /* + * Update base->next_timer if this is the earliest one. + */ + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) { @@ -757,9 +769,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -925,9 +934,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -1079,7 +1085,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); } return index; @@ -1706,9 +1713,6 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, false); timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } From 99d5f3aac674fe081ffddd2dbb8946ccbc14c410 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:58 +0000 Subject: [PATCH 024/612] timers: Add accounting of non deferrable timers The code in get_next_timer_interrupt() is suboptimal as it has to run through the cascade to find the next expiring timer.
On a completely idle core we should only do that when there is an active timer enqueued and base->next_timer does not give us a fast answer. Add accounting of the active timers to the now consolidated attach/detach code. I deliberately avoided sanity checks because the code is fully symmetric and any fiddling with timers w/o using the API functions will lead to cute explosions anyway. ulong is big enough even on 32bit and if we really run into the situation to have more than 1<<32 timers enqueued there, then we are definitely not in a state to go idle and run through that code. This allows us to fix another shortcoming of get_next_timer_interrupt(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.236377028@linutronix.de --- kernel/timer.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 7207690b5353..7fada698bd1a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -77,6 +77,7 @@ struct tvec_base { struct timer_list *running_timer; unsigned long timer_jiffies; unsigned long next_timer; + unsigned long active_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -377,11 +378,13 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { __internal_add_timer(base, timer); /* - * Update base->next_timer if this is the earliest one. + * Update base->active_timers and base->next_timer */ - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; + if (!tbase_get_deferrable(timer->base)) { + if (time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + base->active_timers++; + } } #ifdef CONFIG_TIMER_STATS @@ -678,6 +681,14 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->prev = LIST_POISON2; } +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + timer->base->active_timers--; +} + static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, bool clear_pending) { @@ -685,9 +696,11 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; + if (!tbase_get_deferrable(timer->base)) { + timer->base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } return 1; } @@ -1175,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, true); + detach_expired_timer(timer, base); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1701,6 +1714,7 @@ static int __cpuinit init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; + base->active_timers = 0; return 0; } @@ -1711,6 +1725,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); + /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer_set_base(timer, new_base); internal_add_timer(new_base, timer); From e40468a54882ef7411fb178dbf2e465ec2349af7
Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:59 +0000 Subject: [PATCH 025/612] timers: Improve get_next_timer_interrupt() Gilad reported at http://lkml.kernel.org/r/1336056962-10465-2-git-send-email-gilad@benyossef.com "Current timer code fails to correctly return a value meaning that there is no future timer event, with the result that the timer keeps getting re-armed in HZ one shot mode even when we could turn it off, generating unneeded interrupts. What is happening is that when __next_timer_interrupt() wishes to return a value that signifies "there is no future timer event", it returns (base->timer_jiffies + NEXT_TIMER_MAX_DELTA). However, the code in tick_nohz_stop_sched_tick(), which called __next_timer_interrupt() via get_next_timer_interrupt(), compares the return value to (last_jiffies + NEXT_TIMER_MAX_DELTA) to see if the timer needs to be re-armed. base->timer_jiffies != last_jiffies and so tick_nohz_stop_sched_tick() interprets the return value as indication that there is a distant future event 12 days from now and programs the timer to fire next after KTIME_MAX nsecs instead of avoiding to arm it. This ends up causing a needless interrupt once every KTIME_MAX nsecs." Fix this by using the new active timer accounting. This completely avoids scans when no active timer is enqueued, so we don't have to rely on base->next_timer and base->timer_jiffies anymore. Reported-by: Gilad Ben-Yossef Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.317535385@linutronix.de --- kernel/timer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 7fada698bd1a..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1326,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. */ if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; + return expires; + spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } spin_unlock(&base->lock); if (time_before_eq(expires, now)) From c00b275043adc14d668f36266b890f0c53d46640 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:44 +0200 Subject: [PATCH 026/612] uprobes: Optimize is_swbp_at_addr() for current->mm Change is_swbp_at_addr() to try to avoid the costly read_opcode() if mm == current->mm; __copy_from_user_inatomic() should succeed in the likely case. Currently this optimization is not important, but we are going to add more is_swbp_at_addr(current->mm) callers.
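Condensed, the fast path this adds looks like the sketch below (the full hunk follows); pagefault_disable() is what makes the inatomic copy legal here, since a fault makes the copy fail fast instead of sleeping:

    if (current->mm == mm) {
            pagefault_disable();
            result = __copy_from_user_inatomic(&opcode, (void __user *)vaddr,
                                               sizeof(opcode));
            pagefault_enable();
            if (result == 0)
                    goto out;       /* skipped get_user_pages() entirely */
    }
    result = read_opcode(mm, vaddr, &opcode);   /* slow path: pins the page */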
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192744.GA8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..d0f5ec0dcdea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -333,10 +333,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; From a3d7bb47937b3a40b9f0c75655e97b3bb6407cbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:59 +0200 Subject: [PATCH 027/612] uprobes: Change read_opcode() to use FOLL_FORCE set_orig_insn()->read_opcode() should not fail if the probed task did mprotect() after uprobe_register(); change it to use FOLL_FORCE. Without FOLL_WRITE this doesn't have any "side" effect but allows reading the !VM_READ memory. There is another reason for this change: we are going to use is_swbp_at_addr() from handle_swbp() which can race with another thread doing mprotect(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192759.GB8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d0f5ec0dcdea..a0dbc87a2ec6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -312,7 +312,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; From 3a9ea0520f38def4a3915b91f82455b749f07d88 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:28:57 +0200 Subject: [PATCH 028/612] uprobes: Introduce find_active_uprobe() helper No functional changes. Move the "find uprobe" code from handle_swbp() to the new helper, find_active_uprobe(). Note: with or without this change, the find-active-uprobe logic is not exactly right. We can race with another thread which unmaps the memory with the valid uprobe before we take mm->mmap_sem. We can't find this uprobe simply because find_vma() fails. In this case we wrongly assume that this trap was not caused by uprobe and send the erroneous SIGTRAP. See the next changes.
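The address translation the helper performs is worth spelling out; a minimal sketch of the arithmetic (the bp_vaddr_to_offset() name is hypothetical, the math is taken from the hunk below):

    static loff_t bp_vaddr_to_offset(struct vm_area_struct *vma,
                                     unsigned long bp_vaddr)
    {
            /* vm_pgoff is the file offset of vm_start, in pages */
            return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) +
                   (bp_vaddr - vma->vm_start);
    }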
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192857.GC8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 49 ++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a0dbc87a2ec6..eaf4d55fd424 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,37 +1489,46 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; + + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } + } + + srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); + current->uprobe_srcu_id = -1; + up_read(&mm->mmap_sem); + + return uprobe; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { - struct vm_area_struct *vma; struct uprobe_task *utask; struct uprobe *uprobe; - struct mm_struct *mm; unsigned long bp_vaddr; - uprobe = NULL; bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; - down_read(&mm->mmap_sem); - vma = find_vma(mm, bp_vaddr); - - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; - - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); - } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; - up_read(&mm->mmap_sem); + uprobe = find_active_uprobe(bp_vaddr); if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ From d790d34653ab20c74034902f5f0889bba807949a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:14 +0200 Subject: [PATCH 029/612] uprobes: Teach find_active_uprobe() to provide the "is_swbp" info A separate patch to simplify the review, and for the documentation. The patch adds another "int *is_swbp" argument to find_active_uprobe(), so far its only caller doesn't use this info. With this patch find_active_uprobe() additionally does: - if find_vma() + ->vm_start check fails, *is_swbp = -EFAULT - otherwise, if valid_vma() + find_uprobe() fails, it holds the result of is_swbp_at_addr(), can be negative too. The latter is only possible if we raced with another thread which did munmap/etc after we hit this bp. IOW. If find_active_uprobe(&is_swbp) returns NULL, the caller can look at is_swbp to figure out whether the current insn is bp or not, or detect the race with another thread if it is negative. Note: I think that performance-wise this change is fine. This adds is_swbp_at_addr(), but only if we raced with uprobe_unregister() or if we hit the "normal" int3 but this mm has uprobes as well. And even in this case the slow read_opcode() path is very unlikely, this insn recently triggered do_int3(), __copy_from_user_inatomic() shouldn't fail in the likely case. 
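As a sketch, the caller-side contract this creates looks like the following (illustrative only; the handle_swbp() patch two changes later adopts exactly this policy):

    uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
    if (!uprobe) {
            if (is_swbp > 0) {
                    /* genuine int3 that is not ours: SIGTRAP */
            } else if (is_swbp == 0) {
                    /* raced with uprobe_unregister(): restart the insn */
            } else {
                    /* negative (-EFAULT etc.): memory went away, restart */
            }
    }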
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192914.GD8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eaf4d55fd424..ee3df704e78a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,7 +1489,7 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -1497,7 +1497,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { struct inode *inode; @@ -1508,6 +1507,11 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) offset += (vma->vm_pgoff << PAGE_SHIFT); uprobe = find_uprobe(inode, offset); } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); @@ -1526,9 +1530,10 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ From 77fc4af1b59d12ab3b1467adf0a5204806853123 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:28 +0200 Subject: [PATCH 030/612] uprobes: Change register_for_each_vma() to take mm->mmap_sem for writing Change register_for_each_vma() to take mm->mmap_sem for writing. This is a bit unfortunate but hopefully not too bad; this is the slow path anyway. This is needed to ensure that find_active_uprobe() cannot race with uprobe_register() which adds the new bp at the same bp_vaddr, after find_uprobe() fails and before is_swbp_at_addr_fast() checks the memory. IOW, this is needed to ensure that if find_active_uprobe() returns NULL but is_swbp == true, we can safely assume that it was the "normal" int3 and we should send SIGTRAP. There is another reason for this change. We are going to replace uprobes_state->count with MMF_ flags set by register/unregister and cleared by find_active_uprobe(), and set/clear shouldn't race with each other.
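A sketch of the exclusion this buys (signatures abridged and simplified, not verbatim kernel code): the handler's read-side section can no longer observe a half-done registration:

    /* register path, after this patch: */
    down_write(&mm->mmap_sem);
    ret = install_breakpoint(uprobe, mm, vma, vaddr);
    up_write(&mm->mmap_sem);

    /* breakpoint-hit path: */
    down_read(&mm->mmap_sem);
    uprobe = find_uprobe(inode, offset);
    if (!uprobe)                    /* either the uprobe is visible here... */
            is_swbp = is_swbp_at_addr(mm, bp_vaddr);
    up_read(&mm->mmap_sem);         /* ...or the int3 is not yet installed  */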
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192928.GE8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ee3df704e78a..a2ed82b4808c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -853,12 +853,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) } mm = vi->mm; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, (unsigned long)vi->vaddr); if (!vma || !valid_vma(vma, is_register)) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -867,7 +867,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr != vi->vaddr) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -877,7 +877,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) else remove_breakpoint(uprobe, mm, vi->vaddr); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); if (is_register) { if (ret && ret == -EEXIST) From 56bb4cf6475d702d2fb00fc641aa6441097c0330 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:47 +0200 Subject: [PATCH 031/612] uprobes: Teach handle_swbp() to rely on "is_swbp" rather than uprobes_srcu Currently handle_swbp() assumes that it can't race with unregister, so it roughly does: if (find_uprobe(vaddr)) process_uprobe(); else send_sig(SIGTRAP); This relies on the not-really-working uprobes_srcu code we are going to remove, see the next patch. With this patch we rely on the result of is_swbp_at_addr(bp_vaddr) if find_uprobe() fails. If is_swbp == 1, then we hit the normal int3, we should send SIGTRAP. If is_swbp == 0, we raced with uprobe_unregister(), we simply restart this insn again. The "difficult" case is is_swbp == -EFAULT, when we can't read this memory. In this case I think we should restart too, and this is more correct compared to the current code which sends SIGTRAP. Ignoring ENOMEM/etc from get_user_pages(), this can only happen if another thread unmaps this memory before find_active_uprobe() takes mmap_sem. It would be better to pretend it was unmapped before this insn was executed, restart, and get SIGSEGV. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192947.GF8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a2ed82b4808c..1f02e3bbfc1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1530,14 +1530,26 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. 
*/ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. + */ + instruction_pointer_set(regs, bp_vaddr); + } return; } From 778b032d96909690c19d84f8d17c13be65ed6f8e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:30:08 +0200 Subject: [PATCH 032/612] uprobes: Kill uprobes_srcu/uprobe_srcu_id Kill the no longer needed uprobes_srcu/uprobe_srcu_id code. It doesn't really work anyway. synchronize_srcu() can only synchronize with the code "inside" the srcu_read_lock/srcu_read_unlock section, while uprobe_pre_sstep_notifier() does srcu_read_lock() _after_ we already hit the breakpoint. I guess this probably works "in practice". synchronize_srcu() is slow and it implies synchronize_sched(), and the probed task enters the non-preemptible section at the start of the exception handler. Still this is not right at least in theory, and task->uprobe_srcu_id bloats task_struct. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529193008.GG8057@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/events/uprobes.c | 22 +++------------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..6bd19655c1a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1569,7 +1569,6 @@ struct task_struct { #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; - int uprobe_srcu_id; #endif }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1f02e3bbfc1d..8c5e043cd309 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -38,7 +38,6 @@ #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -738,20 +737,14 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe().
*/ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -1388,9 +1381,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1408,7 +1398,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1513,9 +1502,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); return uprobe; @@ -1656,7 +1642,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1691,7 +1676,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } From 5a425294ee7d4ab5a374248e85838dfd450caf75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 15:30:31 +0200 Subject: [PATCH 033/612] perf/x86: Fix Intel shared extra MSR allocation Zheng Yan reported that event group validation can wreck event state when Intel extra_reg allocation changes event state. Validation shouldn't change any persistent state. Cloning events in validate_{event,group}() isn't really pretty either, so add a few special cases to avoid modifying the event state. The code is restructured to minimize the special case impact. 
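Reduced to a sketch, the rule the restructured code follows is: mutate persistent state only when the cpuc is real (condensed from the diff below, not a complete excerpt):

    if (!cpuc->is_fake) {
            if (idx != reg->idx)
                    intel_fixup_er(event, idx);     /* rewrites event->hw */
            reg->alloc = 1;         /* persistent, survives validation */
    }
    /* on a fake cpuc, i.e. under validate_{event,group}(), both steps are
     * skipped so that validation leaves the event untouched */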
Reported-by: Zheng Yan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 92 ++++++++++++++++++-------- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..cb608383e4f6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf54493..83794d8e6af0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..965baa2fa790 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. 
+ */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; From 0780c927a02492f917a74f51f3c801c76a637c57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 034/612] perf/x86: Implement cycles:p for SNB/IVB Now that there's finally a chip with working PEBS (IvyBridge), we can enable the hardware and implement cycles:p for SNB/IVB. 
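From userspace this path is requested through perf_event_attr.precise_ip; a minimal sketch (the sample period is an arbitrary illustrative value, not from this patch):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* ask for PEBS-backed, reduced-skid cycle samples ("cycles:p") */
    static int open_precise_cycles(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.precise_ip = 1;    /* one :p level of skid constraint */

            return syscall(__NR_perf_event_open, &attr,
                           0 /* pid: self */, -1 /* cpu: any */, -1, 0);
    }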
Cc: Stephane Eranian Requested-and-tested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 50 +++++++++++++++++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83794d8e6af0..7241e2fc3c17 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -365,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 965baa2fa790..2312c1ff1b19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. @@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. 
+ */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; From 47a8863dbb11745446314ca126593789ab74d93a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 035/612] perf/x86: Enable/Add IvyBridge hardware support Implement rudimentary IVB perf support. The SDM states it's identical to SNB with the exception of the exact event tables, but a quick look suggests they're similar enough. Also mark SNB-EP as broken for now. Requested-and-tested-by: Linus Torvalds Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2312c1ff1b19..187c294bc658 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); From 212d95dfdb66e5c81879b08e4f7fbfc8498b1ab5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 036/612] perf/x86: Update SNB PEBS constraints Afaict there's no need to (incompletely) iterate the MEM_UOPS_RETIRED.* umask state.
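The reason a single entry suffices: INTEL_EVENT_CONSTRAINT() matches on the event-select field only, while INTEL_UEVENT_CONSTRAINT() matches event plus umask. A sketch with the mask values from the kernel headers of this era (abridged):

    #define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL /* event code    */
    #define INTEL_ARCH_EVENT_MASK       0x0000FFFFULL /* umask | event */

    /* INTEL_EVENT_CONSTRAINT(0xd0, 0xf) tests (config & 0xff) == 0xd0, so
     * 0x11d0, 0x21d0, ..., 0x82d0 (every MEM_UOP_RETIRED.* umask) hit the
     * one entry; the removed INTEL_UEVENT_* rows tested config & 0xffff
     * and therefore had to enumerate each umask separately. */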
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e5..35e2192df9f4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ From 1c2ac3fde3e35279958e7b0408e2dcf866465301 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 May 2012 15:25:34 +0200 Subject: [PATCH 037/612] perf/x86: Fix wrmsrl() debug wrapper Move the wrmsrl() debug wrapper to the common header now that all the include games are gone. Also clean it up a bit to avoid multiple evaluation of the argument.
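The "multiple evaluation" problem is the classic macro pitfall; a small self-contained illustration (userspace stand-ins, not kernel code):

    #include <stdio.h>

    #define WRMSRL_BAD(msr, val) do { \
            printf("trace: %lx\n", (unsigned long)(val)); /* 1st eval */ \
            printf("write: %lx\n", (unsigned long)(val)); /* 2nd eval */ \
    } while (0)

    #define WRMSRL_OK(msr, val) do { \
            unsigned long _val = (val); /* argument evaluated exactly once */ \
            printf("trace: %lx\n", _val); \
            printf("write: %lx\n", _val); \
    } while (0)

    int main(void)
    {
            unsigned long x = 1;

            WRMSRL_BAD(0, x++);     /* side effect runs twice: x ends at 3 */
            WRMSRL_OK(0, x++);      /* runs once: x ends at 4 */
            return 0;
    }

The cleaned-up kernel wrapper in the diff below binds msr and val to local temporaries for the same reason.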
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-l4gkfnivwv4yi5mqxjlovymx@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 11 ----------- arch/x86/kernel/cpu/perf_event.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c011..43c2017347e7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -35,17 +35,6 @@ #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7241e2fc3c17..23b5710b1747 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 +14,18 @@ #include +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + /* * | NHM/WSM | SNB | * register ------------------------------- From 1ff4d58a192aea7f245981e2579765f961f6eb9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Jun 2012 17:56:50 -0700 Subject: [PATCH 038/612] x86: Add rdpmcl() Add a version of rdpmc() that directly reads into a u64. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338944211-28275-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 2 ++ arch/x86/include/asm/paravirt.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274cd..e489c1475be9 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -237,6 +237,8 @@ do { \ (high) = (u32)(_l >> 32); \ } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf52707..14ce05dfe04e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -252,6 +252,8 @@ do { \ high = _l >> 32; \ } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) + static inline unsigned long long paravirt_rdtscp(unsigned int *aux) { return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); From c48b60538c3ba05a7a2713c4791b25405525431b Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Thu, 1 Mar 2012 17:28:14 -0500 Subject: [PATCH 039/612] perf/x86: Use rdpmc() rather than rdmsr() when possible in the kernel The rdpmc instruction is faster than the equivalent rdmsr call, so use it when possible in the kernel. The perfctr kernel patches did this, after extensive testing showed rdpmc to always be faster (one can look in etc/costs in the perfctr-2.6 package to see a historical list of the overhead).
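For reference, rdpmc selects a counter via ECX and returns its value in EDX:EAX; a sketch of the raw operation (the bit-30 flag matches the "| 1<<30" fixed-counter encoding used in the diff below):

    static inline unsigned long long rdpmc_read(int ecx)
    {
            unsigned int lo, hi;

            /* bit 30 of ECX selects the fixed-function counter space */
            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (ecx));
            return lo | ((unsigned long long)hi << 32);
    }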
I have done some tests on a 3.2 kernel, the kernel module I used was included in the first posting of this patch: rdmsr rdpmc Core2 T9900: 203.9 cycles 30.9 cycles AMD fam0fh: 56.2 cycles 9.8 cycles Atom 6/28/2: 129.7 cycles 50.6 cycles The speedup of using rdpmc is large. [ It's probably possible (and desirable) to do this without requiring a new field in the hw_perf_event structure, but the fixed events make this tricky. ] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1203011724030.26934@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 +++- include/linux/perf_event.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 43c2017347e7..000a4746c7ce 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -75,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -819,9 +819,11 @@ static inline void x86_assign_hw_event(struct perf_event *event, } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx); } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 45db49f64bb4..1ce887abcc5c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -677,6 +677,7 @@ struct hw_perf_event { u64 last_tag; unsigned long config_base; unsigned long event_base; + int event_base_rdpmc; int idx; int last_cpu; From 70ab7003dec58afeae7f5d681dfa309b3a259f03 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Jun 2012 17:56:48 -0700 Subject: [PATCH 040/612] perf/x86: Don't assume there can be only 4 PEBS events On Sandy Bridge in non HT mode there are 8 counters available. Since every counter can write a PEBS record assuming there are 4 max is incorrect. Use the reported counter number -- with an upper limit for a static array -- instead. Also I made the warning messages a bit more informational. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338944211-28275-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 3 ++- arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 23b5710b1747..3df3de9452a9 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -69,7 +69,7 @@ struct amd_nb { }; /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS 8 /* * A debug store configuration. 
@@ -378,6 +378,7 @@ struct x86_pmu { void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 187c294bc658..e23e71f25264 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1800,6 +1800,8 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 35e2192df9f4..026373edef7f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -620,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); From f2f12b6fc032c7b1419fd6db84e2868b5f05a878 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Jun 2012 10:50:06 -0600 Subject: [PATCH 041/612] iommu/amd: Fix missing iommu_shutdown initialization in passthrough mode The iommu_shutdown callback is not initialized when the AMD IOMMU driver runs in passthrough mode. Fix that by moving the callback initialization before the check for passthrough mode. 
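The bug shape, reduced to a sketch with hypothetical names: initialization placed after an early exit never runs on the path that takes the exit, so it has to move above the branch:

    static int __init init(void)
    {
            setup_common();         /* needed in all modes, so it must */
                                    /* precede the early out below     */
            if (pass_through_mode)
                    goto out;
            setup_full();
    out:
            return 0;
    }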
Signed-off-by: Shuah Khan Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 542024ba6dba..c04ddca7f12f 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1641,6 +1641,8 @@ static int __init amd_iommu_init(void) amd_iommu_init_api(); + x86_platform.iommu_shutdown = disable_iommus; + if (iommu_pass_through) goto out; @@ -1649,8 +1651,6 @@ static int __init amd_iommu_init(void) else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); - x86_platform.iommu_shutdown = disable_iommus; - out: return ret; From 7fbb98c5cb07563d3ee08714073a8e5452a96be2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 10:21:21 -0400 Subject: [PATCH 042/612] x86: Save cr2 in NMI in case NMIs take a page fault Avi Kivity reported that page faults in NMIs could cause havoc if the NMI preempted another page fault handler: The recent changes to NMI allow exceptions to take place in NMI handlers, but I think that a #PF (say, due to access to vmalloc space) is still problematic. Consider the sequence #PF (cr2 set by processor) NMI ... #PF (cr2 clobbered) do_page_fault() IRET ... IRET do_page_fault() address = read_cr2() The last line reads the overwritten cr2 value. Originally I wrote a patch to solve this by saving the cr2 on the stack. Brian Gerst suggested saving it in the r12 register as both r12 and rbx are saved by the do_nmi handler as required by the C ABI. But rbx is already used for saving if swapgs needs to be run on exit of the NMI handler. Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com Link: http://lkml.kernel.org/r/1337763411.13348.140.camel@gandalf.stny.rr.com Reported-by: Avi Kivity Cc: Linus Torvalds Cc: H. Peter Anvin Cc: Thomas Gleixner Suggested-by: Brian Gerst Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51be..111f6bbd8b38 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1758,10 +1758,30 @@ end_repeat_nmi: */ call save_paranoid DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: From 1f975f78c84c852e09463a2dfa57e3174e5c719e Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 1 Jun 2012 16:52:35 +0200 Subject: [PATCH 043/612] x86, pvops: Remove hooks for {rd,wr}msr_safe_regs There were paravirt_ops hooks for the full register set variant of {rd,wr}msr_safe which are actually not used by anyone anymore. Remove them to make the code cleaner and avoid silent breakages when the pvops members were uninitialized.
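For context, the register-array convention these helpers use (visible in the msr.h hunk below): the caller loads the MSR index into the ECX slot and, for the AMD variants, a passcode into EDI, then reassembles the 64-bit result from EDX:EAX. A condensed sketch:

    u32 gprs[8] = { 0 };    /* eax, ecx, edx, ebx, esp, ebp, esi, edi */
    u64 val;
    int err;

    gprs[1] = msr;          /* ecx: MSR index */
    gprs[7] = 0x9c5a203a;   /* edi: AMD passcode */
    err = rdmsr_safe_regs(gprs);
    val = gprs[0] | ((u64)gprs[2] << 32);   /* edx:eax */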
This has been boot-tested natively and under Xen with PVOPS enabled and disabled on one machine. Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1338562358-28182-2-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 67 +++++++++++---------------- arch/x86/include/asm/paravirt.h | 39 ---------------- arch/x86/include/asm/paravirt_types.h | 2 - arch/x86/kernel/paravirt.c | 2 - arch/x86/lib/msr-reg-export.c | 4 +- arch/x86/lib/msr-reg.S | 10 ++-- arch/x86/xen/enlighten.c | 2 - 7 files changed, 35 insertions(+), 91 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274cd..81860cc012d1 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 regs[8]); -extern int native_wrmsr_safe_regs(u32 regs[8]); +extern int rdmsr_safe_regs(u32 regs[8]); +extern int wrmsr_safe_regs(u32 regs[8]); static __always_inline unsigned long long __native_read_tsc(void) { @@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = native_rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return native_wrmsr_safe_regs(gprs); -} - -static inline int rdmsr_safe_regs(u32 regs[8]) -{ - return native_rdmsr_safe_regs(regs); -} - -static inline int wrmsr_safe_regs(u32 regs[8]) -{ - return native_wrmsr_safe_regs(regs); -} - #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) @@ -248,6 +211,32 @@ do { \ #endif /* !CONFIG_PARAVIRT */ +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + u32 gprs[8] = { 0 }; + int err; + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf52707..ebb0cdb60a89 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } -static inline int paravirt_rdmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); -} - static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } -static inline int paravirt_wrmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); -} - /* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ @@ -176,9 +166,6 @@ do { \ _err; \ }) -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) - static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; @@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = paravirt_read_msr(msr, &err); return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = paravirt_rdmsr_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return paravirt_wrmsr_regs(gprs); -} static inline u64 paravirt_read_tsc(void) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987ee..8613cbb7ba41 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -153,9 +153,7 @@ struct pv_cpu_ops { /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ u64 (*read_msr)(unsigned int msr, int *err); - int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - int (*wrmsr_regs)(u32 *regs); u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce885996fd7..17fff18a1031 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index a311cc59b65d..8d6ef78b5d01 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,5 +1,5 @@ #include #include -EXPORT_SYMBOL(native_rdmsr_safe_regs); -EXPORT_SYMBOL(native_wrmsr_safe_regs); +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 69fa10623f21..f6d13eefad10 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -6,13 +6,13 @@ #ifdef CONFIG_X86_64 /* - * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushq_cfi %rbx pushq_cfi %rbp @@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #else /* X86_32 */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushl_cfi %ebx pushl_cfi %ebp @@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e74df9548a02..60f1131eb94f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1116,9 +1116,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = 
{ .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc,
From ecd431d95aa04257e977fd6878e4203ce462e7e9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 16:52:36 +0200 Subject: [PATCH 044/612] x86, cpu: Fix show_msr MSR accessing function There's no real reason why, when showing the MSRs on a CPU at boot time, we should be using the AMD-specific variant. Simply use the generic safe one which handles #GPs just fine. Cc: Yinghai Lu Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1338562358-28182-3-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429ba..5bbc082c47ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -947,7 +947,7 @@ static void __cpuinit __print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); }
From 169e9cbd77db23fe50bc8ba68bf081adb67b4220 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 1 Jun 2012 16:52:37 +0200 Subject: [PATCH 045/612] x86, cpu, amd: Fix crash as Xen Dom0 on AMD Trinity systems f7f286a910221 ("x86/amd: Re-enable CPU topology extensions in case BIOS has disabled it") wrongfully added code which used the AMD-specific {rd,wr}msr variants for no real reason. This caused boot panics on Xen, which wasn't initializing the {rd,wr}msr_safe_regs pv_ops members properly. This, in turn, caused a heated discussion leading to us reviewing all uses of the AMD-specific variants and removing them where unneeded (almost everywhere except an obscure K8 BIOS fix, see 6b0f43ddfa358). Finally, this patch switches to the standard {rd,wr}msr*_safe* variants which should've been used in the first place anyway and would have avoided the unneeded excitement with Xen. Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1338562358-28182-4-git-send-email-bp@amd64.org Cc: Andreas Herrmann Link: [Boris: correct and expand commit message] Signed-off-by: Borislav Petkov Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb6218eec..80ccd99542e6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -586,9 +586,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_TOPOEXT)) { u64 val; - if (!rdmsrl_amd_safe(0xc0011005, &val)) { + if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - wrmsrl_amd_safe(0xc0011005, val); + checking_wrmsrl(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT);
From 2c929ce6f1ed1302be225512b433e6a6554f71a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 16:52:38 +0200 Subject: [PATCH 046/612] x86, cpu, amd: Deprecate AMD-specific MSR variants Now that all users of {rd,wr}msr_amd_safe have been fixed, deprecate their use by making them private to amd.c and adding warnings when they are used on anything other than K8.
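The warnings rely on WARN_ONCE(); a behavioural sketch, assuming the generic include/asm-generic/bug.h semantics:

	/*
	 * The condition is evaluated on every call and is also the return
	 * value; the stack trace is printed only the first time it is true.
	 * Execution continues either way, so the helper still performs the
	 * MSR access on non-K8 parts, it just complains once.
	 */
	bool misused = WARN_ONCE(c->x86 != 0xf,
				 "%s should only be used on K8!\n", __func__);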
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1338562358-28182-5-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 27 --------------------------- arch/x86/kernel/cpu/amd.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 81860cc012d1..cb33b5f00267 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -211,33 +211,6 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return wrmsr_safe_regs(gprs); -} - #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80ccd99542e6..c928eb26ada6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@ #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause From 715c85b1fc824e9cd0ea07d6ceb80d2262f32e90 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 Jun 2012 13:32:04 -0700 Subject: [PATCH 047/612] x86, cpu: Rename checking_wrmsrl() to wrmsrl_safe() Rename checking_wrmsrl() to wrmsrl_safe(), to match the naming convention used by all the other MSR access functions/macros. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++--- arch/x86/kernel/cpu/perf_event_p4.c | 14 +++++++------- arch/x86/kernel/cpu/perf_event_p6.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/vdso/vdso32-setup.c | 6 +++--- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index cb33b5f00267..fe83d74a920d 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -211,7 +211,7 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ +#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c928eb26ada6..9d92e19039f0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -621,7 +621,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - checking_wrmsrl(0xc0011005, val); + wrmsrl_safe(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -712,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); if (err == 0) { mask |= (1 << 10); - checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..4e3ba9cb5a4e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -222,7 +222,7 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret = wrmsrl_safe(x86_pmu_event_addr(0), val); ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); if (ret || val != val_new) goto msr_fail; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..7789aa37c746 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,11 +1003,11 @@ static void intel_pmu_reset(void) printk("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); - checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); if (ds) ds->bts_index = ds->bts_buffer_base; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a73dd73..6c82e4037989 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! 
* - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); */ } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7dd230..e4dd0f7a0453 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf099..3e215ba68766 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gs = addr; if (doit) { load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } } put_cpu(); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 66e6d9359826..0faad646f5fd 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -205,9 +205,9 @@ void syscall32_cpu_init(void) { /* Load these always in case some future AMD CPU supports SYSENTER from compat mode too. 
*/ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } From 9d8e10667624ea6411f04495aef1fa4a8a778ee8 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:14:49 +0200 Subject: [PATCH 048/612] x86/apic: Factor out default vector_allocation_domain() operation Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131449.GC4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 21 +++++++++++++++++++++ arch/x86/kernel/apic/apic_flat_64.c | 22 +--------------------- arch/x86/kernel/apic/apic_noop.c | 3 +-- arch/x86/kernel/apic/apic_numachip.c | 8 +------- arch/x86/kernel/apic/bigsmp_32.c | 8 +------- arch/x86/kernel/apic/es7000_32.c | 19 ++----------------- arch/x86/kernel/apic/numaq_32.c | 16 +--------------- arch/x86/kernel/apic/probe_32.c | 17 +---------------- arch/x86/kernel/apic/summit_32.c | 16 +--------------- arch/x86/kernel/apic/x2apic_phys.c | 11 +---------- arch/x86/kernel/apic/x2apic_uv_x.c | 8 +------- 11 files changed, 32 insertions(+), 117 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bef571769e68..feb2dbdae9ec 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -615,6 +615,27 @@ extern unsigned int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask); +static inline void +flat_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +} + +static inline void +default_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_copy(retmask, cpumask_of(cpu)); +} + static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 55b97ce4fa19..bddc92566d0a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,20 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* * Set up the logical destination ID. 
* @@ -257,12 +243,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { default_send_IPI_mask_sequence_phys(cpumask, vector); @@ -309,7 +289,7 @@ static struct apic apic_physflat = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = physflat_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7c3dd4fe0686..3e43cf528939 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -104,8 +104,7 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + cpumask_copy(retmask, cpumask_of(cpu)); } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index dba4bf2ed566..c028132ad358 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,12 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; @@ -222,7 +216,7 @@ static struct apic apic_numachip __refconst = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = numachip_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 907aa3d112a6..df342fe4d6aa 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } /* NULL entry stops DMI scanning */ }; -static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int probe_bigsmp(void) { if (def_to_bigsmp) @@ -176,7 +170,7 @@ static struct apic apic_bigsmp = { .check_apicid_used = bigsmp_check_apicid_used, .check_apicid_present = bigsmp_check_apicid_present, - .vector_allocation_domain = bigsmp_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index db4ab1be3c79..3c42865757e2 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. 
Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - - static void es7000_wait_for_init_deassert(atomic_t *deassert) { while (!atomic_read(deassert)) @@ -638,7 +623,7 @@ static struct apic __refdata apic_es7000_cluster = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr_cluster, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -705,7 +690,7 @@ static struct apic __refdata apic_es7000 = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f00a68cca37a..eb2d466fd81a 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -441,20 +441,6 @@ static int probe_numaq(void) return found_numaq; } -static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - static void numaq_setup_portio_remap(void) { int num_quads = num_online_nodes(); @@ -491,7 +477,7 @@ static struct apic __refdata apic_numaq = { .check_apicid_used = numaq_check_apicid_used, .check_apicid_present = numaq_check_apicid_present, - .vector_allocation_domain = numaq_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = numaq_init_apic_ldr, .ioapic_phys_id_map = numaq_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 71b6ac48ab26..2c6f003b2e4b 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void) #endif } -static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* - * Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* should be called last. 
*/ static int probe_default(void) { @@ -105,7 +90,7 @@ static struct apic apic_default = { .check_apicid_used = default_check_apicid_used, .check_apicid_present = default_check_apicid_present, - .vector_allocation_domain = default_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c00755..35d254c1fec2 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -320,20 +320,6 @@ static int probe_summit(void) return 0; } -static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - #ifdef CONFIG_X86_SUMMIT_NUMA static struct rio_table_hdr *rio_table_hdr; static struct scal_detail *scal_devs[MAX_NUMNODES]; @@ -509,7 +495,7 @@ static struct apic apic_summit = { .check_apicid_used = summit_check_apicid_used, .check_apicid_present = summit_check_apicid_present, - .vector_allocation_domain = summit_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = summit_init_apic_ldr, .ioapic_phys_id_map = summit_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f730269edef2..f109388a0e80 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -88,15 +88,6 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -/* - * Each logical cpu is in its own vector allocation domain. 
- */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static struct apic apic_x2apic_phys = { .name = "physical x2apic", @@ -114,7 +105,7 @@ static struct apic apic_x2apic_phys = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 16efb92bfea5..df89a7d78748 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,12 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP @@ -363,7 +357,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = uv_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL,
From 1bccd58bfffc5a677051937b332b71f0686187c1 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:15 +0200 Subject: [PATCH 049/612] x86/apic: Try to spread IRQ vectors to different priority levels When assigning a new vector, this is done primarily by adding 8 to the previously given out vector number. Hence, two consecutively allocated vector numbers would likely fall into the same priority level. Try to spread vector numbers to different priority levels better by changing the step from 8 to 16. Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131514.GD4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74c569791e75..05af3d341aaa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1112,7 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 8; + static int current_offset = VECTOR_OFFSET_START % 16; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; @@ -1148,10 +1148,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) vector = current_vector; offset = current_offset; next: - vector += 8; + vector += 16; if (vector >= first_system_vector) { - /* If out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; + offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector))
From 8637e38aff14d048b649075114023023a2e80fba Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:44 +0200 Subject: [PATCH 050/612] x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector() In case of static vector allocation domains (i.e.
flat) if all vector numbers are exhausted, an attempt to assign a new vector will lead to useless scans through all CPUs in the cpumask, even though it is known that each new pass would fail. Make this corner case less painful by letting vector_allocation_domain() report whether the vector allocation domain depends on the passed arguments or not, and stop scanning early. The same could have been achieved by introducing a static flag to the apic operations. But let's allow vector_allocation_domain() to have more intelligence here and decide dynamically, in case we need it in the future. Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131542.GE4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 +++++--- arch/x86/kernel/apic/apic_noop.c | 3 ++- arch/x86/kernel/apic/io_apic.c | 12 +++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index feb2dbdae9ec..e3fecd50d5ca 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,7 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -615,7 +615,7 @@ extern unsigned int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask); -static inline void +static inline bool flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus @@ -628,12 +628,14 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) */ cpumask_clear(retmask); cpumask_bits(retmask)[0] = APIC_ALL_CPUS; + return false; } -static inline void +static inline bool default_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_copy(retmask, cpumask_of(cpu)); + return true; } static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 3e43cf528939..ac9edf247b15 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,11 +100,12 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + cpumask_copy(retmask, cpumask_of(cpu)); + return true; } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 05af3d341aaa..4061a7dee5c9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1137,8 +1137,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; + bool more_domains; - apic->vector_allocation_domain(cpu, tmp_mask); + more_domains = apic->vector_allocation_domain(cpu, tmp_mask); if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); @@ -1153,8 +1154,13 @@ next: offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } -
if (unlikely(current_vector == vector)) - continue; + + if (unlikely(current_vector == vector)) { + if (more_domains) + continue; + else + break; + } if (test_bit(vector, used_vectors)) goto next;
From ff164324123c0fe181d8de7dadcc7b3fbe25f2cf Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:59 +0200 Subject: [PATCH 051/612] x86/apic: Make cpu_mask_to_apicid() operations return error code The current cpu_mask_to_apicid() and cpu_mask_to_apicid_and() implementations have a few shortcomings:

1. A value returned by cpu_mask_to_apicid() is written to hardware registers unconditionally. Should BAD_APICID ever get returned, it will be written to the hardware too. But the value of BAD_APICID is not universal across all hardware in all modes and might cause unexpected results, i.e. interrupts might get routed to CPUs that are not configured to receive them.

2. Because the value of BAD_APICID is not universal, it is counter-intuitive to return it for hardware where it does not make sense (i.e. x2apic).

3. The cpu_mask_to_apicid_and() operation is thought of as a complement to cpu_mask_to_apicid() that merely applies an AND mask on top of the cpumask being passed. Yet, as a consequence of commit 18374d8, the two operations are inconsistent: cpu_mask_to_apicid() must not be passed an offline CPU in the cpumask, while cpu_mask_to_apicid_and() must not fail and return BAD_APICID.

These limitations are impossible to recognize just from looking at the operations' prototypes. Most of these shortcomings are resolved by returning an error code instead of BAD_APICID. As a result, faults are reported back early, rather than leaving room for unexpected behaviour (as in case 1 above). The only exception is the setup_timer_IRQ0_pin() routine. Although obviously at odds with this fix, its existing behaviour is preserved so as not to break the fragile check_timer(), and it would be better addressed in a separate fix.
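The resulting caller pattern, condensed here from the __ioapic_set_affinity() hunk below (a sketch, not a complete function): check the error and roll the vector assignment back instead of programming a bogus destination into the hardware.

	unsigned int dest;
	int err;

	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, &dest);
	if (err) {
		/* try to restore the previous affinity before bailing out */
		if (assign_irq_vector(irq, cfg, data->affinity))
			pr_err("Failed to recover vector for irq %d\n", irq);
		return err;
	}
	/* only now is 'dest' known-good and safe to write to the chip */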
Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131559.GF4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 44 ++++++++++---- arch/x86/kernel/apic/apic.c | 35 ++++++----- arch/x86/kernel/apic/es7000_32.c | 21 ++++--- arch/x86/kernel/apic/io_apic.c | 88 +++++++++++++++++---------- arch/x86/kernel/apic/numaq_32.c | 14 +++-- arch/x86/kernel/apic/summit_32.c | 22 ++++--- arch/x86/kernel/apic/x2apic_cluster.c | 24 +++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 27 +++++--- arch/x86/platform/uv/uv_irq.c | 7 ++- drivers/iommu/intel_irq_remapping.c | 13 +++- 10 files changed, 189 insertions(+), 106 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e3fecd50d5ca..ae91f9c7e360 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -331,9 +331,11 @@ struct apic { unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); - unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, - const struct cpumask *andmask); + int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, + unsigned int *apicid); + int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid); /* ipi */ void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -591,29 +593,45 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif -static inline unsigned int -flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +__flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) { - return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + cpu_mask &= APIC_ALL_CPUS; + if (likely(cpu_mask)) { + *apicid = (unsigned int)cpu_mask; + return 0; + } else { + return -EINVAL; + } } -static inline unsigned int +static inline int +flat_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid) +{ + return __flat_cpu_mask_to_apicid(cpumask_bits(cpumask)[0], apicid); +} + +static inline int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long mask2 = cpumask_bits(andmask)[0]; unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; - return (unsigned int)(mask1 & mask2 & mask3); + return __flat_cpu_mask_to_apicid(mask1 & mask2 & mask3, apicid); } -extern unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask); +extern int +default_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid); -extern unsigned int +extern int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask); + const struct cpumask *andmask, + unsigned int *apicid); static inline bool flat_vector_allocation_domain(int cpu, struct cpumask *retmask) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 96a2608252f1..b8d92606f84f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,24 +2123,26 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -unsigned int default_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) { - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } else { + return -EINVAL; + } } -unsigned int -default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) +int default_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid) +{ + int cpu = cpumask_first(cpumask); + return __default_cpu_to_apicid(cpu, apicid); +} + +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { int cpu; @@ -2148,7 +2150,8 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu); + + return __default_cpu_to_apicid(cpu, apicid); } /* diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3c42865757e2..515ebb00a9fc 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -525,7 +525,8 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; int cpu, uninitialized_var(apicid); @@ -539,31 +540,33 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); - return BAD_APICID; + return -EINVAL; } apicid = new_apicid; round++; } - return apicid; + *dest_id = apicid; + return 0; } -static unsigned int +static int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = es7000_cpu_mask_to_apicid(cpumask); + es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4061a7dee5c9..0deb773404e5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1359,7 +1359,14 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), + &dest)) { + pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1474,6 +1481,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; + unsigned int dest; if (irq_remapping_enabled) return; @@ -1484,9 +1492,12 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical 
delivery to get the timer IRQ * to the first CPU. */ + if (unlikely(apic->cpu_mask_to_apicid(apic->target_cpus(), &dest))) + dest = BAD_APICID; + entry.dest_mode = apic->irq_dest_mode; entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.dest = dest; entry.delivery_mode = apic->irq_delivery_mode; entry.polarity = 0; entry.trigger = 0; @@ -2245,16 +2256,25 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, unsigned int *dest_id) { struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; if (!cpumask_intersects(mask, cpu_online_mask)) - return -1; + return -EINVAL; - if (assign_irq_vector(data->irq, data->chip_data, mask)) - return -1; + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } cpumask_copy(data->affinity, mask); - *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); return 0; } @@ -3040,7 +3060,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, if (err) return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; if (irq_remapped(cfg)) { compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); @@ -3361,6 +3384,8 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; int err; if (disable_apic) @@ -3368,36 +3393,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus()); + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? 
+ HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_msg(irq, &msg); + write_ht_irq_msg(irq, &msg); - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - } - return err; + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + + return 0; } #endif /* CONFIG_HT_IRQ */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index eb2d466fd81a..2b55514c328b 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,16 +406,20 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +numaq_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } -static inline unsigned int +static int numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 35d254c1fec2..5766d84f12d6 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -263,7 +263,8 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; int cpu, apicid = 0; @@ -276,30 +277,33 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); - return BAD_APICID; + return -EINVAL; } apicid |= new_apicid; round++; } - return apicid; + *dest_id = apicid; + return 0; } -static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) +static int +summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = summit_cpu_mask_to_apicid(cpumask); + summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } /* diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 612622c47dfb..5f86f79335f4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -96,24 +96,26 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { int cpu = cpumask_first(cpumask); - u32 dest = 0; 
int i; - if (cpu > nr_cpu_ids) - return BAD_APICID; + if (cpu >= nr_cpu_ids) + return -EINVAL; + *apicid = 0; for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) - dest |= per_cpu(x86_cpu_to_logical_apicid, i); + *apicid |= per_cpu(x86_cpu_to_logical_apicid, i); - return dest; + return 0; } -static unsigned int +static int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { u32 dest = 0; u16 cluster; @@ -128,7 +130,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, } if (!dest) - return BAD_APICID; + return -EINVAL; for_each_cpu_and(i, cpumask, andmask) { if (!cpumask_test_cpu(i, cpu_online_mask)) @@ -138,7 +140,9 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, dest |= per_cpu(x86_cpu_to_logical_apicid, i); } - return dest; + *apicid = dest; + + return 0; } static void init_x2apic_ldr(void) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index df89a7d78748..2f3030fef31e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -269,23 +269,31 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int __uv_cpu_to_apicid(int cpu, unsigned int *apicid) +{ + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } else { + return -EINVAL; + } +} + +static int +uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { /* * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - else - return BAD_APICID; + return __uv_cpu_to_apicid(cpu, apicid); } -static unsigned int +static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { int cpu; @@ -297,7 +305,8 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + + return __uv_cpu_to_apicid(cpu, apicid); } static unsigned int x2apic_get_apic_id(unsigned long x) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c9..dd1ff39a464c 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; + unsigned int dest; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + err = apic->cpu_mask_to_apicid(eligible_cpu, &dest); + if (err != 0) + return err; + if (limit == UV_AFFINITY_CPU) irq_set_status_flags(irq, IRQ_NO_BALANCING); else @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->dest = dest; mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c index 6d347064b8b0..dafbad06390a 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -924,6 +924,7 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_cfg *cfg = data->chip_data; unsigned int dest, irq = data->irq; struct irte irte; + int err; if (!cpumask_intersects(mask, cpu_online_mask)) return -EINVAL; @@ -931,10 +932,16 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, if (get_irte(irq, &irte)) return -EBUSY; - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)); + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); From 4988a40c3981212fa8c64da68722affc1cb6697a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:16:25 +0200 Subject: [PATCH 052/612] x86/apic: Make cpu_mask_to_apicid() operations check cpu_online_mask Currently cpu_mask_to_apicid() should not get a offline CPU with the cpumask. Otherwise some apic drivers might try to access non-existent per-cpu variables (i.e. x2apic). In that regard cpu_mask_to_apicid() and cpu_mask_to_apicid_and() operations are inconsistent. This fix makes the two operations do not rely on calling functions and always return the apicid for only online CPUs. As result, the meaning and implementations of cpu_mask_to_apicid() and cpu_mask_to_apicid_and() operations become straight. Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131624.GG4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++---- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 3 +-- arch/x86/kernel/apic/summit_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 6 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ae91f9c7e360..1ed3eead2039 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -596,7 +596,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) static inline int __flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) { - cpu_mask &= APIC_ALL_CPUS; + cpu_mask = cpu_mask & APIC_ALL_CPUS & cpumask_bits(cpu_online_mask)[0]; if (likely(cpu_mask)) { *apicid = (unsigned int)cpu_mask; return 0; @@ -619,9 +619,7 @@ flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long mask2 = cpumask_bits(andmask)[0]; - unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; - - return __flat_cpu_mask_to_apicid(mask1 & mask2 & mask3, apicid); + return __flat_cpu_mask_to_apicid(mask1 & mask2, apicid); } extern int diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b8d92606f84f..7e9bbe73bc5a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2136,7 +2136,7 @@ static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) int default_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - int cpu = cpumask_first(cpumask); + int cpu = 
cpumask_first_and(cpumask, cpu_online_mask); return __default_cpu_to_apicid(cpu, apicid); } diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 515ebb00a9fc..b35cfb9b6962 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -534,7 +534,7 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) /* * The cpus in the mask must all be on the apic cluster. */ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { @@ -561,7 +561,6 @@ es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 5766d84f12d6..79d360f6729e 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) /* * The cpus in the mask must all be on the apic cluster. */ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { @@ -298,7 +298,6 @@ summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 5f86f79335f4..23a46cf5b6fd 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -99,7 +99,7 @@ static void x2apic_send_IPI_all(int vector) static int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - int cpu = cpumask_first(cpumask); + int cpu = cpumask_first_and(cpumask, cpu_online_mask); int i; if (cpu >= nr_cpu_ids) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2f3030fef31e..307aa076bd62 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -286,7 +286,7 @@ uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - int cpu = cpumask_first(cpumask); + int cpu = cpumask_first_and(cpumask, cpu_online_mask); return __uv_cpu_to_apicid(cpu, apicid); } From bf947fcb77ff858f223c49c76e2d130095fa2585 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Mon, 4 Jun 2012 17:29:01 -0400 Subject: [PATCH 053/612] iommu/dmar: Replace printks with appropriate pr_*() Just some cleanup so next patch can keep the info printing the same way throughout the file. Replace printk(KERN_* with pr_*() functions. 
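[ Note: illustrative sketch, not part of the commit. The conversion is
  mechanical; pr_warn(), pr_info() and pr_err() are wrappers from
  include/linux/printk.h that prepend the matching KERN_* level, e.g.:

	printk(KERN_WARNING PREFIX "Unsupported device scope\n");

  becomes

	pr_warn(PREFIX "Unsupported device scope\n");

  The PREFIX string is kept at this stage; a later patch in this series
  drops it in favour of a pr_fmt() definition. ]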
Signed-off-by: Donald Dutile Cc: iommu@lists.linux-foundation.org Cc: chrisw@redhat.com Cc: suresh.b.siddha@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1338845342-12464-2-git-send-email-ddutile@redhat.com Signed-off-by: Ingo Molnar --- drivers/iommu/dmar.c | 83 +++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 3a74e4410fc0..1e5a10de3471 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -83,15 +83,14 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, * ignore it */ if (!bus) { - printk(KERN_WARNING - PREFIX "Device scope bus [%d] not found\n", - scope->bus); + pr_warn(PREFIX "Device scope bus [%d] not found\n", + scope->bus); break; } pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn)); if (!pdev) { - printk(KERN_WARNING PREFIX - "Device scope device [%04x:%02x:%02x.%02x] not found\n", + pr_warn(PREFIX "Device scope device" + "[%04x:%02x:%02x.%02x] not found\n", segment, bus->number, path->dev, path->fn); break; } @@ -100,9 +99,9 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, bus = pdev->subordinate; } if (!pdev) { - printk(KERN_WARNING PREFIX - "Device scope device [%04x:%02x:%02x.%02x] not found\n", - segment, scope->bus, path->dev, path->fn); + pr_warn(PREFIX + "Device scope device [%04x:%02x:%02x.%02x] not found\n", + segment, scope->bus, path->dev, path->fn); *dev = NULL; return 0; } @@ -110,8 +109,7 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, pdev->subordinate) || (scope->entry_type == \ ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) { pci_dev_put(pdev); - printk(KERN_WARNING PREFIX - "Device scope type does not match for %s\n", + pr_warn(PREFIX "Device scope type does not match for %s\n", pci_name(pdev)); return -EINVAL; } @@ -134,8 +132,7 @@ int __init dmar_parse_dev_scope(void *start, void *end, int *cnt, scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) (*cnt)++; else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) { - printk(KERN_WARNING PREFIX - "Unsupported device scope\n"); + pr_warn(PREFIX "Unsupported device scope\n"); } start += scope->length; } @@ -261,25 +258,23 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) case ACPI_DMAR_TYPE_HARDWARE_UNIT: drhd = container_of(header, struct acpi_dmar_hardware_unit, header); - printk (KERN_INFO PREFIX - "DRHD base: %#016Lx flags: %#x\n", + pr_info(PREFIX "DRHD base: %#016Lx flags: %#x\n", (unsigned long long)drhd->address, drhd->flags); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: rmrr = container_of(header, struct acpi_dmar_reserved_memory, header); - printk (KERN_INFO PREFIX - "RMRR base: %#016Lx end: %#016Lx\n", + pr_info(PREFIX "RMRR base: %#016Lx end: %#016Lx\n", (unsigned long long)rmrr->base_address, (unsigned long long)rmrr->end_address); break; case ACPI_DMAR_TYPE_ATSR: atsr = container_of(header, struct acpi_dmar_atsr, header); - printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags); + pr_info(PREFIX "ATSR flags: %#x\n", atsr->flags); break; case ACPI_DMAR_HARDWARE_AFFINITY: rhsa = container_of(header, struct acpi_dmar_rhsa, header); - printk(KERN_INFO PREFIX "RHSA base: %#016Lx proximity domain: %#x\n", + pr_info(PREFIX "RHSA base: %#016Lx proximity domain: %#x\n", (unsigned long long)rhsa->base_address, rhsa->proximity_domain); break; @@ -299,7 +294,7 @@ static int __init dmar_table_detect(void) &dmar_tbl_size); if (ACPI_SUCCESS(status) && 
!dmar_tbl) { - printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); + pr_warn(PREFIX "Unable to map DMAR\n"); status = AE_NOT_FOUND; } @@ -333,20 +328,18 @@ parse_dmar_table(void) return -ENODEV; if (dmar->width < PAGE_SHIFT - 1) { - printk(KERN_WARNING PREFIX "Invalid DMAR haw\n"); + pr_warn(PREFIX "Invalid DMAR haw\n"); return -EINVAL; } - printk (KERN_INFO PREFIX "Host address width %d\n", - dmar->width + 1); + pr_info(PREFIX "Host address width %d\n", dmar->width + 1); entry_header = (struct acpi_dmar_header *)(dmar + 1); while (((unsigned long)entry_header) < (((unsigned long)dmar) + dmar_tbl->length)) { /* Avoid looping forever on bad ACPI tables */ if (entry_header->length == 0) { - printk(KERN_WARNING PREFIX - "Invalid 0-length structure\n"); + pr_warn(PREFIX "Invalid 0-length structure\n"); ret = -EINVAL; break; } @@ -369,8 +362,7 @@ parse_dmar_table(void) #endif break; default: - printk(KERN_WARNING PREFIX - "Unknown DMAR structure type %d\n", + pr_warn(PREFIX "Unknown DMAR structure type %d\n", entry_header->type); ret = 0; /* for forward compatibility */ break; @@ -469,12 +461,12 @@ int __init dmar_table_init(void) ret = parse_dmar_table(); if (ret) { if (ret != -ENODEV) - printk(KERN_INFO PREFIX "parse DMAR table failure.\n"); + pr_info(PREFIX "parse DMAR table failure.\n"); return ret; } if (list_empty(&dmar_drhd_units)) { - printk(KERN_INFO PREFIX "No DMAR devices found\n"); + pr_info(PREFIX "No DMAR devices found\n"); return -ENODEV; } @@ -506,8 +498,7 @@ int __init check_zero_address(void) (((unsigned long)dmar) + dmar_tbl->length)) { /* Avoid looping forever on bad ACPI tables */ if (entry_header->length == 0) { - printk(KERN_WARNING PREFIX - "Invalid 0-length structure\n"); + pr_warn(PREFIX "Invalid 0-length structure\n"); return 0; } @@ -558,8 +549,8 @@ int __init detect_intel_iommu(void) if (ret && irq_remapping_enabled && cpu_has_x2apic && dmar->flags & 0x1) - printk(KERN_INFO - "Queued invalidation will be enabled to support x2apic and Intr-remapping.\n"); + pr_info("Queued invalidation will be enabled to " + "support x2apic and Intr-remapping.\n"); if (ret && !no_iommu && !iommu_detected && !dmar_disabled) { iommu_detected = 1; @@ -602,7 +593,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); if (!iommu->reg) { - printk(KERN_ERR "IOMMU: can't map the region\n"); + pr_err("IOMMU: can't map the region\n"); goto error; } iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); @@ -615,15 +606,13 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) agaw = iommu_calculate_agaw(iommu); if (agaw < 0) { - printk(KERN_ERR - "Cannot get a valid agaw for iommu (seq_id = %d)\n", - iommu->seq_id); + pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n", + iommu->seq_id); goto err_unmap; } msagaw = iommu_calculate_max_sagaw(iommu); if (msagaw < 0) { - printk(KERN_ERR - "Cannot get a valid max agaw for iommu (seq_id = %d)\n", + pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n", iommu->seq_id); goto err_unmap; } @@ -640,7 +629,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iounmap(iommu->reg); iommu->reg = ioremap(drhd->reg_base_addr, map_size); if (!iommu->reg) { - printk(KERN_ERR "IOMMU: can't map the region\n"); + pr_err("IOMMU: can't map the region\n"); goto error; } } @@ -710,7 +699,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) if (fault & DMA_FSTS_IQE) { head = readl(iommu->reg + DMAR_IQH_REG); if ((head >> DMAR_IQ_SHIFT) == index) { - printk(KERN_ERR "VT-d detected invalid descriptor: 
" + pr_err("VT-d detected invalid descriptor: " "low=%llx, high=%llx\n", (unsigned long long)qi->desc[index].low, (unsigned long long)qi->desc[index].high); @@ -1129,15 +1118,14 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); if (fault_type == INTR_REMAP) - printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] " + pr_err("INTR-REMAP: Request device [[%02x:%02x.%d] " "fault index %llx\n" "INTR-REMAP:[fault reason %02d] %s\n", (source_id >> 8), PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr >> 48, fault_reason, reason); else - printk(KERN_ERR - "DMAR:[%s] Request device [%02x:%02x.%d] " + pr_err("DMAR:[%s] Request device [%02x:%02x.%d] " "fault addr %llx \n" "DMAR:[fault reason %02d] %s\n", (type ? "DMA Read" : "DMA Write"), @@ -1157,8 +1145,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) raw_spin_lock_irqsave(&iommu->register_lock, flag); fault_status = readl(iommu->reg + DMAR_FSTS_REG); if (fault_status) - printk(KERN_ERR "DRHD: handling fault status reg %x\n", - fault_status); + pr_err("DRHD: handling fault status reg %x\n", fault_status); /* TBD: ignore advanced fault log currently */ if (!(fault_status & DMA_FSTS_PPF)) @@ -1224,7 +1211,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) irq = create_irq(); if (!irq) { - printk(KERN_ERR "IOMMU: no free vectors\n"); + pr_err("IOMMU: no free vectors\n"); return -EINVAL; } @@ -1241,7 +1228,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu) ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); if (ret) - printk(KERN_ERR "IOMMU: can't request irq\n"); + pr_err("IOMMU: can't request irq\n"); return ret; } @@ -1258,7 +1245,7 @@ int __init enable_drhd_fault_handling(void) ret = dmar_set_interrupt(iommu); if (ret) { - printk(KERN_ERR "DRHD %Lx: failed to enable fault, " + pr_err("DRHD %Lx: failed to enable fault, " " interrupt, ret %d\n", (unsigned long long)drhd->reg_base_addr, ret); return -1; From 6f5cf52114dd87f9ed091678f7dfc8ff21bbe2b3 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Mon, 4 Jun 2012 17:29:02 -0400 Subject: [PATCH 054/612] iommu/dmar: Reserve mmio space used by the IOMMU, if the BIOS forgets to Intel-iommu initialization doesn't currently reserve the memory used for the IOMMU registers. This can allow the pci resource allocator to assign a device BAR to the same address as the IOMMU registers. This can cause some not so nice side affects when the driver ioremap's that region. Introduced two helper functions to map & unmap the IOMMU registers as well as simplify the init and exit paths. 
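[ Note: illustrative sketch, not part of the commit. The reserve-then-map
  pattern introduced here (see map_iommu() in the diff below) pairs
  request_mem_region() with ioremap() so the PCI resource allocator can no
  longer hand the same range to a device BAR:

	if (!request_mem_region(phys, size, name))	/* claim range in /proc/iomem */
		return -EBUSY;
	regs = ioremap(phys, size);			/* map it for CPU access */
	if (!regs) {
		release_mem_region(phys, size);
		return -ENOMEM;
	}

  Teardown must mirror this, which is what the new unmap_iommu() helper
  does: iounmap() followed by release_mem_region(). ]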
Signed-off-by: Donald Dutile Acked-by: Chris Wright Cc: iommu@lists.linux-foundation.org Cc: suresh.b.siddha@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1338845342-12464-3-git-send-email-ddutile@redhat.com Signed-off-by: Ingo Molnar --- drivers/iommu/dmar.c | 111 +++++++++++++++++++++++++++--------- include/linux/intel-iommu.h | 2 + 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 1e5a10de3471..9ab6ebf46f7a 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -570,14 +570,89 @@ int __init detect_intel_iommu(void) } +static void unmap_iommu(struct intel_iommu *iommu) +{ + iounmap(iommu->reg); + release_mem_region(iommu->reg_phys, iommu->reg_size); +} + +/** + * map_iommu: map the iommu's registers + * @iommu: the iommu to map + * @phys_addr: the physical address of the base resgister + * + * Memory map the iommu's registers. Start w/ a single page, and + * possibly expand if that turns out to be insufficent. + */ +static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) +{ + int map_size, err=0; + + iommu->reg_phys = phys_addr; + iommu->reg_size = VTD_PAGE_SIZE; + + if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) { + pr_err("IOMMU: can't reserve memory\n"); + err = -EBUSY; + goto out; + } + + iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size); + if (!iommu->reg) { + pr_err("IOMMU: can't map the region\n"); + err = -ENOMEM; + goto release; + } + + iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + + if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { + err = -EINVAL; + warn_invalid_dmar(phys_addr, " returns all ones"); + goto unmap; + } + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), + cap_max_fault_reg_offset(iommu->cap)); + map_size = VTD_PAGE_ALIGN(map_size); + if (map_size > iommu->reg_size) { + iounmap(iommu->reg); + release_mem_region(iommu->reg_phys, iommu->reg_size); + iommu->reg_size = map_size; + if (!request_mem_region(iommu->reg_phys, iommu->reg_size, + iommu->name)) { + pr_err("IOMMU: can't reserve memory\n"); + err = -EBUSY; + goto out; + } + iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size); + if (!iommu->reg) { + pr_err("IOMMU: can't map the region\n"); + err = -ENOMEM; + goto release; + } + } + err = 0; + goto out; + +unmap: + iounmap(iommu->reg); +release: + release_mem_region(iommu->reg_phys, iommu->reg_size); +out: + return err; +} + int alloc_iommu(struct dmar_drhd_unit *drhd) { struct intel_iommu *iommu; - int map_size; u32 ver; static int iommu_allocated = 0; int agaw = 0; int msagaw = 0; + int err; if (!drhd->reg_base_addr) { warn_invalid_dmar(0, ""); @@ -591,19 +666,13 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->seq_id = iommu_allocated++; sprintf (iommu->name, "dmar%d", iommu->seq_id); - iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); - if (!iommu->reg) { - pr_err("IOMMU: can't map the region\n"); + err = map_iommu(iommu, drhd->reg_base_addr); + if (err) { + pr_err("IOMMU: failed to map %s\n", iommu->name); goto error; } - iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); - iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); - - if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { - warn_invalid_dmar(drhd->reg_base_addr, " returns all ones"); - goto err_unmap; - } + err = -EINVAL; agaw = iommu_calculate_agaw(iommu); if (agaw < 0) { pr_err("Cannot get a valid agaw 
for iommu (seq_id = %d)\n", @@ -621,19 +690,6 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->node = -1; - /* the registers might be more than one page */ - map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), - cap_max_fault_reg_offset(iommu->cap)); - map_size = VTD_PAGE_ALIGN(map_size); - if (map_size > VTD_PAGE_SIZE) { - iounmap(iommu->reg); - iommu->reg = ioremap(drhd->reg_base_addr, map_size); - if (!iommu->reg) { - pr_err("IOMMU: can't map the region\n"); - goto error; - } - } - ver = readl(iommu->reg + DMAR_VER_REG); pr_info("IOMMU %d: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", iommu->seq_id, @@ -648,10 +704,10 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) return 0; err_unmap: - iounmap(iommu->reg); + unmap_iommu(iommu); error: kfree(iommu); - return -1; + return err; } void free_iommu(struct intel_iommu *iommu) @@ -662,7 +718,8 @@ void free_iommu(struct intel_iommu *iommu) free_dmar_iommu(iommu); if (iommu->reg) - iounmap(iommu->reg); + unmap_iommu(iommu); + kfree(iommu); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e6ca56de9936..78e2ada50cd5 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -308,6 +308,8 @@ enum { struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ + u64 reg_phys; /* physical address of hw register set */ + u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ From 7eb9ba5ed312ec6ed9d22259c5da1acb7cf4bd29 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 8 Jun 2012 15:02:57 +0530 Subject: [PATCH 055/612] uprobes: Pass probed vaddr to arch_uprobe_analyze_insn() On RISC architectures like powerpc, instructions are fixed size. Instruction analysis on such platforms is just a matter of (insn % 4). Pass the vaddr at which the uprobe is to be inserted so that arch_uprobe_analyze_insn() can flag misaligned registration requests. Signed-off-by: Ananth N Mavinakaynahalli Cc: michael@ellerman.id.au Cc: antonb@thinktux.localdomain Cc: Paul Mackerras Cc: benh@kernel.crashing.org Cc: peterz@infradead.org Cc: Srikar Dronamraju Cc: Jim Keniston Cc: oleg@redhat.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20120608093257.GG13409@in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 3 ++- kernel/events/uprobes.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. 
* @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c5e043cd309..b52376d02332 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -706,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) return ret; From c7d65a78fc18ed70353baeb7497ec71a7c775ac5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 11:03:00 -0400 Subject: [PATCH 056/612] x86: Remove cmpxchg from i386 NMI nesting code I've been informed by someone on LWN called 'slashdot' that some i386 machines do not support a true cmpxchg. The cmpxchg used by the i386 NMI nesting code must be a true cmpxchg as disabling interrupts will not work for NMIs (which is the work around for i386s that do not have a true cmpxchg). This 'slashdot' character also suggested a fix to the issue. As the state of the nesting NMIs goes as follows: NOT_RUNNING -> EXECUTING EXECUTING -> NOT_RUNNING EXECUTING -> LATCHED LATCHED -> EXECUTING Having these states as enum values of: NOT_RUNNING = 0 EXECUTING = 1 LATCHED = 2 Instead of a cmpxchg to make EXECUTING -> NOT_RUNNING a dec_and_test() would work as well. If the dec_and_test brings the state to NOT_RUNNING, that is the same as a cmpxchg succeeding to change EXECUTING to NOT_RUNNING. If a nested NMI were to come in and change it to LATCHED, the dec_and_test() would convert the state to EXECUTING (what we want it to be in such a case anyway). I asked 'slashdot' to post this as a patch, but it never came to be. I decided to do the work instead. Thanks to H. Peter Anvin for suggesting to use this_cpu_dec_and_return() instead of local_dec_and_test(&__get_cpu_var()). Link: http://lwn.net/Articles/484932/ Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84457be..a15a88800661 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) #ifdef CONFIG_X86_32 /* * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in: * * 1) not running * 2) executing @@ -383,13 +384,20 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * If an NMI hits a breakpoint that executes an iret, another * NMI can preempt it. We do not want to allow this new NMI * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. 
+ * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. */ enum nmi_states { - NMI_NOT_RUNNING, + NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED, }; @@ -397,18 +405,17 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state); #define nmi_nesting_preprocess(regs) \ do { \ - if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ - __get_cpu_var(nmi_state) = NMI_LATCHED; \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ return; \ } \ - nmi_restart: \ - __get_cpu_var(nmi_state) = NMI_EXECUTING; \ - } while (0) + this_cpu_write(nmi_state, NMI_EXECUTING); \ + } while (0); \ + nmi_restart: #define nmi_nesting_postprocess() \ do { \ - if (cmpxchg(&__get_cpu_var(nmi_state), \ - NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) #else /* x86_64 */ From 70fb74a5420f9caa3e001d65004e4b669124283e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 11:54:37 -0400 Subject: [PATCH 057/612] x86: Save cr2 in NMI in case NMIs take a page fault (for i386) Avi Kivity reported that page faults in NMIs could cause havic if the NMI preempted another page fault handler: The recent changes to NMI allow exceptions to take place in NMI handlers, but I think that a #PF (say, due to access to vmalloc space) is still problematic. Consider the sequence #PF (cr2 set by processor) NMI ... #PF (cr2 clobbered) do_page_fault() IRET ... IRET do_page_fault() address = read_cr2() The last line reads the overwritten cr2 value. This is the i386 version, which has the luxury of doing the work in C code. Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com Reported-by: Avi Kivity Cc: Linus Torvalds Cc: H. Peter Anvin Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a15a88800661..f84f5c57de35 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -395,6 +395,14 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * thus there is no race between the first check of state for NOT_RUNNING * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. 
*/ enum nmi_states { NMI_NOT_RUNNING = 0, @@ -402,6 +410,7 @@ enum nmi_states { NMI_LATCHED, }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); #define nmi_nesting_preprocess(regs) \ do { \ @@ -410,11 +419,14 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state); return; \ } \ this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ } while (0); \ nmi_restart: #define nmi_nesting_postprocess() \ do { \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) From e9071b0be5e7ce4903b7f7c370769d485774d3e3 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Fri, 8 Jun 2012 17:13:11 -0400 Subject: [PATCH 058/612] iommu/dmar: Use pr_format() instead of PREFIX to tidy up pr_*() calls Joe Perches recommended getting rid of the redundant formatting of adding "PREFIX" to all the uses of pr_*() calls. The recommendation helps to reduce source and improve readibility. While cleaning up the PREFIX's, I saw that one of the pr_warn() was redundant in dmar_parse_one_dev_scope(), since the same message was printed after breaking out of the while loop for the same condition, !pdev. So, to avoid a duplicate message, I removed the one in the while loop. Reported-by: Joe Perches Signed-off-by: Donald Dutile Cc: iommu@lists.linux-foundation.org Cc: chrisw@redhat.com Cc: suresh.b.siddha@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/1339189991-13129-1-git-send-email-ddutile@redhat.com [ Small whitespace fixes. ] Signed-off-by: Ingo Molnar --- drivers/iommu/dmar.c | 54 ++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9ab6ebf46f7a..86e2f4a62b9a 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -26,6 +26,8 @@ * These routines are used by both DMA-remapping and Interrupt-remapping */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* has to precede printk.h */ + #include #include #include @@ -39,8 +41,6 @@ #include #include -#define PREFIX "DMAR: " - /* No locks are needed as DMA remapping hardware unit * list is constructed at boot time and hotplug of * these units are not supported by the architecture. 
@@ -83,15 +83,12 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, * ignore it */ if (!bus) { - pr_warn(PREFIX "Device scope bus [%d] not found\n", - scope->bus); + pr_warn("Device scope bus [%d] not found\n", scope->bus); break; } pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn)); if (!pdev) { - pr_warn(PREFIX "Device scope device" - "[%04x:%02x:%02x.%02x] not found\n", - segment, bus->number, path->dev, path->fn); + /* warning will be printed below */ break; } path ++; @@ -99,8 +96,7 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, bus = pdev->subordinate; } if (!pdev) { - pr_warn(PREFIX - "Device scope device [%04x:%02x:%02x.%02x] not found\n", + pr_warn("Device scope device [%04x:%02x:%02x.%02x] not found\n", segment, scope->bus, path->dev, path->fn); *dev = NULL; return 0; @@ -109,8 +105,8 @@ static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, pdev->subordinate) || (scope->entry_type == \ ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) { pci_dev_put(pdev); - pr_warn(PREFIX "Device scope type does not match for %s\n", - pci_name(pdev)); + pr_warn("Device scope type does not match for %s\n", + pci_name(pdev)); return -EINVAL; } *dev = pdev; @@ -132,7 +128,7 @@ int __init dmar_parse_dev_scope(void *start, void *end, int *cnt, scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) (*cnt)++; else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) { - pr_warn(PREFIX "Unsupported device scope\n"); + pr_warn("Unsupported device scope\n"); } start += scope->length; } @@ -258,23 +254,23 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) case ACPI_DMAR_TYPE_HARDWARE_UNIT: drhd = container_of(header, struct acpi_dmar_hardware_unit, header); - pr_info(PREFIX "DRHD base: %#016Lx flags: %#x\n", + pr_info("DRHD base: %#016Lx flags: %#x\n", (unsigned long long)drhd->address, drhd->flags); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: rmrr = container_of(header, struct acpi_dmar_reserved_memory, header); - pr_info(PREFIX "RMRR base: %#016Lx end: %#016Lx\n", + pr_info("RMRR base: %#016Lx end: %#016Lx\n", (unsigned long long)rmrr->base_address, (unsigned long long)rmrr->end_address); break; case ACPI_DMAR_TYPE_ATSR: atsr = container_of(header, struct acpi_dmar_atsr, header); - pr_info(PREFIX "ATSR flags: %#x\n", atsr->flags); + pr_info("ATSR flags: %#x\n", atsr->flags); break; case ACPI_DMAR_HARDWARE_AFFINITY: rhsa = container_of(header, struct acpi_dmar_rhsa, header); - pr_info(PREFIX "RHSA base: %#016Lx proximity domain: %#x\n", + pr_info("RHSA base: %#016Lx proximity domain: %#x\n", (unsigned long long)rhsa->base_address, rhsa->proximity_domain); break; @@ -294,7 +290,7 @@ static int __init dmar_table_detect(void) &dmar_tbl_size); if (ACPI_SUCCESS(status) && !dmar_tbl) { - pr_warn(PREFIX "Unable to map DMAR\n"); + pr_warn("Unable to map DMAR\n"); status = AE_NOT_FOUND; } @@ -328,18 +324,18 @@ parse_dmar_table(void) return -ENODEV; if (dmar->width < PAGE_SHIFT - 1) { - pr_warn(PREFIX "Invalid DMAR haw\n"); + pr_warn("Invalid DMAR haw\n"); return -EINVAL; } - pr_info(PREFIX "Host address width %d\n", dmar->width + 1); + pr_info("Host address width %d\n", dmar->width + 1); entry_header = (struct acpi_dmar_header *)(dmar + 1); while (((unsigned long)entry_header) < (((unsigned long)dmar) + dmar_tbl->length)) { /* Avoid looping forever on bad ACPI tables */ if (entry_header->length == 0) { - pr_warn(PREFIX "Invalid 0-length structure\n"); + pr_warn("Invalid 0-length structure\n"); ret = 
-EINVAL; break; } @@ -362,7 +358,7 @@ parse_dmar_table(void) #endif break; default: - pr_warn(PREFIX "Unknown DMAR structure type %d\n", + pr_warn("Unknown DMAR structure type %d\n", entry_header->type); ret = 0; /* for forward compatibility */ break; @@ -461,12 +457,12 @@ int __init dmar_table_init(void) ret = parse_dmar_table(); if (ret) { if (ret != -ENODEV) - pr_info(PREFIX "parse DMAR table failure.\n"); + pr_info("parse DMAR table failure.\n"); return ret; } if (list_empty(&dmar_drhd_units)) { - pr_info(PREFIX "No DMAR devices found\n"); + pr_info("No DMAR devices found\n"); return -ENODEV; } @@ -498,7 +494,7 @@ int __init check_zero_address(void) (((unsigned long)dmar) + dmar_tbl->length)) { /* Avoid looping forever on bad ACPI tables */ if (entry_header->length == 0) { - pr_warn(PREFIX "Invalid 0-length structure\n"); + pr_warn("Invalid 0-length structure\n"); return 0; } @@ -549,8 +545,7 @@ int __init detect_intel_iommu(void) if (ret && irq_remapping_enabled && cpu_has_x2apic && dmar->flags & 0x1) - pr_info("Queued invalidation will be enabled to " - "support x2apic and Intr-remapping.\n"); + pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n"); if (ret && !no_iommu && !iommu_detected && !dmar_disabled) { iommu_detected = 1; @@ -580,9 +575,9 @@ static void unmap_iommu(struct intel_iommu *iommu) * map_iommu: map the iommu's registers * @iommu: the iommu to map * @phys_addr: the physical address of the base resgister - * + * * Memory map the iommu's registers. Start w/ a single page, and - * possibly expand if that turns out to be insufficent. + * possibly expand if that turns out to be insufficent. */ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) { @@ -1302,8 +1297,7 @@ int __init enable_drhd_fault_handling(void) ret = dmar_set_interrupt(iommu); if (ret) { - pr_err("DRHD %Lx: failed to enable fault, " - " interrupt, ret %d\n", + pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", (unsigned long long)drhd->reg_base_addr, ret); return -1; } From e2b297fcf17fc03734e93387fb8195c782286b35 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 10 Jun 2012 21:13:41 -0600 Subject: [PATCH 059/612] perf/x86: Convert obsolete simple_strtoul() usage to kstrtoul() Signed-off-by: Shuah Khan Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1339384421.3025.8.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 000a4746c7ce..766c76d5ec4c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1640,7 +1640,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val = simple_strtoul(buf, NULL, 0); + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; From 110c1e1f1bf61e5dca53ff5c9dc75243ce87c002 Mon Sep 17 00:00:00 2001 From: Ravikiran Thirumalai Date: Sun, 3 Jun 2012 01:11:35 +0300 Subject: [PATCH 060/612] x86/vsmp: Ignore IOAPIC IRQ affinity if possible vSMP can route interrupts more optimally based on internal knowledge the OS does not have. In order to support this optimization, all CPUs must be able to handle all possible IOAPIC interrupts. 
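[ Note: background sketch, assuming the standard apic driver hooks of this
  kernel version. When a vector is allocated for an IRQ, the driver's

	apic->vector_allocation_domain(cpu, retmask);

  callback reports which CPUs must have that vector installed in their
  per-cpu vector tables. The vSMP change below answers "all of them" via
  cpumask_setall(retmask), so the interrupt can be routed to any CPU. ]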
Fix this by setting the vector allocation domain for all CPUs and by enabling this feature in vSMP. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim [ Rebased, simplified, and reworded the commit message. ] Signed-off-by: Ido Yariv Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 59eea855f451..6b96a7374f94 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,15 @@ static void __init set_vsmp_pv_ops(void) ctl = readl(address + 4); printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); + no_irq_affinity = 1; + } +#endif + if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); @@ -102,12 +112,11 @@ static void __init set_vsmp_pv_ops(void) pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); pv_init_ops.patch = vsmp_patch; - ctl &= ~(1 << 4); - writel(ctl, address + 4); - ctl = readl(address + 4); - printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); early_iounmap(address, 8); } @@ -192,10 +201,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) return hard_smp_processor_id() >> index_msb; } +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. + */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_setall(retmask); +} + static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; + apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) From b871a42b6091b720e82ddff237659534c525c25b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Jun 2012 15:07:08 +0200 Subject: [PATCH 061/612] smpboot: Remove leftover declaration Signed-off-by: Thomas Gleixner --- kernel/smpboot.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -3,8 +3,6 @@ struct task_struct; -int smpboot_prepare(unsigned int cpu); - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD struct task_struct *idle_thread_get(unsigned int cpu); void idle_thread_set_boot_cpu(void); From 19f5f7364a1cc770b14692f609bb9b802fc334d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Jul 2011 17:29:28 +0200 Subject: [PATCH 062/612] nohz: Separate idle sleeping time accounting from nohz logic As we plan to be able to stop the tick outside the idle task, we need to prepare for separating nohz logic from idle. As a start, this pulls the idle sleeping time accounting out of the tick stop/restart API to the callers on idle entry/exit. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 77 ++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6db496c..81409bba2425 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,10 +271,10 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; + ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; @@ -282,8 +282,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - now = tick_nohz_start_idle(cpu, ts); - /* * If this cpu is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by @@ -444,6 +442,14 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now; + + now = tick_nohz_start_idle(smp_processor_id(), ts); + tick_nohz_stop_sched_tick(ts, now); +} + /** * tick_nohz_idle_enter - stop the idle tick from the idle task * @@ -479,7 +485,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). */ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -499,7 +505,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -540,39 +546,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } -/** - * tick_nohz_idle_exit - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. - */ -void tick_nohz_idle_exit(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); #ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; #endif - ktime_t now; - - local_irq_disable(); - - WARN_ON_ONCE(!ts->inidle); - - ts->inidle = 0; - - if (ts->idle_active || ts->tick_stopped) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->tick_stopped) { - local_irq_enable(); - return; - } - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); @@ -600,6 +578,35 @@ void tick_nohz_idle_exit(void) ts->idle_exittime = now; tick_nohz_restart(ts, now); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. 
+ */ +void tick_nohz_idle_exit(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + + if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); local_irq_enable(); } From 2ac0d98fd624ae50f5e6ae9c800977a9dbbfcde6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jul 2011 04:00:47 +0200 Subject: [PATCH 063/612] nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81409bba2425..911834b33b8a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -402,7 +402,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } ts->idle_sleeps++; @@ -445,9 +444,13 @@ out: static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; + int was_stopped = ts->tick_stopped; now = tick_nohz_start_idle(smp_processor_id(), ts); tick_nohz_stop_sched_tick(ts, now); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; } /** @@ -548,15 +551,25 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ #ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; /* * We stopped the tick in idle. 
Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -569,15 +582,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); } /** @@ -605,8 +609,10 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (ts->tick_stopped) + if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } local_irq_enable(); } @@ -811,7 +817,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); From f5d411c91ede162240f34e05a233f2759412988e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 31 Jul 2011 17:44:12 +0200 Subject: [PATCH 064/612] nohz: Rename ts->idle_tick to ts->last_tick Now that idle and nohz logics are going to be independant each others, ts->idle_tick becomes too much a biased name to describe the field that saves the last scheduled tick on top of which we re-calculate the next tick to schedule when the timer is restarted. We want to reuse this even to stop the tick outside idle cases. So let's rename it to some more generic name: ts->last_tick. This changes a bit the timer list stat export so we need to increase its version. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- include/linux/tick.h | 8 ++++---- kernel/time/tick-sched.c | 4 ++-- kernel/time/timer_list.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index ab8be90b5cc9..f37fceb69b73 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -31,10 +31,10 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode - * @idle_tick: Store the last idle tick expiry time when the tick - * timer is modified for idle sleeps. This is necessary + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline - * when the CPU returns from idle + * when the CPU returns from nohz sleep. 
* @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -51,7 +51,7 @@ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; - ktime_t idle_tick; + ktime_t last_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 911834b33b8a..73cc4901336d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -400,7 +400,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) if (!ts->tick_stopped) { select_nohz_load_balancer(1); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; } @@ -526,7 +526,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); From 5b39939a40801f0c17e31adaf643d6e974227856 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 00:06:10 +0200 Subject: [PATCH 065/612] nohz: Move ts->idle_calls incrementation into strict idle logic Since we want to prepare for making the nohz API to work further the idle case, we need to pull ts->idle_calls incrementation up to the callers in idle. To perform this, we split tick_nohz_stop_sched_tick() in two parts: a first one that checks if we can really stop the tick for idle, and another that actually stops it. Then from the callers in idle, we check if we can stop the tick and only then we increment idle_calls and finally relay to the nohz API that won't care about these details anymore. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 86 ++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73cc4901336d..430e1b6901cc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,47 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - - if (need_resched()) - return; - - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -441,16 +409,56 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; - int was_stopped = ts->tick_stopped; + int cpu = smp_processor_id(); - now = tick_nohz_start_idle(smp_processor_id(), ts); - tick_nohz_stop_sched_tick(ts, now); + now = tick_nohz_start_idle(cpu, ts); - if (!was_stopped && ts->tick_stopped) - ts->idle_jiffies = ts->last_jiffies; + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + tick_nohz_stop_sched_tick(ts, now, cpu); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** From 84bf1bccc60cc64376125ea2eae05e4ba12f795b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 01:25:38 +0200 Subject: [PATCH 066/612] nohz: Move next idle expiry time record into idle logic area The next idle expiry time record and idle sleeps tracking are statistics that only concern idle. Since we want the nohz APIs to become usable beyond the idle context, let's pull up the handling of these statistics to the callers in idle. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 430e1b6901cc..60c9c60e9108 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,11 +271,11 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires; + ktime_t last_update, expires, ret = { .tv64 = 0 }; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -358,6 +358,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -372,11 +374,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->tick_stopped = 1; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer.
@@ -407,6 +404,8 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) @@ -445,7 +444,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) { - ktime_t now; + ktime_t now, expires; int cpu = smp_processor_id(); now = tick_nohz_start_idle(cpu, ts); @@ -454,7 +453,12 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) int was_stopped = ts->tick_stopped; ts->idle_calls++; - tick_nohz_stop_sched_tick(ts, now, cpu); + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } if (!was_stopped && ts->tick_stopped) ts->idle_jiffies = ts->last_jiffies; From ed88bed881c9948c4035828c5d63f60c7b015f86 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Jun 2012 19:26:33 +0300 Subject: [PATCH 067/612] x86/apic/irq_remap: Silence a bogus pr_err() There is an extra semicolon here, so the pr_err() message is printed when it is not intended. Signed-off-by: Dan Carpenter Acked-by: Yinghai Lu Cc: Alexander Gordeev Cc: Suresh Siddha Cc: Joerg Roedel Link: http://lkml.kernel.org/r/20120612162633.GA11077@elgon.mountain Signed-off-by: Ingo Molnar --- drivers/iommu/intel_irq_remapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index dafbad06390a..853902a1b7db 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -938,7 +938,7 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)); + if (assign_irq_vector(irq, cfg, data->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } From 2f74759056797054122cdc70844137f70bb3f626 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 7 Jun 2012 22:20:18 +0900 Subject: [PATCH 068/612] x86/alternatives: Use atomic_xchg() instead of atomic_dec_and_test() for stop_machine_text_poke() stop_machine_text_poke() uses atomic_dec_and_test() to select one of the CPUs executing that function to actually modify the code. Since the variable is initialized to 1, subsequent CPUs will make the variable go negative. Since going negative is uncommon/unexpected in typical dec_and_test usage, change this user to atomic_xchg(). This was found using a patch that warns on dec_and_test going negative.
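For illustration, a minimal sketch of the "first CPU wins" pattern this switches to; this is not the patch's code verbatim, and the wrapper and the work in the branches are hypothetical stand-ins:

	static atomic_t stop_machine_first = ATOMIC_INIT(1);

	static void poke_text_on_this_cpu(void)	/* hypothetical wrapper */
	{
		/*
		 * atomic_xchg() returns the previous value, so exactly one
		 * CPU observes 1 and does the patching; every later CPU
		 * sees 0, and the variable never goes below zero.
		 */
		if (atomic_xchg(&stop_machine_first, 0)) {
			/* ... this CPU patches the code ... */
		} else {
			/* ... the other CPUs wait for the patcher ... */
		}
	}

The design point is that xchg expresses "claim the one-shot slot" directly, whereas dec_and_test only works cleanly when the counter counts down to zero once.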
Signed-off-by: OGAWA Hirofumi Acked-by: Steven Rostedt [ Rewrote changelog ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/87zk8fgsx9.fsf@devron.myhome.or.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f0759..53231a045d3d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -664,7 +664,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); From cac4afbc3da58d9e5701b34bd4c1f11ea13328d4 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 12:39:34 +0200 Subject: [PATCH 069/612] x86/x2apic/cluster: Vector_allocation_domain() should return a value Since commit 8637e38 ("x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector()") the vector_allocation_domain() operation indicates whether a cpumask is dynamic or static. This update fixes the oversight and makes the operation return a value. Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614103933.GJ3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 23a46cf5b6fd..1885a73b7f33 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -228,10 +228,11 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); + return true; } static struct apic apic_x2apic_cluster = { From a5a391561bc25898ba1a702a0c4b028aa5b11ce9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:49:35 +0200 Subject: [PATCH 070/612] x86/apic: Eliminate cpu_mask_to_apicid() operation Since cpu_mask_to_apicid() is called from only two locations, remove the operation and use only cpu_mask_to_apicid_and() instead.
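As a sketch of why the conversion at those call sites loses nothing (the mask variable here is a placeholder): and'ing a cpumask with itself yields the same mask, so passing it twice to the _and variant reproduces the removed single-mask lookup:

	unsigned int dest;
	int err;

	/* before: err = apic->cpu_mask_to_apicid(mask, &dest); */
	err = apic->cpu_mask_to_apicid_and(mask, mask, &dest);
	if (err)
		dest = BAD_APICID;	/* same fallback the io_apic caller uses */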
Signed-off-by: Alexander Gordeev Suggested-and-acked-by: Suresh Siddha Acked-by: Yinghai Lu Link: http://lkml.kernel.org/r/20120614074935.GE3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 33 +++++++-------------------- arch/x86/kernel/apic/apic.c | 24 +++++-------------- arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 1 - arch/x86/kernel/apic/apic_numachip.c | 1 - arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/es7000_32.c | 4 +--- arch/x86/kernel/apic/io_apic.c | 3 ++- arch/x86/kernel/apic/numaq_32.c | 8 ------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 17 -------------- arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 29 +++++------------------ arch/x86/platform/uv/uv_irq.c | 2 +- 15 files changed, 25 insertions(+), 105 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1ed3eead2039..eec240e12091 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -331,8 +331,6 @@ struct apic { unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; - int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, - unsigned int *apicid); int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid); @@ -594,9 +592,15 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif static inline int -__flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { - cpu_mask = cpu_mask & APIC_ALL_CPUS & cpumask_bits(cpu_online_mask)[0]; + unsigned long cpu_mask = cpumask_bits(cpumask)[0] & + cpumask_bits(andmask)[0] & + cpumask_bits(cpu_online_mask)[0] & + APIC_ALL_CPUS; + if (likely(cpu_mask)) { *apicid = (unsigned int)cpu_mask; return 0; @@ -605,27 +609,6 @@ __flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) } } -static inline int -flat_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid) -{ - return __flat_cpu_mask_to_apicid(cpumask_bits(cpumask)[0], apicid); -} - -static inline int -flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - unsigned long mask1 = cpumask_bits(cpumask)[0]; - unsigned long mask2 = cpumask_bits(andmask)[0]; - return __flat_cpu_mask_to_apicid(mask1 & mask2, apicid); -} - -extern int -default_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid); - extern int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7e9bbe73bc5a..048a4f806d46 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,23 +2123,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) -{ - if (likely((unsigned int)cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - return 0; - } else { - return -EINVAL; - } -} - -int default_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid) -{ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - return __default_cpu_to_apicid(cpu, apicid); -} - int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct 
cpumask *andmask, unsigned int *apicid) @@ -2151,7 +2134,12 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - return __default_cpu_to_apicid(cpu, apicid); + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } else { + return -EINVAL; + } } /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bddc92566d0a..00c77cf78e9e 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -191,7 +191,6 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, @@ -308,7 +307,6 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index ac9edf247b15..65c07fc630a1 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -159,7 +159,6 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c028132ad358..bc552cff2578 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -234,7 +234,6 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index df342fe4d6aa..d50e3640d5ae 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -188,7 +188,6 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index b35cfb9b6962..2c5317ea1b83 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -525,7 +525,7 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static int +static inline int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; @@ -643,7 +643,6 @@ static struct apic __refdata apic_es7000_cluster = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, @@ -710,7 +709,6 @@ static struct apic __refdata apic_es7000 = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0deb773404e5..0540f083f452 100644 --- a/arch/x86/kernel/apic/io_apic.c 
+++ b/arch/x86/kernel/apic/io_apic.c @@ -1492,7 +1492,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical delivery to get the timer IRQ * to the first CPU. */ - if (unlikely(apic->cpu_mask_to_apicid(apic->target_cpus(), &dest))) + if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), + apic->target_cpus(), &dest))) dest = BAD_APICID; entry.dest_mode = apic->irq_dest_mode; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 2b55514c328b..d661ee95cabf 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,13 +406,6 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static int -numaq_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - *apicid = 0x0F; - return 0; -} - static int numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -499,7 +492,6 @@ static struct apic __refdata apic_numaq = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, .send_IPI_mask = numaq_send_IPI_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 2c6f003b2e4b..eef6bcd1bf1e 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,6 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 79d360f6729e..bbad180f2890 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -263,7 +263,7 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static int +static inline int summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; @@ -516,7 +516,6 @@ static struct apic apic_summit = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, .send_IPI_mask = summit_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 1885a73b7f33..943d03fc6fc4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -96,22 +96,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static int -x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - int i; - - if (cpu >= nr_cpu_ids) - return -EINVAL; - - *apicid = 0; - for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) - *apicid |= per_cpu(x86_cpu_to_logical_apicid, i); - - return 0; -} - static int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -270,7 +254,6 @@ static struct apic apic_x2apic_cluster = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git 
a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f109388a0e80..e03a1e180e81 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -123,7 +123,6 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 307aa076bd62..026de0114d15 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -269,27 +269,6 @@ static void uv_init_apic_ldr(void) { } -static inline int __uv_cpu_to_apicid(int cpu, unsigned int *apicid) -{ - if (likely((unsigned int)cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - return 0; - } else { - return -EINVAL; - } -} - -static int -uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - return __uv_cpu_to_apicid(cpu, apicid); -} - static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -306,7 +285,12 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - return __uv_cpu_to_apicid(cpu, apicid); + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } else { + return -EINVAL; + } } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -384,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index dd1ff39a464c..a67c7a6bac7e 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -144,7 +144,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; - err = apic->cpu_mask_to_apicid(eligible_cpu, &dest); + err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); if (err != 0) return err; From ea3807ea52a53f2cdfd60c89d8491fc9a8208d1c Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:49:55 +0200 Subject: [PATCH 071/612] x86/apic: Fix ugly casting and branching in cpu_mask_to_apicid_and() Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614074954.GF3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++---- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 048a4f806d46..c421512ca5eb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2127,19 +2127,19 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid) { - int cpu; + unsigned int cpu; for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (likely((unsigned int)cpu < 
nr_cpu_ids)) { + if (likely(cpu < nr_cpu_ids)) { *apicid = per_cpu(x86_cpu_to_apicid, cpu); return 0; - } else { - return -EINVAL; } + + return -EINVAL; } /* diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 2c5317ea1b83..effece2ea0db 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -529,7 +529,7 @@ static inline int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, uninitialized_var(apicid); + unsigned int cpu, uninitialized_var(apicid); /* * The cpus in the mask must all be on the apic cluster. diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index bbad180f2890..b53fd6c9993a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -267,7 +267,7 @@ static inline int summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, apicid = 0; + unsigned int cpu, apicid = 0; /* * The cpus in the mask must all be on the apic cluster. diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 026de0114d15..8cfade9510a4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -274,7 +274,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid) { - int cpu; + int unsigned cpu; /* * We're using fixed IRQ delivery, can only return one phys APIC ID. @@ -285,12 +285,12 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (likely((unsigned int)cpu < nr_cpu_ids)) { + if (likely(cpu < nr_cpu_ids)) { *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; return 0; - } else { - return -EINVAL; } + + return -EINVAL; } static unsigned int x2apic_get_apic_id(unsigned long x) From 49ad3fd4834182cce9725abb98e080b479fed464 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:11 +0200 Subject: [PATCH 072/612] x86/apic/es7000+summit: Fix compile warning in cpu_mask_to_apicid() Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075010.GG3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index effece2ea0db..0c1347df3ad0 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -554,8 +554,8 @@ es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask, unsigned int *apicid) { - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) return 0; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index b53fd6c9993a..e6cc1829f7c8 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -291,8 +291,8 @@ summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask, unsigned int *apicid) { - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) return 0; From 214e270b5f5f6a85400a817d5305c797b2b7467a Mon Sep 17 00:00:00 2001 
From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:27 +0200 Subject: [PATCH 073/612] x86/apic/es7000+summit: Always make valid apicid from a cpumask In case of invalid parameters, cpu_mask_to_apicid_and() might return an apicid value of 0 (on Summit) or an uninitialized value (on ES7000), although it is supposed to return the apicid of cpu-0 at least. Fix the operation to always return a valid apicid. Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075026.GH3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/summit_32.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 0c1347df3ad0..9882093f26e8 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -545,6 +545,8 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) apicid = new_apicid; round++; } + if (!round) + return -EINVAL; *dest_id = apicid; return 0; } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e6cc1829f7c8..b6e61857c29f 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -282,6 +282,8 @@ summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) apicid |= new_apicid; round++; } + if (!round) + return -EINVAL; *dest_id = apicid; return 0; } From 5a0a2a308113086cc800a203d903271c9caa1611 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:44 +0200 Subject: [PATCH 074/612] x86/apic/es7000: Make apicid of a cluster (not CPU) from a cpumask cpu_mask_to_apicid_and() always returns the apicid of a single CPU, even when multiple CPUs were requested. This update fixes a typo and forces the apicid of a cluster to be returned. Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075043.GI3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9882093f26e8..0874799a98c6 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -542,7 +542,7 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) return -EINVAL; } - apicid = new_apicid; + apicid |= new_apicid; round++; } if (!round) From 8d240dd88cca33b704adf3fe281aa64b5aac2dd8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Mar 2012 19:11:40 +0200 Subject: [PATCH 075/612] ftrace: Remove a superfluous check register_ftrace_function() checks ftrace_disabled and calls __register_ftrace_function which does it again. Drop the first check and add the unlikely hint to the second one. Also, drop the label as John correctly noticed. No functional change.
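To make the shape of the cleanup concrete, here is a hedged sketch with simplified, hypothetical names (not the ftrace code itself): the inner helper keeps the single unlikely()-annotated check, and the outer API relies on it instead of testing the flag twice:

	static int __register_thing(struct thing_ops *ops)
	{
		if (unlikely(thing_disabled))	/* the one remaining check */
			return -ENODEV;
		/* ... actually add ops to the list ... */
		return 0;
	}

	int register_thing(struct thing_ops *ops)
	{
		int ret;

		mutex_lock(&thing_lock);
		ret = __register_thing(ops);	/* duplicate check dropped */
		mutex_unlock(&thing_lock);
		return ret;
	}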
Link: http://lkml.kernel.org/r/20120329171140.GE6409@aftab Cc: Borislav Petkov Cc: John Kacur Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); From 5da43bed800770906fca24deef7ae3d456823b86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 31 May 2012 21:28:58 -0400 Subject: [PATCH 076/612] tracing: Add comments for the other bits of ftrace_event_call.flags TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_RECORDED_CMD_BIT, Have comments about what they are, but: TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, do not, making them second class citizens. To prevent another class warfare, these bits have protested for their right to be commented. And By Golly! I'll give them what they want! Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 176a939d1547..1aff18346c71 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -207,6 +207,9 @@ struct ftrace_event_call { * bit 1: enabled * bit 2: filter_active * bit 3: enabled cmd record + * bit 4: allow trace by non root (cap any) + * bit 5: failed to apply filter + * bit 6: ftrace internal event (do not enable) * * Changes to flags must hold the event_mutex. * From 7374e82771c6d5a9af2080be46f64a5826c7efb1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 31 May 2012 21:40:05 -0400 Subject: [PATCH 077/612] tracing: Register the ftrace internal events during early boot All trace events, including ftrace internal events (like trace_printk and function tracing), register functions that describe how to print their output. The events may be recorded as soon as the ring buffer is allocated, but they are just raw binary in the buffer. The mapping of event ids to how to print them is held within a structure that is registered on system boot. If a crash happens in boot up before these functions are registered, then their output (via ftrace_dump_on_oops) will be useless: Dumping ftrace buffer: --------------------------------- <...>-1 0.... 319705us : Unknown type 6 --------------------------------- This can be quite frustrating for a kernel developer trying to see what is going wrong. There's no reason to register them so late in the boot up process. They can be registered by early_initcall().
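A minimal sketch of the mechanism, mirroring the one-line diff below; the registration body is elided, and only the initcall level changes so the handlers exist before most other boot code can oops:

	static __init int init_events(void)
	{
		/* register the "how to print this event id" handlers */
		return 0;
	}
	early_initcall(init_events);	/* was device_initcall(), too late for early oopses */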
Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); From d48daf37a3d2e2b28a61e615c0fc538301edb0dd Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Thu, 14 Jun 2012 18:43:08 +0300 Subject: [PATCH 078/612] x86/vsmp: Fix linker error when CONFIG_PROC_FS is not set set_vsmp_pv_ops() references no_irq_affinity which is undeclared if CONFIG_PROC_FS isn't set. Fix this by adding an #ifdef around this variable's access. Reported-by: Fengguang Wu Signed-off-by: Ido Yariv Acked-by: Shai Fultheim Link: http://lkml.kernel.org/r/1339688588-12674-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 6b96a7374f94..3f0285ac00fa 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -101,7 +101,10 @@ static void __init set_vsmp_pv_ops(void) #ifdef CONFIG_SMP if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; +#endif } #endif From 7eb9ae0799b1e9f0b77733b432bc5f6f055b020b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 14 Jun 2012 18:28:49 -0700 Subject: [PATCH 079/612] irq/apic: Use config_enabled(CONFIG_SMP) checks to clean up irq_set_affinity() for UP Move the ->irq_set_affinity() routines out of the #ifdef CONFIG_SMP sections and use config_enabled(CONFIG_SMP) checks inside those routines. Thus making those routines simple null stubs for !CONFIG_SMP and retaining those routines with no additional runtime overhead for CONFIG_SMP kernels. Cleans up the ifdef CONFIG_SMP in and around routines related to irq_set_affinity in io_apic and irq_remapping subsystems. Signed-off-by: Suresh Siddha Cc: torvalds@linux-foundation.org Cc: joerg.roedel@amd.com Cc: Sam Ravnborg Cc: Paul Gortmaker Link: http://lkml.kernel.org/r/1339723729.3475.63.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 180 +++++++++++++--------------- drivers/iommu/intel_irq_remapping.c | 7 +- drivers/iommu/irq_remapping.c | 5 +- drivers/iommu/irq_remapping.h | 2 - include/linux/irq.h | 2 - 5 files changed, 86 insertions(+), 110 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7cbd397884f5..a951ef7decb1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2224,81 +2224,6 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. 
- */ - if (!irq_remapped(cfg)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int irq = data->irq; - int err; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = __ioapic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, data->chip_data); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2386,6 +2311,87 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); @@ -2565,9 +2571,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_ack = ir_ack_apic_edge; chip->irq_eoi = ir_ack_apic_level; -#ifdef CONFIG_SMP chip->irq_set_affinity = set_remapped_irq_affinity; -#endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2578,9 +2582,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_unmask = unmask_ioapic_irq, .irq_ack = ack_apic_edge, .irq_eoi = ack_apic_level, -#ifdef CONFIG_SMP .irq_set_affinity = ioapic_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3099,7 +3101,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return err; } -#ifdef CONFIG_SMP static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3121,7 +3122,6 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, @@ -3132,9 +3132,7 @@ static struct irq_chip msi_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3219,7 +3217,6 @@ void native_teardown_msi_irq(unsigned int irq) } #ifdef CONFIG_DMAR_TABLE -#ifdef CONFIG_SMP static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -3244,16 +3241,12 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = dmar_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3274,7 +3267,6 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER -#ifdef CONFIG_SMP static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3297,16 +3289,12 @@ static int hpet_msi_set_affinity(struct irq_data *data, return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = hpet_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3341,8 +3329,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) */ #ifdef CONFIG_HT_IRQ -#ifdef CONFIG_SMP - static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; @@ -3370,16 +3356,12 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return IRQ_SET_MASK_OK_NOCOPY; } -#endif - static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = ht_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 853902a1b7db..e0b18f3ae9a8 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ 
-902,7 +902,6 @@ static int intel_setup_ioapic_entry(int irq, return 0; } -#ifdef CONFIG_SMP /* * Migrate the IO-APIC irq in the presence of intr-remapping. * @@ -926,6 +925,9 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irte irte; int err; + if (!config_enabled(CONFIG_SMP)) + return -EINVAL; + if (!cpumask_intersects(mask, cpu_online_mask)) return -EINVAL; @@ -963,7 +965,6 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, cpumask_copy(data->affinity, mask); return 0; } -#endif static void intel_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, @@ -1065,9 +1066,7 @@ struct irq_remap_ops intel_irq_remap_ops = { .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, .setup_ioapic_entry = intel_setup_ioapic_entry, -#ifdef CONFIG_SMP .set_affinity = intel_ioapic_set_affinity, -#endif .free_irq = free_irte, .compose_msi_msg = intel_compose_msi_msg, .msi_alloc_irq = intel_msi_alloc_irq, diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 40cda8e98d87..1d29b1c66e72 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -111,16 +111,15 @@ int setup_ioapic_remapped_entry(int irq, vector, attr); } -#ifdef CONFIG_SMP int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - if (!remap_ops || !remap_ops->set_affinity) + if (!config_enabled(CONFIG_SMP) || !remap_ops || + !remap_ops->set_affinity) return 0; return remap_ops->set_affinity(data, mask, force); } -#endif void free_remapped_irq(int irq) { diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index be9d72950c51..b12974cc1dfe 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -59,11 +59,9 @@ struct irq_remap_ops { unsigned int, int, struct io_apic_irq_attr *); -#ifdef CONFIG_SMP /* Set the CPU affinity of a remapped interrupt */ int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); -#endif /* Free an IRQ */ int (*free_irq)(int); diff --git a/include/linux/irq.h b/include/linux/irq.h index 61f5cec031e0..47a937cd84af 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -150,9 +150,7 @@ struct irq_data { void *handler_data; void *chip_data; struct msi_desc *msi_desc; -#ifdef CONFIG_SMP cpumask_var_t affinity; -#endif }; /* From ea131377148cdfe90641b42ae9aa5a6b3a4fa327 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:22 +0200 Subject: [PATCH 080/612] uprobes: Valid_vma() should reject VM_HUGETLB __replace_page() obviously can't work with the hugetlbfs mappings, uprobe_register() will likely crash the kernel. Change valid_vma() to check VM_HUGETLB as well. As for PageTransHuge() no need to worry, vma->vm_file != NULL. 
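For reference, a condensed sketch of the resulting predicate (the function name here is shortened, not the real one): with VM_HUGETLB added to the tested set, any hugetlb vma carries a bit outside VM_READ|VM_EXEC and so fails the equality and is rejected for registration:

	/* true only for a private, read+exec, non-writable, non-hugetlb vma */
	static bool ok_to_register(struct vm_area_struct *vma)
	{
		unsigned long checked = VM_HUGETLB | VM_READ | VM_WRITE |
					VM_EXEC | VM_SHARED;

		return (vma->vm_flags & checked) == (VM_READ | VM_EXEC);
	}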
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154322.GA9561@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b52376d02332..f0d04530af63 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; From cc359d180fa9c25a4c1819f17e07a422d788353d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:25 +0200 Subject: [PATCH 081/612] uprobes: __copy_insn() should ensure a_ops->readpage != NULL __copy_insn() blindly calls read_mapping_page(); this will crash the kernel if ->readpage == NULL, so add the necessary check. For example, hugetlbfs_aops->readpage is NULL. Perhaps we should change read_mapping_page() instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154325.GA9568@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0d04530af63..604930bf9c92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -610,6 +610,9 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins if (!filp) return -EINVAL; + if (!mapping->a_ops->readpage) + return -EIO; + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); off1 = offset &= ~PAGE_MASK; From 5323ce71e4b4e1f188ebbc0cc7776885ea6c75fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:28 +0200 Subject: [PATCH 082/612] uprobes: Write_opcode()->__replace_page() can race with try_to_unmap() write_opcode() gets old_page via get_user_pages() and then calls __replace_page() which assumes that this old_page is still mapped after pte_offset_map_lock(). This is not true if this old_page was already try_to_unmap()'ed, and in this case everything __replace_page() does with old_page is wrong. Just for example, put_page() is not balanced. I think it is possible to teach __replace_page() to handle this unlikely case correctly, but this patch simply changes it to use page_check_address() and return -EAGAIN if it fails. The caller should notice this error code and retry. Note: write_opcode() asks for cleanups; I'll try to do this in a separate patch.
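To sketch the retry contract this introduces (the helper name is hypothetical and the details are elided): the caller treats -EAGAIN as "the page was unmapped under us, redo the whole lookup":

	static int poke_insn(struct mm_struct *mm, unsigned long vaddr)
	{
		int ret;
	retry:
		ret = get_page_and_replace(mm, vaddr);	/* hypothetical step */
		if (unlikely(ret == -EAGAIN))	/* lost a race with try_to_unmap() */
			goto retry;
		return ret;
	}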
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154328.GA9571@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 604930bf9c92..3ccdb29ee8d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -129,33 +129,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long addr; - int err = -EFAULT; + spinlock_t *ptl; + pte_t *ptep; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) - goto out; + return -EFAULT; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + return -EAGAIN; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -174,10 +158,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; + return 0; } /** @@ -222,9 +204,10 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; + unsigned long pgoff; loff_t addr; int ret; - +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) @@ -269,9 +252,9 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, memcpy(vaddr_new, vaddr_old, PAGE_SIZE); /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + pgoff = (vaddr & ~PAGE_MASK); + BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -291,6 +274,8 @@ unlock_out: put_out: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } From c1914a0936f79ed0236f670122e06e36e4d332ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:31 +0200 Subject: [PATCH 083/612] uprobes: Install_breakpoint() should fail if is_swbp_insn() == T install_breakpoint() returns -EEXIST if is_swbp_insn(orig_insn) == T, and the caller treats this code as success. This is doubly wrong. The successful return should set UPROBE_COPY_INSN, but the real problem is that it shouldn't succeed. If the probed insn is int3, the application should get SIGTRAP; this won't happen with a uprobe. Probably we can fix this: we can add the UPROBE_SHARED_BP flag and teach handle_swbp/set_orig_insn to handle this case correctly. But this needs some complications, and we have other insns which can't be probed, so let's make a simple fix for now. I think this needs a cleanup. UPROBE_COPY_INSN should die; copy_insn() should be called by alloc_uprobe().
arch_uprobe_analyze_insn() depends on ->mm (ia32_compat) but it is called only once. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154331.GA9578@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ccdb29ee8d6..ec78152e32e9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -693,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) From 268720903f87e0b84b161626c4447b81671b5d18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:33 +0200 Subject: [PATCH 084/612] uprobes: Rework register_for_each_vma() to make it O(n) Currently register_for_each_vma() is O(n ** 2) + O(n ** 3): every time, find_next_vma_info() "restarts" the vma_prio_tree_foreach() loop, and each iteration rechecks the whole try_list. This also means that try_list can grow "indefinitely" if register/unregister races with munmap/mmap activity, even if the number of mappings is bounded at any time. With this patch register_for_each_vma() builds the list of mm/vaddr structures only once and does install_breakpoint() for each entry. We do not care about the new mappings which can be created after build_map_info() drops mapping->i_mmap_mutex; uprobe_mmap() should do its work. Note that we do not allocate map_info under i_mmap_mutex, as this can deadlock with page reclaim (but see the next patch). So we use 2 lists, "curr" which we are going to return, and "prev" which holds the already allocated memory. The main loop dequeues the entry from "prev" (initially it is empty), and if "prev" becomes empty again it counts the number of entries we need to pre-allocate outside of i_mmap_mutex. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154333.GA9581@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 199 +++++++++++++++++----------------- 1 file changed, 86 insertions(+), 113 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ec78152e32e9..4e0db3496d70 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended.
- */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -742,139 +731,123 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + loff_t vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) { + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) +{ + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } + if (!prev) { + more++; + continue; } - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. - */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - return vi; - } + info = prev; + prev = prev->next; + info->next = curr; + curr = info; + + info->mm = vma->vm_mm; + info->vaddr = vma_address(vma, offset); } - - return NULL; -} - -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. 
- */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; - - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); - - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; - return retvi; + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } + + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; + struct map_info *info; + int err = 0; - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - ret = 0; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; + loff_t vaddr; - for (;;) { - vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); - if (!vi) - break; + if (err) + goto free; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } - - mm = vi->mm; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vma = find_vma(mm, (unsigned long)info->vaddr); + if (!vma || !valid_vma(vma, is_register)) + goto unlock; + vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vaddr != info->vaddr) + goto unlock; - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); - - up_write(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) From 7a5bfb66b07f22d2429db776da7bb8b57bfb5cff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:36 +0200 Subject: [PATCH 085/612] uprobes: Change build_map_info() to try kmalloc(GFP_NOWAIT) first build_map_info() doesn't allocate the memory under i_mmap_mutex to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT first, it should work in the likely case and thus we almost never need the pre-alloc-and-retry path. 
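In outline, the allocation fast path in build_map_info() then becomes (sketch only; the exact hunk follows):

	if (!prev && !more) {
		/* optimistic: no harm done if this fails under pressure */
		prev = kmalloc(sizeof(struct map_info),
			       GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
		if (prev)
			prev->next = NULL;
	}
	if (!prev) {
		/* slow path: count, drop i_mmap_mutex, kmalloc(GFP_KERNEL) */
		more++;
		continue;
	}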
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154336.GA9588@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e0db3496d70..897417dbca8e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -761,6 +761,16 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) if (!valid_vma(vma, is_register)) continue; + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; + } if (!prev) { more++; continue; From c5784de2b351fe871bb57487878f7fc7ec5b075c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jun 2012 17:43:39 +0200 Subject: [PATCH 086/612] uprobes: Document uprobe_register() vs uprobe_mmap() race Because the mind is treacherous and makes us forget we need to write stuff down. Signed-off-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154339.GA9591@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 897417dbca8e..2671d9ad49be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consequtively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -339,7 +356,9 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). + */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -845,6 +864,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobe_hash(). + */ if (err == -EEXIST) err = 0; } else { @@ -1054,8 +1077,10 @@ int uprobe_mmap(struct vm_area_struct *vma) } ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobe_hash(). 
+ */ if (ret == -EEXIST) { ret = 0; From d436615e60c386095dac4a9bf72b08868d2a7564 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:42 +0200 Subject: [PATCH 087/612] uprobes: Copy_insn() shouldn't depend on mm/vma/vaddr 1. copy_insn() doesn't need "addr", it can use uprobe->offset. Remove this argument. 2. Change copy_insn/__copy_insn to accept "struct file*" instead of vma. copy_insn() is called only once and mm/vma/vaddr are random, it shouldn't depend on them. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154342.GA9598@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2671d9ad49be..08ef566da763 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -591,10 +591,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, unsigned long offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; unsigned long off1; @@ -625,15 +624,13 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -644,13 +641,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, bytes - nbytes, uprobe->offset + nbytes)) return -ENOMEM; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -696,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, addr = (unsigned long)vaddr; if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; From fc36f59565861af2e897225bc3782479a26c5d5a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:44 +0200 Subject: [PATCH 088/612] uprobes: Copy_insn() should not return -ENOMEM if __copy_insn() fails copy_insn() returns -ENOMEM if the first __copy_insn() fails, it should return the correct error code. 
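In other words (sketch; __copy_insn() can already fail with e.g. -EINVAL or -EIO, and those codes must reach the caller unchanged instead of being reported as an allocation failure):

	err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
			  bytes - nbytes, uprobe->offset + nbytes);
	if (err)
		return err;	/* was: return -ENOMEM, masking the real cause */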
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154344.GA9601@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 08ef566da763..2db1d94d7dfc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -641,10 +641,10 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); From eb2bf57bee42c7565032f93adaa211e2c9fcc52c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:47 +0200 Subject: [PATCH 089/612] uprobes: No need to re-check vma_address() in write_opcode() write_opcode() is called by register_for_each_vma() and uprobe_mmap() paths. In both cases the caller has already verified this vaddr under mmap_sem, no need to re-check. Note also that this check is wrong anyway, we should not truncate loff_t returned by vma_address() if we do not trust this mapping. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154347.GA9604@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2db1d94d7dfc..14c71a2aadad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -211,7 +211,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; struct uprobe *uprobe; unsigned long pgoff; - loff_t addr; int ret; retry: /* Read the page with vaddr into memory */ @@ -235,10 +234,6 @@ retry: if (mapping != vma->vm_file->f_mapping) goto put_out; - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) From d9c4a30e82614d43b55893a73f31e7284007ce82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:50 +0200 Subject: [PATCH 090/612] uprobes: Move BUG_ON(UPROBE_SWBP_INSN_SIZE) from write_opcode() to install_breakpoint() write_opcode() ensures that UPROBE_SWBP_INSN doesn't cross the page boundary. This looks a bit confusing, the check does not depend on vaddr and it is enough to do it only once right after install_breakpoint()->arch_uprobe_analyze_insn(). 
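The check is pure arithmetic on the file offset; for example, with 4K pages and a hypothetical 4-byte breakpoint instruction (on x86 UPROBE_SWBP_INSN_SIZE is the single int3 byte 0xcc, so the check can never fire there):

	(uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE
	4092 + 4 = 4096 -> OK, the insn ends exactly at the page boundary
	4093 + 4 = 4097 -> BUG, the insn would cross into the next page

vaddr does not appear in the expression at all, which is why a single check after arch_uprobe_analyze_insn() is enough.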
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154350.GA9611@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 14c71a2aadad..b9c61bda9029 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -210,7 +210,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; - unsigned long pgoff; int ret; retry: /* Read the page with vaddr into memory */ @@ -251,11 +250,7 @@ retry: vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - pgoff = (vaddr & ~PAGE_MASK); - BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -699,6 +694,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } From 449d0d7c9fb87277175db34c009c70cb348004a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:53 +0200 Subject: [PATCH 091/612] uprobes: Simplify the usage of uprobe->pending_list uprobe->pending_list is only used to create the temporary list, it has no meaning after we drop uprobes_mmap_hash(inode). No need to initialize this node or remove it from tmp_list, and we can use list_for_each_entry(). 
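Before/after shape of the two loops (sketch):

	/* before: each entry had to be unlinked while walking */
	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
		list_del(&uprobe->pending_list);
		/* ... use uprobe ... */
	}

	/* after: tmp_list is never looked at again, a plain walk is fine */
	list_for_each_entry(uprobe, &tmp_list, pending_list) {
		/* ... use uprobe ... */
	}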
Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154353.GA9614@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b9c61bda9029..7d5c78f063ae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -513,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -1037,7 +1036,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; int ret, count; @@ -1055,10 +1054,9 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); if (!ret) { vaddr = vma_address(vma, uprobe->offset); @@ -1106,7 +1104,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1123,12 +1121,10 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); vaddr = vma_address(vma, uprobe->offset); - if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before From 816c03fbabe64fa09f66fbb64e932081af381415 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:55 +0200 Subject: [PATCH 092/612] uprobes: Don't use loff_t for the valid virtual address loff_t looks confusing when it is used for the virtual address. Change map_info and install_breakpoint/remove_breakpoint paths to use "unsigned long". The patch doesn't change vma_address(), it can't return "long" because it is used to verify the mapping. But probably this needs some cleanups too. 
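The mismatch is easiest to see at a call site (sketch; the cast is the one register_for_each_vma() used to need):

	loff_t vaddr;					/* signed 64-bit file offset */
	vma = find_vma(mm, (unsigned long)vaddr);	/* truncating cast on 32 bit */

versus:

	unsigned long vaddr;				/* the type the MM API expects */
	vma = find_vma(mm, vaddr);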
Signed-off-by: Oleg Nesterov Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154355.GA9622@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7d5c78f063ae..4df84b76dd48 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -664,9 +664,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -679,8 +678,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); if (ret) @@ -689,7 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; @@ -709,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. */ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -717,9 +714,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } @@ -743,7 +740,7 @@ static void delete_uprobe(struct uprobe *uprobe) struct map_info { struct map_info *next; struct mm_struct *mm; - loff_t vaddr; + unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) @@ -837,7 +834,6 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - loff_t vaddr; if (err) goto free; @@ -847,9 +843,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (!vma || !valid_vma(vma, is_register)) goto unlock; - vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != info->vaddr) + vma_address(vma, uprobe->offset) != info->vaddr) goto unlock; if (is_register) { @@ -1055,10 +1050,8 @@ int uprobe_mmap(struct vm_area_struct *vma) count = 0; list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; - if (!ret) { - vaddr = vma_address(vma, uprobe->offset); + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { put_uprobe(uprobe); @@ -1122,9 +1115,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, &tmp_list); list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; + loff_t vaddr = vma_address(vma, uprobe->offset); - vaddr = vma_address(vma, uprobe->offset); if (vaddr >= start && vaddr < end) { 
/* * An unregister could have removed the probe before From 593609a59600c8377f311b300f14deacb155b9a4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:59 +0200 Subject: [PATCH 093/612] uprobes: __copy_insn() needs "loff_t offset" 1. __copy_insn() needs "loff_t offset", not "unsigned long", to read the file. 2. use pgoff_t for "idx" and remove the unnecessary typecast. 3. fix the typo, "&=" is not what we want 4. can't resist, rename off1 to off. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154359.GA9625@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4df84b76dd48..d1b2eeb80837 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -581,12 +581,12 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) static int __copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, unsigned long offset) + unsigned long nbytes, loff_t offset) { struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; @@ -594,8 +594,8 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, if (!mapping->a_ops->readpage) return -EIO; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -606,7 +606,7 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); From e227051b13956b8f71c0abecc41ad351e64671c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:44:01 +0200 Subject: [PATCH 094/612] uprobes: Remove the unnecessary initialization in add_utask() Trivial cleanup. No need to nullify ->active_uprobe after kzalloc(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154401.GA9633@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1b2eeb80837..f93532748bca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1392,7 +1392,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } From 650513979a437c32d7a0a84f0ed952a55bbb5583 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 16 Jun 2012 21:47:37 -0700 Subject: [PATCH 095/612] x86-64, reboot: Allow reboot=bios and reboot-cpu override on x86-64 With the revamped realmode trampoline code, it is trivial to extend support for reboot=bios to x86-64. Furthermore, while we are at it, remove the restriction that we can only override the reboot CPU on 32 bits. Signed-off-by: H.
Peter Anvin Link: http://lkml.kernel.org/n/tip-jopx7y6g6dbcx4tpal8q0jlr@git.kernel.org --- arch/x86/include/asm/emergency-restart.h | 2 - arch/x86/include/asm/realmode.h | 3 +- arch/x86/include/asm/reboot.h | 4 +- arch/x86/kernel/reboot.c | 52 +++++++++---------- arch/x86/realmode/rm/Makefile | 2 +- arch/x86/realmode/rm/header.S | 4 +- .../x86/realmode/rm/{reboot_32.S => reboot.S} | 26 ++++++++-- 7 files changed, 55 insertions(+), 38 deletions(-) rename arch/x86/realmode/rm/{reboot_32.S => reboot.S} (88%) diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index cc70c1c78ca4..75ce3f47d204 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -4,9 +4,7 @@ enum reboot_type { BOOT_TRIPLE = 't', BOOT_KBD = 'k', -#ifdef CONFIG_X86_32 BOOT_BIOS = 'b', -#endif BOOT_ACPI = 'a', BOOT_EFI = 'e', BOOT_CF9 = 'p', diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fce3f4ae5bd6..fe1ec5bcd846 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,8 +21,9 @@ struct real_mode_header { u32 wakeup_header; #endif /* APM/BIOS reboot */ -#ifdef CONFIG_X86_32 u32 machine_real_restart_asm; +#ifdef CONFIG_X86_64 + u32 machine_real_restart_seg; #endif }; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 92f297069e87..a82c4f1b4d83 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,8 +18,8 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(unsigned int type); -/* These must match dispatch_table in reboot_32.S */ +void __noreturn machine_real_restart(unsigned int type); +/* These must match dispatch in arch/x86/realmore/rm/reboot.S */ #define MRR_BIOS 0 #define MRR_APM 1 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48edb847c..6ddb9cd0ced0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -20,14 +20,12 @@ #include #include #include +#include -#ifdef CONFIG_X86_32 -# include -# include -# include -#else -# include -#endif +#include +#include +#include +#include /* * Power off function, if any @@ -49,7 +47,7 @@ int reboot_force; */ static int reboot_default = 1; -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP static int reboot_cpu = -1; #endif @@ -67,8 +65,8 @@ bool port_cf9_safe = false; * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] * warm Don't set the cold reboot flag * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS (only for X86_32) - * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * bios Reboot by jumping through the BIOS + * smp Reboot by executing reset on BSP or other CPU * triple Force a triple fault (init) * kbd Use the keyboard controller. cold reset (default) * acpi Use the RESET_REG in the FADT @@ -95,7 +93,6 @@ static int __init reboot_setup(char *str) reboot_mode = 0; break; -#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP case 's': if (isdigit(*(str+1))) { @@ -112,7 +109,6 @@ static int __init reboot_setup(char *str) #endif /* CONFIG_SMP */ case 'b': -#endif case 'a': case 'k': case 't': @@ -138,7 +134,6 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". 
:-) @@ -157,11 +152,8 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } -void machine_real_restart(unsigned int type) +void __noreturn machine_real_restart(unsigned int type) { - void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header->machine_real_restart_asm; - local_irq_disable(); /* @@ -181,7 +173,11 @@ void machine_real_restart(unsigned int type) /* * Switch back to the initial page table. */ +#ifdef CONFIG_X86_32 load_cr3(initial_page_table); +#else + write_cr3(real_mode_header->trampoline_pgd); +#endif /* * Write 0x1234 to absolute memory location 0x472. The BIOS reads @@ -192,14 +188,21 @@ void machine_real_restart(unsigned int type) *((unsigned short *)0x472) = reboot_mode; /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -#endif /* CONFIG_X86_32 */ - /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ @@ -223,11 +226,9 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) } /* - * This is a single dmi_table handling all reboot quirks. Note that - * REBOOT_BIOS is only available for 32bit + * This is a single dmi_table handling all reboot quirks. */ static struct dmi_system_id __initdata reboot_dmi_table[] = { -#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -377,7 +378,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, -#endif /* CONFIG_X86_32 */ { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, @@ -576,13 +576,11 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; -#ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; -#endif case BOOT_ACPI: acpi_reboot(); @@ -624,12 +622,10 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; -#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; -#endif /* Make certain the cpu I'm about to reboot on is online */ if (!cpu_online(reboot_cpu_id)) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 5b84a2d30888..b2d534cab25f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -22,7 +22,7 @@ wakeup-objs += video-bios.o realmode-y += header.o realmode-y += trampoline_$(BITS).o realmode-y += stack.o -realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-y += reboot.o realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) targets += $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index fadf48378ada..a28221d94e69 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -6,6 +6,7 @@ #include #include +#include #include "realmode.h" @@ -28,8 +29,9 @@ GLOBAL(real_mode_header) .long pa_wakeup_header #endif /* APM/BIOS reboot */ -#ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm +#ifdef CONFIG_X86_64 + .long __KERNEL32_CS #endif END(real_mode_header) diff --git 
a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S similarity index 88% rename from arch/x86/realmode/rm/reboot_32.S rename to arch/x86/realmode/rm/reboot.S index 114044876b3d..6bf8feac5557 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot.S @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "realmode.h" /* @@ -12,13 +14,31 @@ * doesn't work with at least one type of 486 motherboard. It is easy * to stop this code working; hence the copious comments. * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + * This code is called with the restart type (0 = BIOS, 1 = APM) in + * the primary argument register (%eax for 32 bit, %edi for 64 bit). */ .section ".text32", "ax" .code32 - - .balign 16 ENTRY(machine_real_restart_asm) + +#ifdef CONFIG_X86_64 + + /* Disable paging to drop us out of long mode */ + movl %cr0, %eax + andl $~X86_CR0_PG, %eax + movl %eax, %cr0 + jmp 1f /* "A branch" may be needed here, assume near is OK */ + +1: + xorl %eax, %eax + xorl %edx, %edx + movl $MSR_EFER, %ecx + wrmsr + + movl %edi, %eax + +#endif /* CONFIG_X86_64 */ + /* Set up the IDT for real mode. */ lidtl pa_machine_real_restart_idt From abf71f3066740f3b59c3f731b4b68ed335f7b24d Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Fri, 15 Jun 2012 18:10:55 +0300 Subject: [PATCH 096/612] x86/vsmp: Fix vector_allocation_domain's return value Commit 8637e38a ("x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector()") modified vector_allocation_domain() to return a boolean indicating if cpumask is dynamic or static. Adjust vSMP's callback implementation accordingly. Signed-off-by: Ido Yariv Acked-by: Shai Fultheim Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/1339773055-27397-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 3f0285ac00fa..fa5adb7c228c 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,9 +208,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. */ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool fill_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_setall(retmask); + return false; } static void vsmp_apic_post_init(void) From 76958a61e42fb6277a8431eb17e4bdb24176f1b7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 15 Jun 2012 19:06:44 +0200 Subject: [PATCH 097/612] perf/x86/amd: Fix RDPMC index calculation for AMD family 15h The RDPMC index calculation is wrong for AMD family 15h (X86_FEATURE_PERFCTR_CORE set). This leads to a #GP when accessing the counter: Pid: 2237, comm: syslog-ng Not tainted 3.5.0-rc1-perf-x86_64-standard-g130ff90 #135 AMD Pike/Pike RIP: 0010:[] [] x86_perf_event_update+0x27/0x66 While the MSR address offset is (index << 1), we must use the plain index to select the correct counter with RDPMC.
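Worked example for counter 3 on family 15h, where the perfctl/perfctr MSRs are interleaved so the MSR address offset is twice the counter number:

	MSR address:	MSR_F15H_PERF_CTR + (3 << 1)	/* address offset 6 */
	RDPMC index:	3				/* not 3 << 1 = 6 */

hence the one-line fix:

	hwc->event_base_rdpmc = hwc->idx;	/* instead of x86_pmu_addr_offset(hwc->idx) */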
Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 766c76d5ec4c..d1f38c9509d0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -823,7 +823,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx); + hwc->event_base_rdpmc = hwc->idx; } } From 4b4969b14490a4f65b572b8f180164181104b5e1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:30 +0800 Subject: [PATCH 098/612] perf: Export perf_assign_events() Export perf_assign_events() so the uncore code can use it to schedule events. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- arch/x86/kernel/cpu/perf_event.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d1f38c9509d0..6d32aefc9dbd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -626,7 +626,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { + if (c->idxmsk64 & (~0ULL << X86_PMC_IDX_FIXED)) { idx = X86_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) @@ -693,8 +693,8 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3df3de9452a9..83238f2a12b2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -481,6 +481,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); From fbfc623f8231c8d8c78aab5841e9c6e5811ab638 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:31 +0800 Subject: [PATCH 099/612] perf: Avoid race between cpu hotplug and installing event perf_event_open() requires that the cpu on which to install the event is online, but the cpu can go offline after perf_event_open() checks that. Add a get_online_cpus()/put_online_cpus() pair to avoid the race.
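The fix is the canonical hotplug bracket (sketch; the exact placement inside perf_event_open() is in the hunk below):

	get_online_cpus();
	/*
	 * Any cpu found online in here is guaranteed to stay
	 * online until the matching put_online_cpus().
	 */
	/* ... validate the cpu, allocate and install the event ... */
	put_online_cpus();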
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index d7d71d6ec972..31d182e01549 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6252,6 +6252,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6391,6 +6393,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6419,6 +6423,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: From e2d37cd213dcc0aeb3db4b37b9bd1710fe36fbf7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:32 +0800 Subject: [PATCH 100/612] perf: Allow the PMU driver to choose the CPU on which to install events Allow the pmu->event_init callback to change event->cpu, so the PMU driver can choose the CPU on which to install events. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 31d182e01549..fa36a39e8bb7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6306,7 +6306,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6379,16 +6379,16 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); From 0cda4c023132aa93f2dd94811061f812e88daf4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:33 +0800 Subject: [PATCH 101/612] perf: Introduce perf_pmu_migrate_context() Originally from Peter Zijlstra. The helper migrates perf events from one cpu to another cpu. 
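A typical caller is a package-scoped PMU reacting to cpu hotplug, e.g. (usage sketch, matching what the uncore code later in this series does on CPU_DOWN_PREPARE):

	/* move every event of this pmu from the outgoing cpu to its successor */
	perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);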
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1ce887abcc5c..76c5c8b724a7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1107,6 +1107,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, struct task_struct *task, perf_overflow_handler_t callback, void *context); +extern void perf_pmu_migrate_context(struct pmu *pmu, + int src_cpu, int dst_cpu); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/events/core.c b/kernel/events/core.c index fa36a39e8bb7..f1cf0edeb39a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -6379,6 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { + synchronize_rcu(); perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -6484,6 +6487,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { From 087bfbb032691262f2f7d52b910652450c5554b8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:34 +0800 Subject: [PATCH 102/612] perf/x86: Add generic Intel uncore PMU support This patch adds the generic Intel uncore PMU support, including helper functions that add/delete uncore events, a hrtimer that periodically polls the counters to avoid overflow and code that places all events for a particular socket onto a single cpu. The code design is based on the structure of Sandy Bridge-EP's uncore subsystem, which consists of a variety of components, each component contains one or more "boxes". (Tooling support follows in the next patches.) 
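The heart of the "one collector cpu per socket" policy is a simple mask check at cpu-online time (sketch of uncore_event_init_cpu() below):

	/* does this package already have a collector cpu? */
	for_each_cpu(i, &uncore_cpu_mask) {
		if (topology_physical_package_id(i) == phys_id)
			return;
	}
	/* no: this cpu collects the uncore events for its package */
	cpumask_set_cpu(cpu, &uncore_cpu_mask);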
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-6-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 4 +- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 878 ++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 204 ++++ 3 files changed, 1085 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.c create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.h diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfdd..bac4c3804cc7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 000000000000..fe76a07dfdbc --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,878 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); + +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_msr_fixed_ctr(box); + hwc->config_base = uncore_msr_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_msr_event_ctl(box, hwc->idx); + hwc->event_base = uncore_msr_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. 
+ */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(int cpu) +{ + struct intel_uncore_box *box; + + box = kmalloc_node(sizeof(*box), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!box) + return NULL; + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. 
+ */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), + smp_processor_id()); +} + +static int uncore_collect_events(struct intel_uncore_box *box, + struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_event_constraint(struct intel_uncore_type *type, + struct perf_event *event) +{ + struct event_constraint *c; + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static int uncore_assign_events(struct intel_uncore_box *box, + int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, ret, wmin, wmax; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_event_constraint(box->pmu->type, + box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + assign[i] = hwc->idx; + } + if (i == n) + return 0; + + /* slow path */ + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + return ret ? 
-EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
+ */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int assign[UNCORE_PMC_IDX_MAX]; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, assign, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + type->attr_groups[1] = NULL; +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + 
+ pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? 
*/ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); +} + +static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our 
notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu; + + switch (boot_cpu_data.x86_model) { + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + ret = uncore_cpu_init(); + if (ret) + goto fail; + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 000000000000..49a6bfbba0de --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,204 @@ +#include +#include +#include +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_BOX_HASH_SIZE 8 + +#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xffff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + int single_fixed; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect 
events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct intel_uncore_pmu *pmu; + struct hrtimer hrtimer; + struct list_head list; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->event_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->perf_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} 
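The uncore_msr_*() helpers above all compute a register address the same
way: a base register taken from struct intel_uncore_type, plus the
msr_offset stride once per further box instance, plus the counter index.
A minimal user-space sketch of that arithmetic follows; it is not kernel
code, the struct subset and helper name are illustrative, and the Cbox
base/stride values are the SNB-EP ones from a later patch in this series:

#include <stdio.h>

/* illustrative subset of struct intel_uncore_type (addressing fields only) */
struct uncore_layout {
	unsigned perf_ctr;	/* MSR address of counter 0 in box 0 */
	unsigned event_ctl;	/* MSR address of control register 0 in box 0 */
	unsigned msr_offset;	/* address stride between consecutive boxes */
};

/* mirrors uncore_msr_perf_ctr(): counter idx of box instance pmu_idx */
static unsigned perf_ctr_msr(const struct uncore_layout *t, int pmu_idx, int idx)
{
	return idx + t->perf_ctr + t->msr_offset * pmu_idx;
}

int main(void)
{
	/* assumed SNB-EP Cbox layout: CTR0 at 0xd16, CTL0 at 0xd10, stride 0x20 */
	struct uncore_layout cbox = { 0xd16, 0xd10, 0x20 };

	/* counter 2 of Cbox 3: 0xd16 + 3 * 0x20 + 2 == 0xd78 */
	printf("0x%x\n", perf_ctr_msr(&cbox, 3, 2));
	return 0;
}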
From fcde10e916326545e8fec1807357c68ef08dc443 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:35 +0800 Subject: [PATCH 103/612] perf/x86: Add Intel Nehalem and Sandy Bridge uncore PMU support Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-7-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 195 ++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 50 +++++ 2 files changed, 245 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index fe76a07dfdbc..3ed941ac3745 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -10,6 +10,192 @@ static cpumask_t uncore_cpu_mask; static struct event_constraint constraint_fixed = EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 count; + rdmsrl(event->hw.event_base, count); + return count; +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + 
&snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, + NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), + /* full cache line writes to DRAM */ + INTEL_UNCORE_EVENT_DESC(QMC_WRITES_FULL_ANY, "event=0x2f,umask=0xf"), + /* Quickpath Memory Controller normal priority read requests */ + INTEL_UNCORE_EVENT_DESC(QMC_NORMAL_READS_ANY, "event=0x2c,umask=0xf"), + /* Quickpath Home Logic read requests from the IOH */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_READS, + "event=0x20,umask=0x1"), + /* Quickpath Home Logic write requests from the IOH */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_WRITES, + "event=0x20,umask=0x2"), + /* Quickpath Home Logic read requests from a remote socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_READS, + "event=0x20,umask=0x4"), + /* Quickpath Home Logic write requests from a remote socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_WRITES, + "event=0x20,umask=0x8"), + /* Quickpath Home Logic read requests from the local socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_READS, + "event=0x20,umask=0x10"), + /* Quickpath Home Logic write requests from the local socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_WRITES, + "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) { @@ -808,6 +994,15 @@ static int __init uncore_cpu_init(void) int ret, cpu; switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + msr_uncores = snb_msr_uncores; + break; default: return 0; } diff 
--git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 49a6bfbba0de..eeb5ca5815a8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -15,6 +15,56 @@ #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; From 14371cce03c2fc393997e17f979e76674b7f392a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:36 +0800 Subject: [PATCH 104/612] perf: Add generic PCI uncore PMU device support This patch adds generic support for uncore PMUs presented as PCI devices. (These come in addition to the CPU/MSR based uncores.) Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-8-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 175 +++++++++++++++++- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 66 +++++++ 2 files changed, 236 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 3ed941ac3745..e20c65a0e108 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2,6 +2,11 @@ static struct intel_uncore_type *empty_uncore[] = { NULL, }; static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 
255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -205,13 +210,13 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, hwc->last_tag = ++box->tags[idx]; if (hwc->idx == UNCORE_PMC_IDX_FIXED) { - hwc->event_base = uncore_msr_fixed_ctr(box); - hwc->config_base = uncore_msr_fixed_ctl(box); + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); return; } - hwc->config_base = uncore_msr_event_ctl(box, hwc->idx); - hwc->event_base = uncore_msr_perf_ctr(box, hwc->idx); + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); } static void uncore_perf_event_update(struct intel_uncore_box *box, @@ -305,6 +310,22 @@ struct intel_uncore_box *uncore_alloc_box(int cpu) static struct intel_uncore_box * uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + return *per_cpu_ptr(pmu->box, cpu); } @@ -706,6 +727,13 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->attr_groups[1] = NULL; } +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; @@ -725,6 +753,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) pmus[i].func_id = -1; pmus[i].pmu_idx = i; pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); pmus[i].box = alloc_percpu(struct intel_uncore_box *); if (!pmus[i].box) goto fail; @@ -773,6 +802,127 @@ fail: return ret; } +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, + struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. 
+ */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void __devexit uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + static void __cpuinit uncore_cpu_dying(int cpu) { struct intel_uncore_type *type; @@ -921,6 +1071,7 @@ static void __cpuinit uncore_event_exit_cpu(int cpu) cpumask_set_cpu(target, &uncore_cpu_mask); uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); } static void __cpuinit uncore_event_init_cpu(int cpu) @@ -936,6 +1087,7 @@ static void __cpuinit uncore_event_init_cpu(int cpu) cpumask_set_cpu(cpu, &uncore_cpu_mask); uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); } static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, @@ -1051,6 +1203,14 @@ static int __init uncore_pmus_register(void) } } + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + return 0; } @@ -1061,9 +1221,14 @@ static int __init intel_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; - ret = uncore_cpu_init(); + ret = uncore_pci_init(); if (ret) goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } uncore_pmus_register(); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index eeb5ca5815a8..aa01df87b8de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -1,5 +1,6 @@ #include #include +#include #include #include "perf_event.h" @@ -110,6 +111,7 @@ struct intel_uncore_pmu { int 
func_id;
 	struct intel_uncore_type *type;
 	struct intel_uncore_box ** __percpu box;
+	struct list_head box_list;
 };
 
 struct intel_uncore_box {
@@ -123,6 +125,7 @@ struct intel_uncore_box {
 	struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
 	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
 	u64 tags[UNCORE_PMC_IDX_MAX];
+	struct pci_dev *pci_dev;
 	struct intel_uncore_pmu *pmu;
 	struct hrtimer hrtimer;
 	struct list_head list;
@@ -161,6 +164,33 @@ static ssize_t uncore_event_show(struct kobject *kobj,
 	return sprintf(buf, "%s", event->config);
 }
 
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+	return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+	return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+	return idx * 8 + box->pmu->type->perf_ctr;
+}
+
 static inline
 unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
 {
@@ -200,6 +230,42 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
 		box->pmu->type->msr_offset * box->pmu->pmu_idx;
 }
 
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+	if (box->pci_dev)
+		return uncore_pci_fixed_ctl(box);
+	else
+		return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+	if (box->pci_dev)
+		return uncore_pci_fixed_ctr(box);
+	else
+		return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+	if (box->pci_dev)
+		return uncore_pci_event_ctl(box, idx);
+	else
+		return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+	if (box->pci_dev)
+		return uncore_pci_perf_ctr(box, idx);
+	else
+		return uncore_msr_perf_ctr(box, idx);
+}
+
 static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
 {
 	return box->pmu->type->perf_ctr_bits;

From 7c94ee2e0917b2ea56498bff939c8aa55da27207 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Fri, 15 Jun 2012 14:31:37 +0800
Subject: [PATCH 105/612] perf/x86: Add Intel Nehalem and Sandy Bridge-EP uncore support

The uncore subsystem in Sandy Bridge-EP consists of 8 components:
Ubox, Caching Agent, Home Agent, Memory controller, Power Control,
QPI Link Layer, R2PCIe, R3QPI.
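Each box type shows up in user space as its own perf PMU, and the
event_init path added earlier in this series makes uncore events
counting-only, system-wide, and pinned to one CPU per package. As a
hedged sketch, opening one such counter with perf_event_open() could
look like this; the sysfs PMU name and the event/umask encoding
(CAS_COUNT_RD from the IMC event list below) are assumptions to adjust
per machine:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* open IMC CAS_COUNT_RD (event=0x4, umask=0x3) on one cpu of the target socket */
static int open_imc_cas_rd(int cpu)
{
	struct perf_event_attr attr;
	FILE *f;
	int type;

	/* PMU name is an assumption; multi-box types register as uncore_imc_0 etc. */
	f = fopen("/sys/bus/event_source/devices/uncore_imc_0/type", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;			/* dynamic PMU type from sysfs */
	attr.config = 0x4 | (0x3 << 8);		/* event=0x4, umask=0x3 */

	/* pid = -1: uncore events count system-wide; cpu selects the package */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

A subsequent read(2) on the returned fd yields the accumulated 64-bit count.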
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-9-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 484 ++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 86 ++++ include/linux/pci_ids.h | 11 + 3 files changed, 581 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index e20c65a0e108..d34f68bf990b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -21,6 +21,482 @@ DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); + +/* Sandy Bridge-EP uncore support */ +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | + SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, + SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, + 
struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 count; + + rdmsrl(hwc->event_base, count); + return count; +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), + /* read */ + INTEL_UNCORE_EVENT_DESC(CAS_COUNT_RD, "event=0x4,umask=0x3"), + /* write */ + INTEL_UNCORE_EVENT_DESC(CAS_COUNT_WR, "event=0x4,umask=0xc"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "event=0x14"), + /* outgoing data+nondata flits */ + INTEL_UNCORE_EVENT_DESC(TxL_FLITS_ACTIVE, "event=0x0,umask=0x6"), + /* DRS data received */ + INTEL_UNCORE_EVENT_DESC(DRS_DATA, "event=0x2,umask=0x8"), + /* NCB data received */ + INTEL_UNCORE_EVENT_DESC(NCB_DATA, "event=0x3,umask=0x4"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = snbep_uncore_msr_read_counter, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + 
UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + 
.event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_descs = snbep_uncore_qpi_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* P2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = 
ubox_dev->bus->number;
+		/* get the Node ID of the local register */
+		pci_read_config_dword(ubox_dev, 0x40, &config);
+		nodeid = config;
+		/* get the Node ID mapping */
+		pci_read_config_dword(ubox_dev, 0x54, &config);
+		/*
+		 * every three bits in the Node ID mapping register maps
+		 * to a particular node.
+		 */
+		for (i = 0; i < 8; i++) {
+			if (nodeid == ((config >> (3 * i)) & 0x7)) {
+				pcibus_to_physid[bus] = i;
+				break;
+			}
+		}
+	};
+	return;
+}
+/* end of Sandy Bridge-EP uncore support */
+
 /* Sandy Bridge uncore support */
 static void snb_uncore_msr_enable_event(struct intel_uncore_box *box,
@@ -894,6 +1370,11 @@ static int __init uncore_pci_init(void)
 	int ret;
 
 	switch (boot_cpu_data.x86_model) {
+	case 45: /* Sandy Bridge-EP */
+		pci_uncores = snbep_pci_uncores;
+		uncore_pci_driver = &snbep_uncore_pci_driver;
+		snbep_pci2phy_map_init();
+		break;
 	default:
 		return 0;
 	}
@@ -1155,6 +1636,9 @@ static int __init uncore_cpu_init(void)
 	case 42: /* Sandy Bridge */
 		msr_uncores = snb_msr_uncores;
 		break;
+	case 45: /* Sandy Bridge-EP */
+		msr_uncores = snbep_msr_uncores;
+		break;
 	default:
 		return 0;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index aa01df87b8de..4d52db0d1dfe 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -65,6 +65,92 @@
 #define NHM_UNC_PERFEVTSEL0			0x3c0
 #define NHM_UNC_UNCORE_PMC0			0x3b0
 
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL	(1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS	(1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ		(1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN	(1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
+					 SNBEP_PMON_BOX_CTL_RST_CTRS | \
+					 SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK	0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK	0x0000ff00
+#define SNBEP_PMON_CTL_RST		(1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET		(1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT	(1 << 21)	/* only for QPI */
+#define SNBEP_PMON_CTL_EN		(1 << 22)
+#define SNBEP_PMON_CTL_INVERT		(1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK	0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK	(SNBEP_PMON_CTL_EV_SEL_MASK | \
+					 SNBEP_PMON_CTL_UMASK_MASK | \
+					 SNBEP_PMON_CTL_EDGE_DET | \
+					 SNBEP_PMON_CTL_INVERT | \
+					 SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK		0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PMON_CTL_UMASK_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK	0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK	0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT	(1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET	(1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL			0xf4
+#define SNBEP_PCI_PMON_CTL0			0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0			0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0	0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1	0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH	0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL		0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR		0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0		0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1		0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0		0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1		0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0			0xc16
+#define SNBEP_U_MSR_PMON_CTL0			0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL		0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR		0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0			0xd16
+#define SNBEP_C0_MSR_PMON_CTL0			0xd10
+#define SNBEP_C0_MSR_PMON_BOX_FILTER		0xd14
+#define SNBEP_C0_MSR_PMON_BOX_CTL		0xd04
+#define SNBEP_CBO_MSR_OFFSET			0x20
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0			0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0			0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER		0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_CTL		0xc24
+#define SNBEP_PCU_MSR_CORE_C3_CTR		0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR		0x3fd
 
 struct intel_uncore_ops;
 struct intel_uncore_pmu;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ab741b0d0074..5f187026b812 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2755,6 +2755,17 @@
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB7	0x3c27
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB8	0x3c2e
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB9	0x3c2f
+#define PCI_DEVICE_ID_INTEL_UNC_HA	0x3c46
+#define PCI_DEVICE_ID_INTEL_UNC_IMC0	0x3cb0
+#define PCI_DEVICE_ID_INTEL_UNC_IMC1	0x3cb1
+#define PCI_DEVICE_ID_INTEL_UNC_IMC2	0x3cb4
+#define PCI_DEVICE_ID_INTEL_UNC_IMC3	0x3cb5
+#define PCI_DEVICE_ID_INTEL_UNC_QPI0	0x3c41
+#define PCI_DEVICE_ID_INTEL_UNC_QPI1	0x3c42
+#define PCI_DEVICE_ID_INTEL_UNC_R2PCIE	0x3c43
+#define PCI_DEVICE_ID_INTEL_UNC_R3QPI0	0x3c44
+#define PCI_DEVICE_ID_INTEL_UNC_R3QPI1	0x3c45
+#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX	0x3ce0
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB	0x402f
 #define PCI_DEVICE_ID_INTEL_5100_16	0x65f0
 #define PCI_DEVICE_ID_INTEL_5100_21	0x65f5

From 46010ab2607aed9484816a8f1679c2ec3c8e7dcf Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Fri, 15 Jun 2012 14:31:38 +0800
Subject: [PATCH 106/612] perf/tool: Use data struct for arg passing in event parse function

Move all the bison arguments into a single structure. In upcoming
patches we are going to:

 - add more arguments
 - reuse the grammar for term parsing

so it's clearer to pack/separate related arguments.
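The shape of the change, reduced to a standalone C sketch (the names
loosely mirror the patch; fake_parse() stands in for the bison-generated
parse_events_parse(), which only ever sees an opaque pointer):

#include <stdio.h>

/* one struct carries all parser state, as parse_events_data__events does */
struct parse_state {
	int idx;		/* running event index */
	int nr_parsed;		/* stand-in for the event list */
};

/* the generated parser takes a single void *data instead of N parse-params */
static int fake_parse(void *_data)
{
	struct parse_state *data = _data;	/* each grammar action casts it back */

	data->idx++;
	data->nr_parsed++;
	return 0;
}

int main(void)
{
	struct parse_state data = { .idx = 3 };

	fake_parse(&data);
	printf("idx=%d parsed=%d\n", data.idx, data.nr_parsed);
	return 0;
}

Growing the interface later is then a one-line struct member addition
instead of a signature change in the parser and in every grammar action.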
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-10-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 16 ++++++----- tools/perf/util/parse-events.h | 8 ++++-- tools/perf/util/parse-events.y | 52 +++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 05dbc8b3c767..c71b29ad26ec 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -26,7 +26,7 @@ struct event_symbol { #ifdef PARSER_DEBUG extern int parse_events_debug; #endif -int parse_events_parse(struct list_head *list, int *idx); +int parse_events_parse(void *data); #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x @@ -789,25 +789,27 @@ int parse_events_modifier(struct list_head *list, char *str) int parse_events(struct perf_evlist *evlist, const char *str, int unset __used) { - LIST_HEAD(list); - LIST_HEAD(list_tmp); + struct parse_events_data__events data = { + .list = LIST_HEAD_INIT(data.list), + .idx = evlist->nr_entries, + }; YY_BUFFER_STATE buffer; - int ret, idx = evlist->nr_entries; + int ret; buffer = parse_events__scan_string(str); #ifdef PARSER_DEBUG parse_events_debug = 1; #endif - ret = parse_events_parse(&list, &idx); + ret = parse_events_parse(&data); parse_events__flush_buffer(buffer); parse_events__delete_buffer(buffer); parse_events_lex_destroy(); if (!ret) { - int entries = idx - evlist->nr_entries; - perf_evlist__splice_list_tail(evlist, &list, entries); + int entries = data.idx - evlist->nr_entries; + perf_evlist__splice_list_tail(evlist, &data.list, entries); return 0; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 8cac57ab4ee6..dc3c83a0ab8a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -63,6 +63,11 @@ struct parse_events__term { struct list_head list; }; +struct parse_events_data__events { + struct list_head list; + int idx; +}; + int parse_events__is_hardcoded_term(struct parse_events__term *term); int parse_events__term_num(struct parse_events__term **_term, int type_term, char *config, long num); @@ -83,8 +88,7 @@ int parse_events_add_pmu(struct list_head **list, int *idx, char *pmu , struct list_head *head_config); void parse_events_update_lists(struct list_head *list_event, struct list_head *list_all); -void parse_events_error(struct list_head *list_all, - int *idx, char const *msg); +void parse_events_error(void *data, char const *msg); int parse_events__test(void); void print_events(const char *event_glob); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 362cc59332ae..e533bf72ba9c 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -1,7 +1,6 @@ %name-prefix "parse_events_" -%parse-param {struct list_head *list_all} -%parse-param {int *idx} +%parse-param {void *_data} %{ @@ -64,18 +63,22 @@ events ',' event | event event: event_def PE_MODIFIER_EVENT { + struct parse_events_data__events *data = _data; + /* * Apply modifier on all events added by single event definition * (there could be more events added for multiple tracepoint * definitions via '*?'. 
*/ ABORT_ON(parse_events_modifier($1, $2)); - parse_events_update_lists($1, list_all); + parse_events_update_lists($1, &data->list); } | event_def { - parse_events_update_lists($1, list_all); + struct parse_events_data__events *data = _data; + + parse_events_update_lists($1, &data->list); } event_def: event_pmu | @@ -89,9 +92,10 @@ event_def: event_pmu | event_pmu: PE_NAME '/' event_config '/' { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_pmu(&list, idx, $1, $3)); + ABORT_ON(parse_events_add_pmu(&list, &data->idx, $1, $3)); parse_events__free_terms($3); $$ = list; } @@ -99,91 +103,106 @@ PE_NAME '/' event_config '/' event_legacy_symbol: PE_VALUE_SYM '/' event_config '/' { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; - ABORT_ON(parse_events_add_numeric(&list, idx, type, config, $3)); + ABORT_ON(parse_events_add_numeric(&list, &data->idx, + type, config, $3)); parse_events__free_terms($3); $$ = list; } | PE_VALUE_SYM sep_slash_dc { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; - ABORT_ON(parse_events_add_numeric(&list, idx, type, config, NULL)); + ABORT_ON(parse_events_add_numeric(&list, &data->idx, + type, config, NULL)); $$ = list; } event_legacy_cache: PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_cache(&list, idx, $1, $3, $5)); + ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, $5)); $$ = list; } | PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_cache(&list, idx, $1, $3, NULL)); + ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, NULL)); $$ = list; } | PE_NAME_CACHE_TYPE { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_cache(&list, idx, $1, NULL, NULL)); + ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, NULL, NULL)); $$ = list; } event_legacy_mem: PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_breakpoint(&list, idx, (void *) $2, $4)); + ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, + (void *) $2, $4)); $$ = list; } | PE_PREFIX_MEM PE_VALUE sep_dc { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_breakpoint(&list, idx, (void *) $2, NULL)); + ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, + (void *) $2, NULL)); $$ = list; } event_legacy_tracepoint: PE_NAME ':' PE_NAME { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_tracepoint(&list, idx, $1, $3)); + ABORT_ON(parse_events_add_tracepoint(&list, &data->idx, $1, $3)); $$ = list; } event_legacy_numeric: PE_VALUE ':' PE_VALUE { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_numeric(&list, idx, $1, $3, NULL)); + ABORT_ON(parse_events_add_numeric(&list, &data->idx, $1, $3, NULL)); $$ = list; } event_legacy_raw: PE_RAW { + struct parse_events_data__events *data = _data; struct list_head *list = NULL; - ABORT_ON(parse_events_add_numeric(&list, idx, PERF_TYPE_RAW, $1, NULL)); + 
ABORT_ON(parse_events_add_numeric(&list, &data->idx, + PERF_TYPE_RAW, $1, NULL)); $$ = list; } @@ -267,8 +286,7 @@ sep_slash_dc: '/' | ':' | %% -void parse_events_error(struct list_head *list_all __used, - int *idx __used, +void parse_events_error(void *data __used, char const *msg __used) { } From ac20de6fff445d6deb0c44c25946d198f79f2f00 Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Fri, 15 Jun 2012 14:31:39 +0800 Subject: [PATCH 107/612] perf/tool: Make the event parser re-entrant Make the event parser reentrant by creating a separate scanner for each parse. The scanner is passed to bison as an argument to the lexer. Signed-off-by: Zheng Yan [ Cleaned up the patch. ] Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-11-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 39 +++++++---- tools/perf/util/parse-events.h | 2 +- tools/perf/util/parse-events.l | 116 +++++++++++++++++++-------------- tools/perf/util/parse-events.y | 9 ++- 4 files changed, 100 insertions(+), 66 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c71b29ad26ec..ca8665e19c0d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -11,6 +11,7 @@ #include "cache.h" #include "header.h" #include "debugfs.h" +#include "parse-events-bison.h" #include "parse-events-flex.h" #include "pmu.h" @@ -26,7 +27,7 @@ struct event_symbol { #ifdef PARSER_DEBUG extern int parse_events_debug; #endif -int parse_events_parse(void *data); +int parse_events_parse(void *data, void *scanner); #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x @@ -787,26 +788,38 @@ int parse_events_modifier(struct list_head *list, char *str) return 0; } +static int parse_events__scanner(const char *str, void *data) +{ + YY_BUFFER_STATE buffer; + void *scanner; + int ret; + + ret = parse_events_lex_init(&scanner); + if (ret) + return ret; + + buffer = parse_events__scan_string(str, scanner); + +#ifdef PARSER_DEBUG + parse_events_debug = 1; +#endif + ret = parse_events_parse(data, scanner); + + parse_events__flush_buffer(buffer, scanner); + parse_events__delete_buffer(buffer, scanner); + parse_events_lex_destroy(scanner); + return ret; +} + int parse_events(struct perf_evlist *evlist, const char *str, int unset __used) { struct parse_events_data__events data = { .list = LIST_HEAD_INIT(data.list), .idx = evlist->nr_entries, }; - YY_BUFFER_STATE buffer; int ret; - buffer = parse_events__scan_string(str); - -#ifdef PARSER_DEBUG - parse_events_debug = 1; -#endif - ret = parse_events_parse(&data); - - parse_events__flush_buffer(buffer); - parse_events__delete_buffer(buffer); - parse_events_lex_destroy(); - + ret = parse_events__scanner(str, &data); if (!ret) { int entries = data.idx - evlist->nr_entries; perf_evlist__splice_list_tail(evlist, &data.list, entries); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index dc3c83a0ab8a..fa2b19b862e2 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -88,7 +88,7 @@ int parse_events_add_pmu(struct list_head **list, int *idx, char *pmu , struct list_head *head_config); void parse_events_update_lists(struct list_head *list_event, struct list_head *list_all); -void parse_events_error(void *data, char const *msg); +void parse_events_error(void *data, void *scanner, char const *msg); int 
parse_events__test(void); void print_events(const char *event_glob); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 618a8e788399..329794eea711 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -1,4 +1,6 @@ +%option reentrant +%option bison-bridge %option prefix="parse_events_" %option stack @@ -8,7 +10,10 @@ #include "parse-events-bison.h" #include "parse-events.h" -static int __value(char *str, int base, int token) +char *parse_events_get_text(yyscan_t yyscanner); +YYSTYPE *parse_events_get_lval(yyscan_t yyscanner); + +static int __value(YYSTYPE *yylval, char *str, int base, int token) { long num; @@ -17,35 +22,48 @@ static int __value(char *str, int base, int token) if (errno) return PE_ERROR; - parse_events_lval.num = num; + yylval->num = num; return token; } -static int value(int base) +static int value(yyscan_t scanner, int base) { - return __value(parse_events_text, base, PE_VALUE); + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + + return __value(yylval, text, base, PE_VALUE); } -static int raw(void) +static int raw(yyscan_t scanner) { - return __value(parse_events_text + 1, 16, PE_RAW); + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + + return __value(yylval, text + 1, 16, PE_RAW); } -static int str(int token) +static int str(yyscan_t scanner, int token) { - parse_events_lval.str = strdup(parse_events_text); + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + + yylval->str = strdup(text); return token; } -static int sym(int type, int config) +static int sym(yyscan_t scanner, int type, int config) { - parse_events_lval.num = (type << 16) + config; + YYSTYPE *yylval = parse_events_get_lval(scanner); + + yylval->num = (type << 16) + config; return PE_VALUE_SYM; } -static int term(int type) +static int term(yyscan_t scanner, int type) { - parse_events_lval.num = type; + YYSTYPE *yylval = parse_events_get_lval(scanner); + + yylval->num = type; return PE_TERM; } @@ -61,25 +79,25 @@ modifier_event [ukhpGH]{1,8} modifier_bp [rwx] %% -cpu-cycles|cycles { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); } -stalled-cycles-frontend|idle-cycles-frontend { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } -stalled-cycles-backend|idle-cycles-backend { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } -instructions { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); } -cache-references { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES); } -cache-misses { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); } -branch-instructions|branches { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } -branch-misses { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); } -bus-cycles { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES); } -ref-cycles { return sym(PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES); } -cpu-clock { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK); } -task-clock { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); } -page-faults|faults { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS); } -minor-faults { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN); } -major-faults { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } -context-switches|cs { return sym(PERF_TYPE_SOFTWARE, 
PERF_COUNT_SW_CONTEXT_SWITCHES); } -cpu-migrations|migrations { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS); } -alignment-faults { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } -emulation-faults { return sym(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } +cpu-cycles|cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES); } +cpu-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK); } +task-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); } +page-faults|faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS); } +minor-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN); } +major-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } +context-switches|cs { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); } +cpu-migrations|migrations { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS); } +alignment-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } +emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } L1-dcache|l1-d|l1d|L1-data | L1-icache|l1-i|l1i|L1-instruction | @@ -87,14 +105,14 @@ LLC|L2 | dTLB|d-tlb|Data-TLB | iTLB|i-tlb|Instruction-TLB | branch|branches|bpu|btb|bpc | -node { return str(PE_NAME_CACHE_TYPE); } +node { return str(yyscanner, PE_NAME_CACHE_TYPE); } load|loads|read | store|stores|write | prefetch|prefetches | speculative-read|speculative-load | refs|Reference|ops|access | -misses|miss { return str(PE_NAME_CACHE_OP_RESULT); } +misses|miss { return str(yyscanner, PE_NAME_CACHE_OP_RESULT); } /* * These are event config hardcoded term names to be specified @@ -102,20 +120,20 @@ misses|miss { return str(PE_NAME_CACHE_OP_RESULT); } * so we can put them here directly. In case the we have a conflict * in future, this needs to go into '//' condition block. 
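 * For example, 'config' and 'period' in 'cpu/config=0x1,period=10000/'
 * are matched by the rules below and returned as PE_TERM tokens.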
*/ -config { return term(PARSE_EVENTS__TERM_TYPE_CONFIG); } -config1 { return term(PARSE_EVENTS__TERM_TYPE_CONFIG1); } -config2 { return term(PARSE_EVENTS__TERM_TYPE_CONFIG2); } -name { return term(PARSE_EVENTS__TERM_TYPE_NAME); } -period { return term(PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); } -branch_type { return term(PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); } +config { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); } +config1 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); } +config2 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); } +name { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); } +period { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); } +branch_type { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); } mem: { BEGIN(mem); return PE_PREFIX_MEM; } -r{num_raw_hex} { return raw(); } -{num_dec} { return value(10); } -{num_hex} { return value(16); } +r{num_raw_hex} { return raw(yyscanner); } +{num_dec} { return value(yyscanner, 10); } +{num_hex} { return value(yyscanner, 16); } -{modifier_event} { return str(PE_MODIFIER_EVENT); } -{name} { return str(PE_NAME); } +{modifier_event} { return str(yyscanner, PE_MODIFIER_EVENT); } +{name} { return str(yyscanner, PE_NAME); } "/" { return '/'; } - { return '-'; } , { return ','; } @@ -123,17 +141,17 @@ r{num_raw_hex} { return raw(); } = { return '='; } { -{modifier_bp} { return str(PE_MODIFIER_BP); } +{modifier_bp} { return str(yyscanner, PE_MODIFIER_BP); } : { return ':'; } -{num_dec} { return value(10); } -{num_hex} { return value(16); } +{num_dec} { return value(yyscanner, 10); } +{num_hex} { return value(yyscanner, 16); } /* * We need to separate 'mem:' scanner part, in order to get specific * modifier bits parsed out. Otherwise we would need to handle PE_NAME * and we'd need to parse it manually. During the escape from * state we need to put the escaping char back, so we dont miss it. */ -. { unput(*parse_events_text); BEGIN(INITIAL); } +. { unput(*yytext); BEGIN(INITIAL); } /* * We destroy the scanner after reaching EOF, * but anyway just to be sure get back to INIT state. */ %% -int parse_events_wrap(void) +int parse_events_wrap(void *scanner __used) { return 1; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index e533bf72ba9c..2a93d5c8ccda 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -1,6 +1,8 @@ - +%pure-parser %name-prefix "parse_events_" %parse-param {void *_data} +%parse-param {void *scanner} +%lex-param {void* scanner} %{ @@ -11,8 +13,9 @@ #include "types.h" #include "util.h" #include "parse-events.h" +#include "parse-events-bison.h" -extern int parse_events_lex (void); +extern int parse_events_lex (YYSTYPE* lvalp, void* scanner); #define ABORT_ON(val) \ do { \ @@ -286,7 +289,7 @@ sep_slash_dc: '/' | ':' | %% -void parse_events_error(void *data __used, +void parse_events_error(void *data __used, void *scanner __used, char const *msg __used) { } From 90e2b22dee908c13df256140a0d6527e3e8ea3f4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Jun 2012 14:31:40 +0800 Subject: [PATCH 108/612] perf/tool: Add support to reuse event grammar to parse out terms We want to reuse the event grammar for parsing aliased terms. The obvious reason is that we don't need to add new code when there's already support for this in the event grammar. We do this by adding term and event start entries into the event parse grammar.
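As a minimal illustration of the start-token trick (a sketch with invented names, not the actual perf sources), the scanner carries a pending start token which its lexer returns exactly once, before scanning any real input:

/* Sketch only; invented names, not the perf sources. */
enum { TOY_START_EVENTS = 258, TOY_START_TERMS = 259 };

struct toy_scanner {
	int pending_start;	/* set at init time, consumed by the first lex call */
	const char *pos;	/* remaining input text */
};

static int toy_lex(struct toy_scanner *s)
{
	if (s->pending_start) {
		int tok = s->pending_start;

		s->pending_start = 0;	/* returned only once */
		return tok;
	}
	/* ...normal tokenization of s->pos continues here... */
	return 0;
}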
The grammar forks at the beginning based on the starting token, which is supplied via the bison interface to the lexer. The lexer then returns the starting token as the first token, thus making the grammar switch accordingly. Currently two starting tokens/grammars are supported: PE_START_TERMS and PE_START_EVENTS. The PE_START_TERMS-related grammar uses the 'event_config' part of the grammar for term parsing. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-12-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 28 +++++++++++++++++++++++++--- tools/perf/util/parse-events.h | 5 +++++ tools/perf/util/parse-events.l | 13 +++++++++++++ tools/perf/util/parse-events.y | 12 ++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index ca8665e19c0d..d002170adb3f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -12,6 +12,7 @@ #include "header.h" #include "debugfs.h" #include "parse-events-bison.h" +#define YY_EXTRA_TYPE int #include "parse-events-flex.h" #include "pmu.h" @@ -788,13 +789,13 @@ int parse_events_modifier(struct list_head *list, char *str) return 0; } -static int parse_events__scanner(const char *str, void *data) +static int parse_events__scanner(const char *str, void *data, int start_token) { YY_BUFFER_STATE buffer; void *scanner; int ret; - ret = parse_events_lex_init(&scanner); + ret = parse_events_lex_init_extra(start_token, &scanner); if (ret) return ret; @@ -811,6 +812,27 @@ static int parse_events__scanner(const char *str, void *data) return ret; } +/* + * parse event config string, return a list of event terms. + */ +int parse_events_terms(struct list_head *terms, const char *str) +{ + struct parse_events_data__terms data = { + .terms = NULL, + }; + int ret; + + ret = parse_events__scanner(str, &data, PE_START_TERMS); + if (!ret) { + list_splice(data.terms, terms); + free(data.terms); + return 0; + } + + parse_events__free_terms(data.terms); + return ret; +} + int parse_events(struct perf_evlist *evlist, const char *str, int unset __used) { struct parse_events_data__events data = { @@ -819,7 +841,7 @@ int parse_events(struct perf_evlist *evlist, const char *str, int unset __used) }; int ret; - ret = parse_events__scanner(str, &data); + ret = parse_events__scanner(str, &data, PE_START_EVENTS); if (!ret) { int entries = data.idx - evlist->nr_entries; perf_evlist__splice_list_tail(evlist, &data.list, entries); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index fa2b19b862e2..9896edadbe62 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -33,6 +33,7 @@ extern int parse_events_option(const struct option *opt, const char *str, int unset); extern int parse_events(struct perf_evlist *evlist, const char *str, int unset); +extern int parse_events_terms(struct list_head *terms, const char *str); extern int parse_filter(const struct option *opt, const char *str, int unset); #define EVENTS_HELP_MAX (128*1024) @@ -68,6 +69,10 @@ struct parse_events_data__events { int idx; }; +struct parse_events_data__terms { + struct list_head *terms; +}; + int parse_events__is_hardcoded_term(struct parse_events__term *term); int parse_events__term_num(struct parse_events__term **_term, int type_term, char *config, long num); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 329794eea711..488362e14133 
100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -79,6 +79,19 @@ modifier_event [ukhpGH]{1,8} modifier_bp [rwx] %% + +%{ + { + int start_token; + + start_token = (int) parse_events_get_extra(yyscanner); + if (start_token) { + parse_events_set_extra(NULL, yyscanner); + return start_token; + } + } +%} + cpu-cycles|cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); } stalled-cycles-frontend|idle-cycles-frontend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } stalled-cycles-backend|idle-cycles-backend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 2a93d5c8ccda..9525c455d27f 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -25,6 +25,7 @@ do { \ %} +%token PE_START_EVENTS PE_START_TERMS %token PE_VALUE PE_VALUE_SYM PE_RAW PE_TERM %token PE_NAME %token PE_MODIFIER_EVENT PE_MODIFIER_BP @@ -60,6 +61,11 @@ do { \ } %% +start: +PE_START_EVENTS events +| +PE_START_TERMS terms + events: events ',' event | event @@ -209,6 +215,12 @@ PE_RAW $$ = list; } +terms: event_config +{ + struct parse_events_data__terms *data = _data; + data->terms = $1; +} + event_config: event_config ',' event_term { From a6146d5040cce560f700221158d77dd335eed332 Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Fri, 15 Jun 2012 14:31:41 +0800 Subject: [PATCH 109/612] perf/tool: Add PMU event alias support Add support for specifying an alias term within the event description. The definition of a pmu event alias is located at: ${sysfs_mount}/bus/event_source/devices/${pmu}/events/ Each file in the 'events' directory defines an event alias. Its contents are like: config=1,config1=2 Using a pmu event alias, an event can now be specified like: uncore/CLOCKTICKS/ or uncore/event=CLOCKTICKS/ Signed-off-by: Zheng Yan [ Cleaned it up. 
] Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-13-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 10 ++ tools/perf/util/parse-events.h | 2 + tools/perf/util/pmu.c | 166 +++++++++++++++++++++++++++++++++ tools/perf/util/pmu.h | 11 ++- 4 files changed, 188 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d002170adb3f..3339424cc421 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -701,6 +701,9 @@ int parse_events_add_pmu(struct list_head **list, int *idx, memset(&attr, 0, sizeof(attr)); + if (perf_pmu__check_alias(pmu, head_config)) + return -EINVAL; + /* * Configure hardcoded terms first, no need to check * return value when called with fail == 0 ;) @@ -1143,6 +1146,13 @@ int parse_events__term_str(struct parse_events__term **term, config, str, 0); } +int parse_events__term_clone(struct parse_events__term **new, + struct parse_events__term *term) +{ + return new_term(new, term->type_val, term->type_term, term->config, + term->val.str, term->val.num); +} + void parse_events__free_terms(struct list_head *terms) { struct parse_events__term *term, *h; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 9896edadbe62..a2c71684f6a4 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -78,6 +78,8 @@ int parse_events__term_num(struct parse_events__term **_term, int type_term, char *config, long num); int parse_events__term_str(struct parse_events__term **_term, int type_term, char *config, char *str); +int parse_events__term_clone(struct parse_events__term **new, + struct parse_events__term *term); void parse_events__free_terms(struct list_head *terms); int parse_events_modifier(struct list_head *list, char *str); int parse_events_add_tracepoint(struct list_head **list, int *idx, diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index a119a5371699..74d0948ec368 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -80,6 +80,114 @@ static int pmu_format(char *name, struct list_head *format) return 0; } +static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) +{ + struct perf_pmu__alias *alias; + char buf[256]; + int ret; + + ret = fread(buf, 1, sizeof(buf), file); + if (ret == 0) + return -EINVAL; + buf[ret] = 0; + + alias = malloc(sizeof(*alias)); + if (!alias) + return -ENOMEM; + + INIT_LIST_HEAD(&alias->terms); + ret = parse_events_terms(&alias->terms, buf); + if (ret) { + free(alias); + return ret; + } + + alias->name = strdup(name); + list_add_tail(&alias->list, list); + return 0; +} + +/* + * Process all the sysfs attributes located under the directory + * specified in 'dir' parameter. 
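+ * Each attribute file is expected to hold a single alias definition,
+ * e.g. 'config=1,config1=2'.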
+ */ +static int pmu_aliases_parse(char *dir, struct list_head *head) +{ + struct dirent *evt_ent; + DIR *event_dir; + int ret = 0; + + event_dir = opendir(dir); + if (!event_dir) + return -EINVAL; + + while (!ret && (evt_ent = readdir(event_dir))) { + char path[PATH_MAX]; + char *name = evt_ent->d_name; + FILE *file; + + if (!strcmp(name, ".") || !strcmp(name, "..")) + continue; + + snprintf(path, PATH_MAX, "%s/%s", dir, name); + + ret = -EINVAL; + file = fopen(path, "r"); + if (!file) + break; + ret = perf_pmu__new_alias(head, name, file); + fclose(file); + } + + closedir(event_dir); + return ret; +} + +/* + * Reading the pmu event aliases definition, which should be located at: + * /sys/bus/event_source/devices//events as sysfs group attributes. + */ +static int pmu_aliases(char *name, struct list_head *head) +{ + struct stat st; + char path[PATH_MAX]; + const char *sysfs; + + sysfs = sysfs_find_mountpoint(); + if (!sysfs) + return -1; + + snprintf(path, PATH_MAX, + "%s/bus/event_source/devices/%s/events", sysfs, name); + + if (stat(path, &st) < 0) + return -1; + + if (pmu_aliases_parse(path, head)) + return -1; + + return 0; +} + +static int pmu_alias_terms(struct perf_pmu__alias *alias, + struct list_head *terms) +{ + struct parse_events__term *term, *clone; + LIST_HEAD(list); + int ret; + + list_for_each_entry(term, &alias->terms, list) { + ret = parse_events__term_clone(&clone, term); + if (ret) { + parse_events__free_terms(&list); + return ret; + } + list_add_tail(&clone->list, &list); + } + list_splice(&list, terms); + return 0; +} + /* * Reading/parsing the default pmu type value, which should be * located at: @@ -118,6 +226,7 @@ static struct perf_pmu *pmu_lookup(char *name) { struct perf_pmu *pmu; LIST_HEAD(format); + LIST_HEAD(aliases); __u32 type; /* @@ -135,8 +244,12 @@ static struct perf_pmu *pmu_lookup(char *name) if (!pmu) return NULL; + pmu_aliases(name, &aliases); + INIT_LIST_HEAD(&pmu->format); + INIT_LIST_HEAD(&pmu->aliases); list_splice(&format, &pmu->format); + list_splice(&aliases, &pmu->aliases); pmu->name = strdup(name); pmu->type = type; return pmu; @@ -279,6 +392,59 @@ int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, return pmu_config(&pmu->format, attr, head_terms); } +static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu, + struct parse_events__term *term) +{ + struct perf_pmu__alias *alias; + char *name; + + if (parse_events__is_hardcoded_term(term)) + return NULL; + + if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) { + if (term->val.num != 1) + return NULL; + if (pmu_find_format(&pmu->format, term->config)) + return NULL; + name = term->config; + } else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) { + if (strcasecmp(term->config, "event")) + return NULL; + name = term->val.str; + } else { + return NULL; + } + + list_for_each_entry(alias, &pmu->aliases, list) { + if (!strcasecmp(alias->name, name)) + return alias; + } + return NULL; +} + +/* + * Find alias in the terms list and replace it with the terms + * defined for the alias + */ +int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) +{ + struct parse_events__term *term, *h; + struct perf_pmu__alias *alias; + int ret; + + list_for_each_entry_safe(term, h, head_terms, list) { + alias = pmu_find_alias(pmu, term); + if (!alias) + continue; + ret = pmu_alias_terms(alias, &term->list); + if (ret) + return ret; + list_del(&term->list); + free(term); + } + return 0; +} + int perf_pmu__new_format(struct list_head *list, char *name, int config, 
unsigned long *bits) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 68c0db965e1f..535f2c5258ab 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -19,17 +19,26 @@ struct perf_pmu__format { struct list_head list; }; +struct perf_pmu__alias { + char *name; + struct list_head terms; + struct list_head list; +}; + struct perf_pmu { char *name; __u32 type; struct list_head format; + struct list_head aliases; struct list_head list; }; struct perf_pmu *perf_pmu__find(char *name); int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, struct list_head *head_terms); - +int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms); +struct list_head *perf_pmu__alias(struct perf_pmu *pmu, + struct list_head *head_terms); int perf_pmu_wrap(void); void perf_pmu_error(struct list_head *list, char *name, char const *msg); From 4429392e1484319df4209d41a98244c76d0caf56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Jun 2012 14:31:42 +0800 Subject: [PATCH 110/612] perf/tool: Add automated test for pure terms parsing Add an automated test for parsing terms out of the event grammar. Also slightly change the current event parsing test functions to follow a more generic namespace. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-14-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events-test.c | 122 ++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index 76b98e2a587d..af1039cdb853 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -430,6 +430,49 @@ static int test__checkevent_pmu_name(struct perf_evlist *evlist) return 0; } +static int test__checkterms_simple(struct list_head *terms) +{ + struct parse_events__term *term; + + /* config=10 */ + term = list_entry(terms->next, struct parse_events__term, list); + TEST_ASSERT_VAL("wrong type term", + term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG); + TEST_ASSERT_VAL("wrong type val", + term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); + TEST_ASSERT_VAL("wrong val", term->val.num == 10); + TEST_ASSERT_VAL("wrong config", !term->config); + + /* config1 */ + term = list_entry(term->list.next, struct parse_events__term, list); + TEST_ASSERT_VAL("wrong type term", + term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG1); + TEST_ASSERT_VAL("wrong type val", + term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); + TEST_ASSERT_VAL("wrong val", term->val.num == 1); + TEST_ASSERT_VAL("wrong config", !term->config); + + /* config2=3 */ + term = list_entry(term->list.next, struct parse_events__term, list); + TEST_ASSERT_VAL("wrong type term", + term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG2); + TEST_ASSERT_VAL("wrong type val", + term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); + TEST_ASSERT_VAL("wrong val", term->val.num == 3); + TEST_ASSERT_VAL("wrong config", !term->config); + + /* umask=1*/ + term = list_entry(term->list.next, struct parse_events__term, list); + TEST_ASSERT_VAL("wrong type term", + term->type_term == PARSE_EVENTS__TERM_TYPE_USER); + TEST_ASSERT_VAL("wrong type val", + term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); + TEST_ASSERT_VAL("wrong val", term->val.num == 1); + TEST_ASSERT_VAL("wrong config", !strcmp(term->config, "umask")); + + return 0; +} + struct test__event_st { const char *name; __u32 type; @@ -559,7 +602,23 @@ static struct 
test__event_st test__events_pmu[] = { #define TEST__EVENTS_PMU_CNT (sizeof(test__events_pmu) / \ sizeof(struct test__event_st)) -static int test(struct test__event_st *e) +struct test__term { + const char *str; + __u32 type; + int (*check)(struct list_head *terms); +}; + +static struct test__term test__terms[] = { + [0] = { + .str = "config=10,config1,config2=3,umask=1", + .check = test__checkterms_simple, + }, +}; + +#define TEST__TERMS_CNT (sizeof(test__terms) / \ + sizeof(struct test__term)) + +static int test_event(struct test__event_st *e) { struct perf_evlist *evlist; int ret; @@ -590,7 +649,48 @@ static int test_events(struct test__event_st *events, unsigned cnt) struct test__event_st *e = &events[i]; pr_debug("running test %d '%s'\n", i, e->name); - ret = test(e); + ret = test_event(e); + if (ret) + break; + } + + return ret; +} + +static int test_term(struct test__term *t) +{ + struct list_head *terms; + int ret; + + terms = malloc(sizeof(*terms)); + if (!terms) + return -ENOMEM; + + INIT_LIST_HEAD(terms); + + ret = parse_events_terms(terms, t->str); + if (ret) { + pr_debug("failed to parse terms '%s', err %d\n", + t->str , ret); + return ret; + } + + ret = t->check(terms); + parse_events__free_terms(terms); + + return ret; +} + +static int test_terms(struct test__term *terms, unsigned cnt) +{ + int ret = 0; + unsigned i; + + for (i = 0; i < cnt; i++) { + struct test__term *t = &terms[i]; + + pr_debug("running test %d '%s'\n", i, t->str); + ret = test_term(t); if (ret) break; } @@ -617,9 +717,21 @@ int parse_events__test(void) { int ret; - ret = test_events(test__events, TEST__EVENTS_CNT); - if (!ret && test_pmu()) - ret = test_events(test__events_pmu, TEST__EVENTS_PMU_CNT); + do { + ret = test_events(test__events, TEST__EVENTS_CNT); + if (ret) + break; + + if (test_pmu()) { + ret = test_events(test__events_pmu, + TEST__EVENTS_PMU_CNT); + if (ret) + break; + } + + ret = test_terms(test__terms, TEST__TERMS_CNT); + + } while (0); return ret; } From 2992c542fcd40777ed253f57362c65711fb8acaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 16 Jun 2012 14:45:49 +0200 Subject: [PATCH 111/612] perf/x86: Lowercase uncore PMU event names Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ucnds8gkve4x3s4biuukyph3@git.kernel.org [ Trivial build fix ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 51 ++++++------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index d34f68bf990b..28a8413ca199 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -179,22 +179,17 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { }; static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), - /* read */ - INTEL_UNCORE_EVENT_DESC(CAS_COUNT_RD, "event=0x4,umask=0x3"), - /* write */ - INTEL_UNCORE_EVENT_DESC(CAS_COUNT_WR, "event=0x4,umask=0xc"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), { /* end: all zeroes */ }, }; static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "event=0x14"), - /* outgoing data+nondata flits */ - INTEL_UNCORE_EVENT_DESC(TxL_FLITS_ACTIVE, "event=0x0,umask=0x6"), - /* DRS data received */ - 
INTEL_UNCORE_EVENT_DESC(DRS_DATA, "event=0x2,umask=0x8"), - /* NCB data received */ - INTEL_UNCORE_EVENT_DESC(NCB_DATA, "event=0x3,umask=0x4"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), { /* end: all zeroes */ }, }; @@ -621,29 +616,15 @@ static struct attribute_group nhm_uncore_format_group = { }; static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), - /* full cache line writes to DRAM */ - INTEL_UNCORE_EVENT_DESC(QMC_WRITES_FULL_ANY, "event=0x2f,umask=0xf"), - /* Quickpath Memory Controller normal priority read requests */ - INTEL_UNCORE_EVENT_DESC(QMC_NORMAL_READS_ANY, "event=0x2c,umask=0xf"), - /* Quickpath Home Logic read requests from the IOH */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_READS, - "event=0x20,umask=0x1"), - /* Quickpath Home Logic write requests from the IOH */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_WRITES, - "event=0x20,umask=0x2"), - /* Quickpath Home Logic read requests from a remote socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_READS, - "event=0x20,umask=0x4"), - /* Quickpath Home Logic write requests from a remote socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_WRITES, - "event=0x20,umask=0x8"), - /* Quickpath Home Logic read requests from the local socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_READS, - "event=0x20,umask=0x10"), - /* Quickpath Home Logic write requests from the local socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_WRITES, - "event=0x20,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), { /* end: all zeroes */ }, }; From 9e0304e388ef0e7495b279ce737c24ad56d20b5d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 18 Jun 2012 13:11:21 -0400 Subject: [PATCH 112/612] arch/tile: big-endian: properly bswap instruction bundles when backtracing Instruction bundles are always little-endian, even when running in big-endian mode. I missed this internal bug fix when cherry-picking the big-endian code to return to the community. Signed-off-by: Chris Metcalf --- arch/tile/kernel/backtrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/tile/kernel/backtrace.c b/arch/tile/kernel/backtrace.c index 9092ce8aa6b4..f8b74ca83b92 100644 --- a/arch/tile/kernel/backtrace.c +++ b/arch/tile/kernel/backtrace.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -336,8 +337,12 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location, bytes_to_prefetch / sizeof(tile_bundle_bits); } - /* Decode the next bundle. */ - bundle.bits = prefetched_bundles[next_bundle++]; + /* + * Decode the next bundle. 
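+ * The bundle is fetched as a raw 64-bit word, so its byte order
+ * follows the CPU byte order unless it is swapped first.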
+ * TILE always stores instruction bundles in little-endian + * mode, even when the chip is running in big-endian mode. + */ + bundle.bits = le64_to_cpu(prefetched_bundles[next_bundle++]); bundle.num_insns = parse_insn_tile(bundle.bits, pc, bundle.insns); num_info_ops = bt_get_info_ops(&bundle, info_operands); From 4d146ad7388a9cca2a890d7449aeb767565068d7 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Sat, 9 Jun 2012 20:00:19 -0300 Subject: [PATCH 113/612] [media] pms: fix build error in pms_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a compiler breakage introduced by commit 8173090acb33: drivers/media/video/pms.c: In function ‘pms_probe’: drivers/media/video/pms.c:1047:2: error: implicit declaration of function ‘kzalloc’ [-Werror=implicit-function-declaration] drivers/media/video/pms.c:1116:2: error: implicit declaration of function ‘kfree’ [-Werror=implicit-function-declaration] Signed-off-by: Fengguang Wu Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/pms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c index af2d9086d7e8..77f9c92186f4 100644 --- a/drivers/media/video/pms.c +++ b/drivers/media/video/pms.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include From 409a8be61560c5875816da6dcb0c575d78145a5c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 10:33:24 -0300 Subject: [PATCH 114/612] perf tools: Add sort by src line/number Using addr2line for now, requires debuginfo, needs more work to support detached debuginfo, aka foo-debuginfo packages. Example: [root@sandy ~]# perf record -a sleep 3 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.555 MB perf.data (~24236 samples) ] [root@sandy ~]# perf report -s dso,srcline 2>&1 | grep -v ^# | head -5 22.41% [kernel.kallsyms] /home/git/linux/drivers/idle/intel_idle.c:280 4.79% [kernel.kallsyms] /home/git/linux/drivers/cpuidle/cpuidle.c:148 4.78% [kernel.kallsyms] /home/git/linux/arch/x86/include/asm/atomic64_64.h:121 4.49% [kernel.kallsyms] /home/git/linux/kernel/sched/core.c:1690 4.30% [kernel.kallsyms] /home/git/linux/include/linux/seqlock.h:90 [root@sandy ~]# [root@sandy ~]# perf top -U -s dso,symbol,srcline Samples: 1K of event 'cycles', Event count (approx.): 589617389 18.66% [kernel] [k] copy_user_generic_unrolled /home/git/linux/arch/x86/lib/copy_user_64.S:143 7.83% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:39 6.59% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:38 3.66% [kernel] [k] page_fault /home/git/linux/arch/x86/kernel/entry_64.S:1379 3.25% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:40 3.12% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:37 2.74% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:36 2.39% [kernel] [k] clear_page /home/git/linux/arch/x86/lib/clear_page_64.S:43 2.12% [kernel] [k] ioread32 /home/git/linux/lib/iomap.c:90 1.51% [kernel] [k] copy_user_generic_unrolled /home/git/linux/arch/x86/lib/copy_user_64.S:144 1.19% [kernel] [k] copy_user_generic_unrolled /home/git/linux/arch/x86/lib/copy_user_64.S:154 Suggested-by: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-pdmqbng9twz06jzkbgtuwbp8@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- 
tools/perf/Documentation/perf-report.txt | 2 +- tools/perf/Documentation/perf-top.txt | 2 +- tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 49 ++++++++++++++++++++++++ tools/perf/util/sort.h | 2 + 5 files changed, 54 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 2d89f02719b5..495210a612c4 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -57,7 +57,7 @@ OPTIONS -s:: --sort=:: - Sort by key(s): pid, comm, dso, symbol, parent. + Sort by key(s): pid, comm, dso, symbol, parent, srcline. -p:: --parent=:: diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 4a5680cb242e..5b80d84d6b4a 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -112,7 +112,7 @@ Default is to monitor all CPUS. -s:: --sort:: - Sort by key(s): pid, comm, dso, symbol, parent + Sort by key(s): pid, comm, dso, symbol, parent, srcline. -n:: --show-nr-samples:: diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 34bb556d6219..0b096c27a419 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -47,6 +47,7 @@ enum hist_column { HISTC_SYMBOL_TO, HISTC_DSO_FROM, HISTC_DSO_TO, + HISTC_SRCLINE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index a27237430c5f..0f5a0a496bc4 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -241,6 +241,54 @@ struct sort_entry sort_sym = { .se_width_idx = HISTC_SYMBOL, }; +/* --sort srcline */ + +static int64_t +sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return (int64_t)(right->ip - left->ip); +} + +static int hist_entry__srcline_snprintf(struct hist_entry *self, char *bf, + size_t size, unsigned int width __used) +{ + FILE *fp; + char cmd[PATH_MAX + 2], *path = self->srcline, *nl; + size_t line_len; + + if (path != NULL) + goto out_path; + + snprintf(cmd, sizeof(cmd), "addr2line -e %s %016" PRIx64, + self->ms.map->dso->long_name, self->ip); + fp = popen(cmd, "r"); + if (!fp) + goto out_ip; + + if (getline(&path, &line_len, fp) < 0 || !line_len) + goto out_ip; + fclose(fp); + self->srcline = strdup(path); + if (self->srcline == NULL) + goto out_ip; + + nl = strchr(self->srcline, '\n'); + if (nl != NULL) + *nl = '\0'; + path = self->srcline; +out_path: + return repsep_snprintf(bf, size, "%s", path); +out_ip: + return repsep_snprintf(bf, size, "%-#*llx", BITS_PER_LONG / 4, self->ip); +} + +struct sort_entry sort_srcline = { + .se_header = "Source:Line", + .se_cmp = sort__srcline_cmp, + .se_snprintf = hist_entry__srcline_snprintf, + .se_width_idx = HISTC_SRCLINE, +}; + /* --sort parent */ static int64_t @@ -439,6 +487,7 @@ static struct sort_dimension sort_dimensions[] = { DIM(SORT_PARENT, "parent", sort_parent), DIM(SORT_CPU, "cpu", sort_cpu), DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), + DIM(SORT_SRCLINE, "srcline", sort_srcline), }; int sort_dimension__add(const char *tok) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 472aa5a63a58..e724b26acd51 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -71,6 +71,7 @@ struct hist_entry { char level; bool used; u8 filtered; + char *srcline; struct symbol *parent; union { unsigned long position; @@ -93,6 +94,7 @@ enum sort_type { SORT_SYM_FROM, SORT_SYM_TO, SORT_MISPREDICT, + SORT_SRCLINE, }; /* From ba47a142d9f9b84e0464a11b7a067e5ad95c5d4b Mon Sep 17 00:00:00 
2001 From: Namhyung Kim Date: Tue, 29 May 2012 13:22:58 +0900 Subject: [PATCH 115/612] perf ui: Introduce struct perf_error_ops The struct perf_error_ops is for flexible error logging. We can register appropriate functions based on front-end. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338265382-6872-4-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 3 + tools/perf/ui/gtk/util.c | 20 +++ tools/perf/ui/tui/setup.c | 6 + tools/perf/ui/tui/util.c | 243 ++++++++++++++++++++++++++++++ tools/perf/ui/util.c | 301 +++++++++----------------------------- tools/perf/ui/util.h | 9 +- tools/perf/util/debug.c | 2 +- tools/perf/util/debug.h | 23 ++- 8 files changed, 369 insertions(+), 238 deletions(-) create mode 100644 tools/perf/ui/gtk/util.c create mode 100644 tools/perf/ui/tui/util.c diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 0eee64cfe9a0..c0ee917ae8ab 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -503,6 +503,7 @@ else LIB_OBJS += $(OUTPUT)ui/progress.o LIB_OBJS += $(OUTPUT)ui/util.o LIB_OBJS += $(OUTPUT)ui/tui/setup.o + LIB_OBJS += $(OUTPUT)ui/tui/util.o LIB_H += ui/browser.h LIB_H += ui/browsers/map.h LIB_H += ui/helpline.h @@ -526,9 +527,11 @@ else EXTLIBS += $(shell pkg-config --libs gtk+-2.0) LIB_OBJS += $(OUTPUT)ui/gtk/browser.o LIB_OBJS += $(OUTPUT)ui/gtk/setup.o + LIB_OBJS += $(OUTPUT)ui/gtk/util.o # Make sure that it'd be included only once. ifneq ($(findstring -DNO_NEWT_SUPPORT,$(BASIC_CFLAGS)),) LIB_OBJS += $(OUTPUT)ui/setup.o + LIB_OBJS += $(OUTPUT)ui/util.o endif endif endif diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c new file mode 100644 index 000000000000..a727fe394e91 --- /dev/null +++ b/tools/perf/ui/gtk/util.c @@ -0,0 +1,20 @@ +#include "../util.h" +#include "../../util/debug.h" +#include "gtk.h" + + +/* + * FIXME: Functions below should be implemented properly. + * For now, just add stubs for NO_NEWT=1 build. 
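+ * The stubs only keep such builds linking; they intentionally do
+ * nothing.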
+ */ +#ifdef NO_NEWT_SUPPORT +int ui_helpline__show_help(const char *format __used, va_list ap __used) +{ + return 0; +} + +void ui_progress__update(u64 curr __used, u64 total __used, + const char *title __used) +{ +} +#endif diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c index d33e943ac434..e813c1d17346 100644 --- a/tools/perf/ui/tui/setup.c +++ b/tools/perf/ui/tui/setup.c @@ -15,6 +15,8 @@ pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER; static volatile int ui__need_resize; +extern struct perf_error_ops perf_tui_eops; + void ui__refresh_dimensions(bool force) { if (force || ui__need_resize) { @@ -122,6 +124,8 @@ int ui__init(void) signal(SIGINT, ui__signal); signal(SIGQUIT, ui__signal); signal(SIGTERM, ui__signal); + + perf_error__register(&perf_tui_eops); out: return err; } @@ -137,4 +141,6 @@ void ui__exit(bool wait_for_ok) SLsmg_refresh(); SLsmg_reset_smg(); SLang_reset_tty(); + + perf_error__unregister(&perf_tui_eops); } diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c new file mode 100644 index 000000000000..092902e30cee --- /dev/null +++ b/tools/perf/ui/tui/util.c @@ -0,0 +1,243 @@ +#include "../../util/util.h" +#include +#include +#include +#include + +#include "../../util/cache.h" +#include "../../util/debug.h" +#include "../browser.h" +#include "../keysyms.h" +#include "../helpline.h" +#include "../ui.h" +#include "../util.h" +#include "../libslang.h" + +static void ui_browser__argv_write(struct ui_browser *browser, + void *entry, int row) +{ + char **arg = entry; + bool current_entry = ui_browser__is_current_entry(browser, row); + + ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED : + HE_COLORSET_NORMAL); + slsmg_write_nstring(*arg, browser->width); +} + +static int popup_menu__run(struct ui_browser *menu) +{ + int key; + + if (ui_browser__show(menu, " ", "ESC: exit, ENTER|->: Select option") < 0) + return -1; + + while (1) { + key = ui_browser__run(menu, 0); + + switch (key) { + case K_RIGHT: + case K_ENTER: + key = menu->index; + break; + case K_LEFT: + case K_ESC: + case 'q': + case CTRL('c'): + key = -1; + break; + default: + continue; + } + + break; + } + + ui_browser__hide(menu); + return key; +} + +int ui__popup_menu(int argc, char * const argv[]) +{ + struct ui_browser menu = { + .entries = (void *)argv, + .refresh = ui_browser__argv_refresh, + .seek = ui_browser__argv_seek, + .write = ui_browser__argv_write, + .nr_entries = argc, + }; + + return popup_menu__run(&menu); +} + +int ui_browser__input_window(const char *title, const char *text, char *input, + const char *exit_msg, int delay_secs) +{ + int x, y, len, key; + int max_len = 60, nr_lines = 0; + static char buf[50]; + const char *t; + + t = text; + while (1) { + const char *sep = strchr(t, '\n'); + + if (sep == NULL) + sep = strchr(t, '\0'); + len = sep - t; + if (max_len < len) + max_len = len; + ++nr_lines; + if (*sep == '\0') + break; + t = sep + 1; + } + + max_len += 2; + nr_lines += 8; + y = SLtt_Screen_Rows / 2 - nr_lines / 2; + x = SLtt_Screen_Cols / 2 - max_len / 2; + + SLsmg_set_color(0); + SLsmg_draw_box(y, x++, nr_lines, max_len); + if (title) { + SLsmg_gotorc(y, x + 1); + SLsmg_write_string((char *)title); + } + SLsmg_gotorc(++y, x); + nr_lines -= 7; + max_len -= 2; + SLsmg_write_wrapped_string((unsigned char *)text, y, x, + nr_lines, max_len, 1); + y += nr_lines; + len = 5; + while (len--) { + SLsmg_gotorc(y + len - 1, x); + SLsmg_write_nstring((char *)" ", max_len); + } + SLsmg_draw_box(y++, x + 1, 3, max_len - 2); + + SLsmg_gotorc(y + 3, 
x); + SLsmg_write_nstring((char *)exit_msg, max_len); + SLsmg_refresh(); + + x += 2; + len = 0; + key = ui__getch(delay_secs); + while (key != K_TIMER && key != K_ENTER && key != K_ESC) { + if (key == K_BKSPC) { + if (len == 0) + goto next_key; + SLsmg_gotorc(y, x + --len); + SLsmg_write_char(' '); + } else { + buf[len] = key; + SLsmg_gotorc(y, x + len++); + SLsmg_write_char(key); + } + SLsmg_refresh(); + + /* XXX more graceful overflow handling needed */ + if (len == sizeof(buf) - 1) { + ui_helpline__push("maximum size of symbol name reached!"); + key = K_ENTER; + break; + } +next_key: + key = ui__getch(delay_secs); + } + + buf[len] = '\0'; + strncpy(input, buf, len+1); + return key; +} + +int ui__question_window(const char *title, const char *text, + const char *exit_msg, int delay_secs) +{ + int x, y; + int max_len = 0, nr_lines = 0; + const char *t; + + t = text; + while (1) { + const char *sep = strchr(t, '\n'); + int len; + + if (sep == NULL) + sep = strchr(t, '\0'); + len = sep - t; + if (max_len < len) + max_len = len; + ++nr_lines; + if (*sep == '\0') + break; + t = sep + 1; + } + + max_len += 2; + nr_lines += 4; + y = SLtt_Screen_Rows / 2 - nr_lines / 2, + x = SLtt_Screen_Cols / 2 - max_len / 2; + + SLsmg_set_color(0); + SLsmg_draw_box(y, x++, nr_lines, max_len); + if (title) { + SLsmg_gotorc(y, x + 1); + SLsmg_write_string((char *)title); + } + SLsmg_gotorc(++y, x); + nr_lines -= 2; + max_len -= 2; + SLsmg_write_wrapped_string((unsigned char *)text, y, x, + nr_lines, max_len, 1); + SLsmg_gotorc(y + nr_lines - 2, x); + SLsmg_write_nstring((char *)" ", max_len); + SLsmg_gotorc(y + nr_lines - 1, x); + SLsmg_write_nstring((char *)exit_msg, max_len); + SLsmg_refresh(); + return ui__getch(delay_secs); +} + +int ui__help_window(const char *text) +{ + return ui__question_window("Help", text, "Press any key...", 0); +} + +int ui__dialog_yesno(const char *msg) +{ + return ui__question_window(NULL, msg, "Enter: Yes, ESC: No", 0); +} + +static int __ui__warning(const char *title, const char *format, va_list args) +{ + char *s; + + if (vasprintf(&s, format, args) > 0) { + int key; + + pthread_mutex_lock(&ui__lock); + key = ui__question_window(title, s, "Press any key...", 0); + pthread_mutex_unlock(&ui__lock); + free(s); + return key; + } + + fprintf(stderr, "%s\n", title); + vfprintf(stderr, format, args); + return K_ESC; +} + +static int perf_tui__error(const char *format, va_list args) +{ + return __ui__warning("Error:", format, args); +} + +static int perf_tui__warning(const char *format, va_list args) +{ + return __ui__warning("Warning:", format, args); +} + +struct perf_error_ops perf_tui_eops = { + .error = perf_tui__error, + .warning = perf_tui__warning, +}; diff --git a/tools/perf/ui/util.c b/tools/perf/ui/util.c index ad4374a16bb0..4f989774c8c6 100644 --- a/tools/perf/ui/util.c +++ b/tools/perf/ui/util.c @@ -1,250 +1,85 @@ -#include "../util.h" -#include -#include -#include -#include - -#include "../cache.h" -#include "../debug.h" -#include "browser.h" -#include "keysyms.h" -#include "helpline.h" -#include "ui.h" #include "util.h" -#include "libslang.h" +#include "../debug.h" -static void ui_browser__argv_write(struct ui_browser *browser, - void *entry, int row) + +/* + * Default error logging functions + */ +static int perf_stdio__error(const char *format, va_list args) { - char **arg = entry; - bool current_entry = ui_browser__is_current_entry(browser, row); - - ui_browser__set_color(browser, current_entry ? 
HE_COLORSET_SELECTED : - HE_COLORSET_NORMAL); - slsmg_write_nstring(*arg, browser->width); -} - -static int popup_menu__run(struct ui_browser *menu) -{ - int key; - - if (ui_browser__show(menu, " ", "ESC: exit, ENTER|->: Select option") < 0) - return -1; - - while (1) { - key = ui_browser__run(menu, 0); - - switch (key) { - case K_RIGHT: - case K_ENTER: - key = menu->index; - break; - case K_LEFT: - case K_ESC: - case 'q': - case CTRL('c'): - key = -1; - break; - default: - continue; - } - - break; - } - - ui_browser__hide(menu); - return key; -} - -int ui__popup_menu(int argc, char * const argv[]) -{ - struct ui_browser menu = { - .entries = (void *)argv, - .refresh = ui_browser__argv_refresh, - .seek = ui_browser__argv_seek, - .write = ui_browser__argv_write, - .nr_entries = argc, - }; - - return popup_menu__run(&menu); -} - -int ui_browser__input_window(const char *title, const char *text, char *input, - const char *exit_msg, int delay_secs) -{ - int x, y, len, key; - int max_len = 60, nr_lines = 0; - static char buf[50]; - const char *t; - - t = text; - while (1) { - const char *sep = strchr(t, '\n'); - - if (sep == NULL) - sep = strchr(t, '\0'); - len = sep - t; - if (max_len < len) - max_len = len; - ++nr_lines; - if (*sep == '\0') - break; - t = sep + 1; - } - - max_len += 2; - nr_lines += 8; - y = SLtt_Screen_Rows / 2 - nr_lines / 2; - x = SLtt_Screen_Cols / 2 - max_len / 2; - - SLsmg_set_color(0); - SLsmg_draw_box(y, x++, nr_lines, max_len); - if (title) { - SLsmg_gotorc(y, x + 1); - SLsmg_write_string((char *)title); - } - SLsmg_gotorc(++y, x); - nr_lines -= 7; - max_len -= 2; - SLsmg_write_wrapped_string((unsigned char *)text, y, x, - nr_lines, max_len, 1); - y += nr_lines; - len = 5; - while (len--) { - SLsmg_gotorc(y + len - 1, x); - SLsmg_write_nstring((char *)" ", max_len); - } - SLsmg_draw_box(y++, x + 1, 3, max_len - 2); - - SLsmg_gotorc(y + 3, x); - SLsmg_write_nstring((char *)exit_msg, max_len); - SLsmg_refresh(); - - x += 2; - len = 0; - key = ui__getch(delay_secs); - while (key != K_TIMER && key != K_ENTER && key != K_ESC) { - if (key == K_BKSPC) { - if (len == 0) - goto next_key; - SLsmg_gotorc(y, x + --len); - SLsmg_write_char(' '); - } else { - buf[len] = key; - SLsmg_gotorc(y, x + len++); - SLsmg_write_char(key); - } - SLsmg_refresh(); - - /* XXX more graceful overflow handling needed */ - if (len == sizeof(buf) - 1) { - ui_helpline__push("maximum size of symbol name reached!"); - key = K_ENTER; - break; - } -next_key: - key = ui__getch(delay_secs); - } - - buf[len] = '\0'; - strncpy(input, buf, len+1); - return key; -} - -int ui__question_window(const char *title, const char *text, - const char *exit_msg, int delay_secs) -{ - int x, y; - int max_len = 0, nr_lines = 0; - const char *t; - - t = text; - while (1) { - const char *sep = strchr(t, '\n'); - int len; - - if (sep == NULL) - sep = strchr(t, '\0'); - len = sep - t; - if (max_len < len) - max_len = len; - ++nr_lines; - if (*sep == '\0') - break; - t = sep + 1; - } - - max_len += 2; - nr_lines += 4; - y = SLtt_Screen_Rows / 2 - nr_lines / 2, - x = SLtt_Screen_Cols / 2 - max_len / 2; - - SLsmg_set_color(0); - SLsmg_draw_box(y, x++, nr_lines, max_len); - if (title) { - SLsmg_gotorc(y, x + 1); - SLsmg_write_string((char *)title); - } - SLsmg_gotorc(++y, x); - nr_lines -= 2; - max_len -= 2; - SLsmg_write_wrapped_string((unsigned char *)text, y, x, - nr_lines, max_len, 1); - SLsmg_gotorc(y + nr_lines - 2, x); - SLsmg_write_nstring((char *)" ", max_len); - SLsmg_gotorc(y + nr_lines - 1, x); - 
SLsmg_write_nstring((char *)exit_msg, max_len); - SLsmg_refresh(); - return ui__getch(delay_secs); -} - -int ui__help_window(const char *text) -{ - return ui__question_window("Help", text, "Press any key...", 0); -} - -int ui__dialog_yesno(const char *msg) -{ - return ui__question_window(NULL, msg, "Enter: Yes, ESC: No", 0); -} - -int __ui__warning(const char *title, const char *format, va_list args) -{ - char *s; - - if (use_browser > 0 && vasprintf(&s, format, args) > 0) { - int key; - - pthread_mutex_lock(&ui__lock); - key = ui__question_window(title, s, "Press any key...", 0); - pthread_mutex_unlock(&ui__lock); - free(s); - return key; - } - - fprintf(stderr, "%s:\n", title); + fprintf(stderr, "Error:\n"); vfprintf(stderr, format, args); - return K_ESC; + return 0; +} + +static int perf_stdio__warning(const char *format, va_list args) +{ + fprintf(stderr, "Warning:\n"); + vfprintf(stderr, format, args); + return 0; +} + +static struct perf_error_ops default_eops = +{ + .error = perf_stdio__error, + .warning = perf_stdio__warning, +}; + +static struct perf_error_ops *perf_eops = &default_eops; + + +int ui__error(const char *format, ...) +{ + int ret; + va_list args; + + va_start(args, format); + ret = perf_eops->error(format, args); + va_end(args); + + return ret; } int ui__warning(const char *format, ...) { - int key; + int ret; va_list args; va_start(args, format); - key = __ui__warning("Warning", format, args); + ret = perf_eops->warning(format, args); va_end(args); - return key; + + return ret; } -int ui__error(const char *format, ...) + +/** + * perf_error__register - Register error logging functions + * @eops: The pointer to error logging function struct + * + * Register UI-specific error logging functions. Before calling this, + * other logging functions should be unregistered, if any. + */ +int perf_error__register(struct perf_error_ops *eops) { - int key; - va_list args; + if (perf_eops != &default_eops) + return -1; - va_start(args, format); - key = __ui__warning("Error", format, args); - va_end(args); - return key; + perf_eops = eops; + return 0; +} + +/** + * perf_error__unregister - Unregister error logging functions + * @eops: The pointer to error logging function struct + * + * Unregister already registered error logging functions. + */ +int perf_error__unregister(struct perf_error_ops *eops) +{ + if (perf_eops != eops) + return -1; + + perf_eops = &default_eops; + return 0; } diff --git a/tools/perf/ui/util.h b/tools/perf/ui/util.h index 2d1738bd71c8..361f08c52d37 100644 --- a/tools/perf/ui/util.h +++ b/tools/perf/ui/util.h @@ -9,6 +9,13 @@ int ui__help_window(const char *text); int ui__dialog_yesno(const char *msg); int ui__question_window(const char *title, const char *text, const char *exit_msg, int delay_secs); -int __ui__warning(const char *title, const char *format, va_list args); + +struct perf_error_ops { + int (*error)(const char *format, va_list args); + int (*warning)(const char *format, va_list args); +}; + +int perf_error__register(struct perf_error_ops *eops); +int perf_error__unregister(struct perf_error_ops *eops); #endif /* _PERF_UI_UTIL_H_ */ diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index efb1fce259a4..4dfe0bb3c322 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -47,7 +47,7 @@ int dump_printf(const char *fmt, ...) return ret; } -#ifdef NO_NEWT_SUPPORT +#if defined(NO_NEWT_SUPPORT) && defined(NO_GTK2_SUPPORT) int ui__warning(const char *format, ...) 
{ va_list args; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 6bebe7f0a20c..015c91dbc096 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -12,8 +12,9 @@ int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2))); void trace_event(union perf_event *event); struct ui_progress; +struct perf_error_ops; -#ifdef NO_NEWT_SUPPORT +#if defined(NO_NEWT_SUPPORT) && defined(NO_GTK2_SUPPORT) static inline int ui_helpline__show_help(const char *format __used, va_list ap __used) { return 0; @@ -23,12 +24,28 @@ static inline void ui_progress__update(u64 curr __used, u64 total __used, const char *title __used) {} #define ui__error(format, arg...) ui__warning(format, ##arg) -#else + +static inline int +perf_error__register(struct perf_error_ops *eops __used) +{ + return 0; +} + +static inline int +perf_error__unregister(struct perf_error_ops *eops __used) +{ + return 0; +} + +#else /* NO_NEWT_SUPPORT && NO_GTK2_SUPPORT */ + extern char ui_helpline__last_msg[]; int ui_helpline__show_help(const char *format, va_list ap); #include "../ui/progress.h" int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2))); -#endif +#include "../ui/util.h" + +#endif /* NO_NEWT_SUPPORT && NO_GTK2_SUPPORT */ int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2))); int ui__error_paranoid(void); From 42ab68a35ffee04700648ec42c9507145a66837d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 29 May 2012 13:22:59 +0900 Subject: [PATCH 116/612] perf ui/gtk: Introduce struct perf_gtk_context The struct perf_gtk_context is for tracking the current state of the GTK window and other UI state. This is preparation for the changes that follow. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338265382-6872-5-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/browser.c | 8 +++++++- tools/perf/ui/gtk/gtk.h | 17 +++++++++++++++++ tools/perf/ui/gtk/util.c | 23 +++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index 0656c381a89c..33ab1aee3472 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -11,8 +11,8 @@ static void perf_gtk__signal(int sig) { + perf_gtk__exit(false); psignal(sig, "perf"); - gtk_main_quit(); } static void perf_gtk__resize_window(GtkWidget *window) @@ -143,6 +143,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, g_signal_connect(window, "delete_event", gtk_main_quit, NULL); + pgctx = perf_gtk__activate_context(window); + if (!pgctx) + return -1; + notebook = gtk_notebook_new(); list_for_each_entry(pos, &evlist->entries, node) { @@ -174,5 +178,7 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, gtk_main(); + perf_gtk__deactivate_context(&pgctx); + return 0; } diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 75177ee04032..34fbca6d48a2 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -1,8 +1,25 @@ #ifndef _PERF_GTK_H_ #define _PERF_GTK_H_ 1 +#include <stdbool.h> + #pragma GCC diagnostic ignored "-Wstrict-prototypes" #include <gtk/gtk.h> #pragma GCC diagnostic error "-Wstrict-prototypes" + +struct perf_gtk_context { + GtkWidget *main_window; +}; + +extern struct perf_gtk_context *pgctx; + +static inline bool perf_gtk__is_active_context(struct perf_gtk_context *ctx) +{ + return ctx && ctx->main_window; +} + +struct perf_gtk_context
*perf_gtk__activate_context(GtkWidget *window); +int perf_gtk__deactivate_context(struct perf_gtk_context **ctx); + #endif /* _PERF_GTK_H_ */ diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c index a727fe394e91..6fe13fdc513e 100644 --- a/tools/perf/ui/gtk/util.c +++ b/tools/perf/ui/gtk/util.c @@ -3,6 +3,29 @@ #include "gtk.h" +struct perf_gtk_context *pgctx; + +struct perf_gtk_context *perf_gtk__activate_context(GtkWidget *window) +{ + struct perf_gtk_context *ctx; + + ctx = malloc(sizeof(*pgctx)); + if (ctx) + ctx->main_window = window; + + return ctx; +} + +int perf_gtk__deactivate_context(struct perf_gtk_context **ctx) +{ + if (!perf_gtk__is_active_context(*ctx)) + return -1; + + free(*ctx); + *ctx = NULL; + return 0; +} + /* * FIXME: Functions below should be implemented properly. * For now, just add stubs for NO_NEWT=1 build. From b4418c6848dcd56cc90625dcfaef7cb019492233 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 29 May 2012 13:23:00 +0900 Subject: [PATCH 117/612] perf ui/gtk: Add GTK statusbar widget to browser window Add statusbar widget to display non-critical messages at the bottom of the window. This can be used for showing a status change, warning or help message. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338265382-6872-6-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/browser.c | 26 +++++++++++++++++++++++++- tools/perf/ui/gtk/gtk.h | 2 ++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index 33ab1aee3472..ece360db8658 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -122,13 +122,30 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) gtk_container_add(GTK_CONTAINER(window), view); } +static GtkWidget *perf_gtk__setup_statusbar(void) +{ + GtkWidget *stbar; + unsigned ctxid; + + stbar = gtk_statusbar_new(); + + ctxid = gtk_statusbar_get_context_id(GTK_STATUSBAR(stbar), + "perf report"); + pgctx->statbar = stbar; + pgctx->statbar_ctx_id = ctxid; + + return stbar; +} + int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, const char *help __used, void (*timer) (void *arg)__used, void *arg __used, int delay_secs __used) { struct perf_evsel *pos; + GtkWidget *vbox; GtkWidget *notebook; + GtkWidget *statbar; GtkWidget *window; signal(SIGSEGV, perf_gtk__signal); @@ -147,6 +164,8 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, if (!pgctx) return -1; + vbox = gtk_vbox_new(FALSE, 0); + notebook = gtk_notebook_new(); list_for_each_entry(pos, &evlist->entries, node) { @@ -168,7 +187,12 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label); } - gtk_container_add(GTK_CONTAINER(window), notebook); + gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); + + statbar = perf_gtk__setup_statusbar(); + gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); + + gtk_container_add(GTK_CONTAINER(window), vbox); gtk_widget_show_all(window); diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 34fbca6d48a2..206167868c5f 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -10,6 +10,8 @@ struct perf_gtk_context { GtkWidget *main_window; + GtkWidget *statbar; + guint statbar_ctx_id; }; extern struct perf_gtk_context *pgctx; From 
a6b702c117f839023814c1e03453c701d26de522 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 29 May 2012 13:23:01 +0900 Subject: [PATCH 118/612] perf ui/gtk: Add GTK info_bar widget to browser window The GtkInfoBar is a modern UI component for displaying messages without disturbing the main window. It'll be used for showing warning messages. As GtkInfoBar requires GTK+ version 2.18 or newer, add an availability check to the Makefile too. Suggested-by: Sunjin Yang Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338265382-6872-7-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 3 +++ tools/perf/config/feature-tests.mak | 13 ++++++++++++ tools/perf/ui/gtk/browser.c | 33 +++++++++++++++++++++++++++++ tools/perf/ui/gtk/gtk.h | 12 +++++++++++ 4 files changed, 61 insertions(+) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index c0ee917ae8ab..d698c118a602 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -523,6 +523,9 @@ else msg := $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev); BASIC_CFLAGS += -DNO_GTK2_SUPPORT else + ifeq ($(call try-cc,$(SOURCE_GTK2_INFOBAR),$(FLAGS_GTK2)),y) + BASIC_CFLAGS += -DHAVE_GTK_INFO_BAR + endif BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0) EXTLIBS += $(shell pkg-config --libs gtk+-2.0) LIB_OBJS += $(OUTPUT)ui/gtk/browser.o diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak index d9084e03ce56..6c18785a6417 100644 --- a/tools/perf/config/feature-tests.mak +++ b/tools/perf/config/feature-tests.mak @@ -78,6 +78,19 @@ int main(int argc, char *argv[]) return 0; } endef + +define SOURCE_GTK2_INFOBAR +#pragma GCC diagnostic ignored \"-Wstrict-prototypes\" +#include <gtk/gtk.h> +#pragma GCC diagnostic error \"-Wstrict-prototypes\" + +int main(void) +{ + gtk_info_bar_new(); + + return 0; +} +endef endif ifndef NO_LIBPERL diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index ece360db8658..fd41e8d10337 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -122,6 +122,34 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) gtk_container_add(GTK_CONTAINER(window), view); } +#ifdef HAVE_GTK_INFO_BAR +static GtkWidget *perf_gtk__setup_info_bar(void) +{ + GtkWidget *info_bar; + GtkWidget *label; + GtkWidget *content_area; + + info_bar = gtk_info_bar_new(); + gtk_widget_set_no_show_all(info_bar, TRUE); + + label = gtk_label_new(""); + gtk_widget_show(label); + + content_area = gtk_info_bar_get_content_area(GTK_INFO_BAR(info_bar)); + gtk_container_add(GTK_CONTAINER(content_area), label); + + gtk_info_bar_add_button(GTK_INFO_BAR(info_bar), GTK_STOCK_OK, + GTK_RESPONSE_OK); + g_signal_connect(info_bar, "response", + G_CALLBACK(gtk_widget_hide), NULL); + + pgctx->info_bar = info_bar; + pgctx->message_label = label; + + return info_bar; +} +#endif + static GtkWidget *perf_gtk__setup_statusbar(void) { GtkWidget *stbar; @@ -145,6 +173,7 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, struct perf_evsel *pos; GtkWidget *vbox; GtkWidget *notebook; + GtkWidget *info_bar; GtkWidget *statbar; GtkWidget *window; @@ -189,6 +218,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); + info_bar = perf_gtk__setup_info_bar(); + if (info_bar) + gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE,
0); + statbar = perf_gtk__setup_statusbar(); + gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); + + gtk_container_add(GTK_CONTAINER(window), vbox); gtk_widget_show_all(window); diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 206167868c5f..a4d0f2b4a2dc 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -10,6 +10,11 @@ struct perf_gtk_context { GtkWidget *main_window; + +#ifdef HAVE_GTK_INFO_BAR + GtkWidget *info_bar; + GtkWidget *message_label; +#endif GtkWidget *statbar; guint statbar_ctx_id; }; @@ -24,4 +29,11 @@ static inline bool perf_gtk__is_active_context(struct perf_gtk_context *ctx) struct perf_gtk_context *perf_gtk__activate_context(GtkWidget *window); int perf_gtk__deactivate_context(struct perf_gtk_context **ctx); +#ifndef HAVE_GTK_INFO_BAR +static inline GtkWidget *perf_gtk__setup_info_bar(void) +{ + return NULL; +} +#endif + #endif /* _PERF_GTK_H_ */ From e078ba14dff5c315ce19a978574883f417d2cfa0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 29 May 2012 13:23:02 +0900 Subject: [PATCH 119/612] perf ui/gtk: Use struct perf_error_ops Define and use perf_gtk_eops to provide a GTK2 message dialog for error reporting and an info_bar for warnings. As GtkInfoBar requires a recent GTK+ library, provide a fallback implementation using the statusbar widget too. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338265382-6872-8-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/setup.c | 5 +++ tools/perf/ui/gtk/util.c | 86 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/tools/perf/ui/gtk/setup.c b/tools/perf/ui/gtk/setup.c index 829529957766..92879ce61e2f 100644 --- a/tools/perf/ui/gtk/setup.c +++ b/tools/perf/ui/gtk/setup.c @@ -1,12 +1,17 @@ #include "gtk.h" #include "../../util/cache.h" +#include "../../util/debug.h" + +extern struct perf_error_ops perf_gtk_eops; int perf_gtk__init(void) { + perf_error__register(&perf_gtk_eops); return gtk_init_check(NULL, NULL) ?
0 : -1; } void perf_gtk__exit(bool wait_for_ok __used) { + perf_error__unregister(&perf_gtk_eops); gtk_main_quit(); } diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c index 6fe13fdc513e..0ead373c0dfb 100644 --- a/tools/perf/ui/gtk/util.c +++ b/tools/perf/ui/gtk/util.c @@ -2,6 +2,8 @@ #include "../../util/debug.h" #include "gtk.h" +#include + struct perf_gtk_context *pgctx; @@ -26,6 +28,90 @@ int perf_gtk__deactivate_context(struct perf_gtk_context **ctx) return 0; } +static int perf_gtk__error(const char *format, va_list args) +{ + char *msg; + GtkWidget *dialog; + + if (!perf_gtk__is_active_context(pgctx) || + vasprintf(&msg, format, args) < 0) { + fprintf(stderr, "Error:\n"); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + return -1; + } + + dialog = gtk_message_dialog_new_with_markup(GTK_WINDOW(pgctx->main_window), + GTK_DIALOG_DESTROY_WITH_PARENT, + GTK_MESSAGE_ERROR, + GTK_BUTTONS_CLOSE, + "Error\n\n%s", msg); + gtk_dialog_run(GTK_DIALOG(dialog)); + + gtk_widget_destroy(dialog); + free(msg); + return 0; +} + +#ifdef HAVE_GTK_INFO_BAR +static int perf_gtk__warning_info_bar(const char *format, va_list args) +{ + char *msg; + + if (!perf_gtk__is_active_context(pgctx) || + vasprintf(&msg, format, args) < 0) { + fprintf(stderr, "Warning:\n"); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + return -1; + } + + gtk_label_set_text(GTK_LABEL(pgctx->message_label), msg); + gtk_info_bar_set_message_type(GTK_INFO_BAR(pgctx->info_bar), + GTK_MESSAGE_WARNING); + gtk_widget_show(pgctx->info_bar); + + free(msg); + return 0; +} +#else +static int perf_gtk__warning_statusbar(const char *format, va_list args) +{ + char *msg, *p; + + if (!perf_gtk__is_active_context(pgctx) || + vasprintf(&msg, format, args) < 0) { + fprintf(stderr, "Warning:\n"); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + return -1; + } + + gtk_statusbar_pop(GTK_STATUSBAR(pgctx->statbar), + pgctx->statbar_ctx_id); + + /* Only first line can be displayed */ + p = strchr(msg, '\n'); + if (p) + *p = '\0'; + + gtk_statusbar_push(GTK_STATUSBAR(pgctx->statbar), + pgctx->statbar_ctx_id, msg); + + free(msg); + return 0; +} +#endif + +struct perf_error_ops perf_gtk_eops = { + .error = perf_gtk__error, +#ifdef HAVE_GTK_INFO_BAR + .warning = perf_gtk__warning_info_bar, +#else + .warning = perf_gtk__warning_statusbar, +#endif +}; + /* * FIXME: Functions below should be implemented properly. * For now, just add stubs for NO_NEWT=1 build. From cb1a28a0cbf9dac5a7a0ca02ebebc12db260d2f8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2012 18:23:31 -0300 Subject: [PATCH 120/612] perf lib: Introduce rtrim Remove the trailing whitespaces. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-8bxozh5lyixgjmziqaxo9675@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/string.c | 22 ++++++++++++++++++++++ tools/perf/util/util.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index d5836382ff2c..199bc4d8905d 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -313,3 +313,25 @@ int strtailcmp(const char *s1, const char *s2) return 0; } +/** + * rtrim - Removes trailing whitespace from @s. + * @s: The string to be stripped. + * + * Note that the first trailing whitespace is replaced with a %NUL-terminator + * in the given string @s. Returns @s. 
+ */ +char *rtrim(char *s) +{ + size_t size = strlen(s); + char *end; + + if (!size) + return s; + + end = s + size - 1; + while (end >= s && isspace(*end)) + end--; + *(end + 1) = '\0'; + + return s; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 2daaedb83d84..b13c7331eaf8 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -264,4 +264,6 @@ bool is_power_of_2(unsigned long n) size_t hex_width(u64 v); +char *rtrim(char *s); + #endif From aff3f3f68ae6002a30543726b2ae48cafce109e6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2012 19:31:28 -0300 Subject: [PATCH 121/612] perf hists browser: Implement printing snapshots to files To avoid having to resort to --stdio, which expands everything, allow the user instead to go on expanding the relevant callchains and then press 'P' to print that view. As the hists browser is used for both static (report) and dynamic (top) views, it prints to a 'perf.hist.N' sequence, i.e. multiple snapshots can be taken in report and top. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-wr9xx4ba0utrynu5j6wotd79@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 195 +++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 53f6697d014e..f556e5f6388b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -23,6 +23,7 @@ struct hist_browser { struct hists *hists; struct hist_entry *he_selection; struct map_symbol *selection; + int print_seq; bool has_symbols; }; @@ -800,6 +801,196 @@ do_offset: } } +static int hist_browser__fprintf_callchain_node_rb_tree(struct hist_browser *browser, + struct callchain_node *chain_node, + u64 total, int level, + FILE *fp) +{ + struct rb_node *node; + int offset = level * LEVEL_OFFSET_STEP; + u64 new_total, remaining; + int printed = 0; + + if (callchain_param.mode == CHAIN_GRAPH_REL) + new_total = chain_node->children_hit; + else + new_total = total; + + remaining = new_total; + node = rb_first(&chain_node->rb_root); + while (node) { + struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); + struct rb_node *next = rb_next(node); + u64 cumul = callchain_cumul_hits(child); + struct callchain_list *chain; + char folded_sign = ' '; + int first = true; + int extra_offset = 0; + + remaining -= cumul; + + list_for_each_entry(chain, &child->val, list) { + char ipstr[BITS_PER_LONG / 4 + 1], *alloc_str; + const char *str; + bool was_first = first; + + if (first) + first = false; + else + extra_offset = LEVEL_OFFSET_STEP; + + folded_sign = callchain_list__folded(chain); + + alloc_str = NULL; + str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr)); + if (was_first) { + double percent = cumul * 100.0 / new_total; + + if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0) + str = "Not enough memory!"; + else + str = alloc_str; + } + + printed += fprintf(fp, "%*s%c %s\n", offset + extra_offset, " ", folded_sign, str); + free(alloc_str); + if (folded_sign == '+') + break; + } + + if (folded_sign == '-') { + const int new_level = level + (extra_offset ?
2 : 1); + printed += hist_browser__fprintf_callchain_node_rb_tree(browser, child, new_total, + new_level, fp); + } + + node = next; + } + + return printed; +} + +static int hist_browser__fprintf_callchain_node(struct hist_browser *browser, + struct callchain_node *node, + int level, FILE *fp) +{ + struct callchain_list *chain; + int offset = level * LEVEL_OFFSET_STEP; + char folded_sign = ' '; + int printed = 0; + + list_for_each_entry(chain, &node->val, list) { + char ipstr[BITS_PER_LONG / 4 + 1], *s; + + folded_sign = callchain_list__folded(chain); + s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr)); + printed += fprintf(fp, "%*s%c %s\n", offset, " ", folded_sign, s); + } + + if (folded_sign == '-') + printed += hist_browser__fprintf_callchain_node_rb_tree(browser, node, + browser->hists->stats.total_period, + level + 1, fp); + return printed; +} + +static int hist_browser__fprintf_callchain(struct hist_browser *browser, + struct rb_root *chain, int level, FILE *fp) +{ + struct rb_node *nd; + int printed = 0; + + for (nd = rb_first(chain); nd; nd = rb_next(nd)) { + struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node); + + printed += hist_browser__fprintf_callchain_node(browser, node, level, fp); + } + + return printed; +} + +static int hist_browser__fprintf_entry(struct hist_browser *browser, + struct hist_entry *he, FILE *fp) +{ + char s[8192]; + double percent; + int printed = 0; + char folded_sign = ' '; + + if (symbol_conf.use_callchain) + folded_sign = hist_entry__folded(he); + + hist_entry__snprintf(he, s, sizeof(s), browser->hists); + percent = (he->period * 100.0) / browser->hists->stats.total_period; + + if (symbol_conf.use_callchain) + printed += fprintf(fp, "%c ", folded_sign); + + printed += fprintf(fp, " %5.2f%%", percent); + + if (symbol_conf.show_nr_samples) + printed += fprintf(fp, " %11u", he->nr_events); + + if (symbol_conf.show_total_period) + printed += fprintf(fp, " %12" PRIu64, he->period); + + printed += fprintf(fp, "%s\n", rtrim(s)); + + if (folded_sign == '-') + printed += hist_browser__fprintf_callchain(browser, &he->sorted_chain, 1, fp); + + return printed; +} + +static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp) +{ + struct rb_node *nd = hists__filter_entries(rb_first(browser->b.entries)); + int printed = 0; + + while (nd) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + + printed += hist_browser__fprintf_entry(browser, h, fp); + nd = hists__filter_entries(rb_next(nd)); + } + + return printed; +} + +static int hist_browser__dump(struct hist_browser *browser) +{ + char filename[64]; + FILE *fp; + + while (1) { + scnprintf(filename, sizeof(filename), "perf.hist.%d", browser->print_seq); + if (access(filename, F_OK)) + break; + /* + * XXX: Just an arbitrary lazy upper limit + */ + if (++browser->print_seq == 8192) { + ui_helpline__fpush("Too many perf.hist.N files, nothing written!"); + return -1; + } + } + + fp = fopen(filename, "w"); + if (fp == NULL) { + char bf[64]; + strerror_r(errno, bf, sizeof(bf)); + ui_helpline__fpush("Couldn't write to %s: %s", filename, bf); + return -1; + } + + ++browser->print_seq; + hist_browser__fprintf(browser, fp); + fclose(fp); + ui_helpline__fpush("%s written!", filename); + + return 0; +} + static struct hist_browser *hist_browser__new(struct hists *hists) { struct hist_browser *browser = zalloc(sizeof(*browser)); @@ -937,6 +1128,9 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, 
browser->selection->map->dso->annotate_warned) continue; goto do_annotate; + case 'P': + hist_browser__dump(browser); + continue; case 'd': goto zoom_dso; case 't': @@ -969,6 +1163,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, "E Expand all callchains\n" "d Zoom into current DSO\n" "t Zoom into current Thread\n" + "P Print histograms to perf.hist.N\n" "/ Filter symbol by name"); continue; case K_ENTER: From 27f18617b01dbbc928e9bd3731d1766222fb7e0d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Jun 2012 13:33:09 -0300 Subject: [PATCH 122/612] perf evsel: Carve out event modifier formatting From perf_evsel__hw_name, so that we can use it for the other kinds of events (tracepoints, software, hw cache, etc). Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-9gmd5wewsrvtny8tzxjfp471@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9f6cebd798ee..dab893804a14 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -86,16 +86,15 @@ const char *__perf_evsel__hw_name(u64 config) return "unknown-hardware"; } -static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) +static int perf_evsel__add_modifiers(struct perf_evsel *evsel, char *bf, size_t size) { - int colon = 0; + int colon = 0, r = 0; struct perf_event_attr *attr = &evsel->attr; - int r = scnprintf(bf, size, "%s", __perf_evsel__hw_name(attr->config)); bool exclude_guest_default = false; #define MOD_PRINT(context, mod) do { \ if (!attr->exclude_##context) { \ - if (!colon) colon = r++; \ + if (!colon) colon = ++r; \ r += scnprintf(bf + r, size - r, "%c", mod); \ } } while(0) @@ -108,7 +107,7 @@ static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) if (attr->precise_ip) { if (!colon) - colon = r++; + colon = ++r; r += scnprintf(bf + r, size - r, "%.*s", attr->precise_ip, "ppp"); exclude_guest_default = true; } @@ -119,10 +118,16 @@ static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) } #undef MOD_PRINT if (colon) - bf[colon] = ':'; + bf[colon - 1] = ':'; return r; } +static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + int r = scnprintf(bf, size, "%s", __perf_evsel__hw_name(evsel->attr.config)); + return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); +} + int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) { int ret; From 0b668bc9a74ce1bd3b8c5fd93e8d85ed955e11fe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Jun 2012 14:08:07 -0300 Subject: [PATCH 123/612] perf tools: Reconstruct hw cache event with modifiers from perf_event_attr [root@sandy ~]# perf record -a -e dTLB-load-misses:u usleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.486 MB perf.data (~21216 samples) ] Before: [root@sandy ~]# perf evlist dTLB-load-misses [root@sandy ~]# After: [root@sandy ~]# perf evlist dTLB-load-misses:u [root@sandy ~]# Ditto for other tools. 
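The PERF_TYPE_HW_CACHE config word decoded here packs the cache type, the operation and the result into one byte each (type in bits 0-7, op in bits 8-15, result in bits 16-23). For reference, a minimal standalone sketch of composing such a config word; the hw_cache_config() helper is illustrative and not part of this patch, while the PERF_COUNT_HW_CACHE_* enums are the kernel ABI from linux/perf_event.h:

  #include <stdio.h>
  #include <linux/perf_event.h>

  /* bits 0-7: cache id, bits 8-15: op, bits 16-23: result */
  static __u64 hw_cache_config(__u64 cache, __u64 op, __u64 result)
  {
          return cache | (op << 8) | (result << 16);
  }

  int main(void)
  {
          /* dTLB-load-misses, as in the perf record run above */
          __u64 config = hw_cache_config(PERF_COUNT_HW_CACHE_DTLB,
                                         PERF_COUNT_HW_CACHE_OP_READ,
                                         PERF_COUNT_HW_CACHE_RESULT_MISS);

          printf("config = %#llx\n", (unsigned long long)config); /* 0x10003 */
          return 0;
  }

The new __perf_evsel__hw_cache_name() in the diff below reverses exactly this layout to rebuild the event string.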
Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-7x1b0e6jthkr93lfjzsuakk5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 104 +++++++++++++++++++++++++++++ tools/perf/util/evsel.h | 16 ++++- tools/perf/util/parse-events.c | 115 +++++---------------------------- 3 files changed, 134 insertions(+), 101 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index dab893804a14..47f1fe2feab8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -128,6 +128,105 @@ static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } +const char *perf_evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX] + [PERF_EVSEL__MAX_ALIASES] = { + { "L1-dcache", "l1-d", "l1d", "L1-data", }, + { "L1-icache", "l1-i", "l1i", "L1-instruction", }, + { "LLC", "L2", }, + { "dTLB", "d-tlb", "Data-TLB", }, + { "iTLB", "i-tlb", "Instruction-TLB", }, + { "branch", "branches", "bpu", "btb", "bpc", }, + { "node", }, +}; + +const char *perf_evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_EVSEL__MAX_ALIASES] = { + { "load", "loads", "read", }, + { "store", "stores", "write", }, + { "prefetch", "prefetches", "speculative-read", "speculative-load", }, +}; + +const char *perf_evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] + [PERF_EVSEL__MAX_ALIASES] = { + { "refs", "Reference", "ops", "access", }, + { "misses", "miss", }, +}; + +#define C(x) PERF_COUNT_HW_CACHE_##x +#define CACHE_READ (1 << C(OP_READ)) +#define CACHE_WRITE (1 << C(OP_WRITE)) +#define CACHE_PREFETCH (1 << C(OP_PREFETCH)) +#define COP(x) (1 << x) + +/* + * cache operation stat + * L1I : Read and prefetch only + * ITLB and BPU : Read-only + */ +static unsigned long perf_evsel__hw_cache_stat[C(MAX)] = { + [C(L1D)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), + [C(L1I)] = (CACHE_READ | CACHE_PREFETCH), + [C(LL)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), + [C(DTLB)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), + [C(ITLB)] = (CACHE_READ), + [C(BPU)] = (CACHE_READ), + [C(NODE)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), +}; + +bool perf_evsel__is_cache_op_valid(u8 type, u8 op) +{ + if (perf_evsel__hw_cache_stat[type] & COP(op)) + return true; /* valid */ + else + return false; /* invalid */ +} + +int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, + char *bf, size_t size) +{ + if (result) { + return scnprintf(bf, size, "%s-%s-%s", perf_evsel__hw_cache[type][0], + perf_evsel__hw_cache_op[op][0], + perf_evsel__hw_cache_result[result][0]); + } + + return scnprintf(bf, size, "%s-%s", perf_evsel__hw_cache[type][0], + perf_evsel__hw_cache_op[op][1]); +} + +int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size) +{ + u8 op, result, type = (config >> 0) & 0xff; + const char *err = "unknown-ext-hardware-cache-type"; + + if (type > PERF_COUNT_HW_CACHE_MAX) + goto out_err; + + op = (config >> 8) & 0xff; + err = "unknown-ext-hardware-cache-op"; + if (op > PERF_COUNT_HW_CACHE_OP_MAX) + goto out_err; + + result = (config >> 16) & 0xff; + err = "unknown-ext-hardware-cache-result"; + if (result > PERF_COUNT_HW_CACHE_RESULT_MAX) + goto out_err; + + err = "invalid-cache"; + if (!perf_evsel__is_cache_op_valid(type, op)) + goto out_err; + + return __perf_evsel__hw_cache_type_op_res_name(type, op, result, bf, size); +out_err: + return scnprintf(bf, size, "%s",
err); +} + +static int perf_evsel__hw_cache_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + int ret = __perf_evsel__hw_cache_name(evsel->attr.config, bf, size); + return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); +} + int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) { int ret; @@ -140,6 +239,11 @@ int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) case PERF_TYPE_HARDWARE: ret = perf_evsel__hw_name(evsel, bf, size); break; + + case PERF_TYPE_HW_CACHE: + ret = perf_evsel__hw_cache_name(evsel, bf, size); + break; + default: /* * FIXME diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4ba8b564e6f4..5bf946a05a6b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -83,7 +83,21 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts, struct perf_evsel *first); -const char* __perf_evsel__hw_name(u64 config); +bool perf_evsel__is_cache_op_valid(u8 type, u8 op); + +#define PERF_EVSEL__MAX_ALIASES 8 + +extern const char *perf_evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX] + [PERF_EVSEL__MAX_ALIASES]; +extern const char *perf_evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_EVSEL__MAX_ALIASES]; +const char *perf_evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] + [PERF_EVSEL__MAX_ALIASES]; +int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, + char *bf, size_t size); +int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size); + +const char *__perf_evsel__hw_name(u64 config); int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 05dbc8b3c767..c8f8cf4a6920 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -74,51 +74,6 @@ static const char *sw_event_names[PERF_COUNT_SW_MAX] = { "emulation-faults", }; -#define MAX_ALIASES 8 - -static const char *hw_cache[PERF_COUNT_HW_CACHE_MAX][MAX_ALIASES] = { - { "L1-dcache", "l1-d", "l1d", "L1-data", }, - { "L1-icache", "l1-i", "l1i", "L1-instruction", }, - { "LLC", "L2", }, - { "dTLB", "d-tlb", "Data-TLB", }, - { "iTLB", "i-tlb", "Instruction-TLB", }, - { "branch", "branches", "bpu", "btb", "bpc", }, - { "node", }, -}; - -static const char *hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][MAX_ALIASES] = { - { "load", "loads", "read", }, - { "store", "stores", "write", }, - { "prefetch", "prefetches", "speculative-read", "speculative-load", }, -}; - -static const char *hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] - [MAX_ALIASES] = { - { "refs", "Reference", "ops", "access", }, - { "misses", "miss", }, -}; - -#define C(x) PERF_COUNT_HW_CACHE_##x -#define CACHE_READ (1 << C(OP_READ)) -#define CACHE_WRITE (1 << C(OP_WRITE)) -#define CACHE_PREFETCH (1 << C(OP_PREFETCH)) -#define COP(x) (1 << x) - -/* - * cache operartion stat - * L1I : Read and prefetch only - * ITLB and BPU : Read-only - */ -static unsigned long hw_cache_stat[C(MAX)] = { - [C(L1D)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), - [C(L1I)] = (CACHE_READ | CACHE_PREFETCH), - [C(LL)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), - [C(DTLB)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), - [C(ITLB)] = (CACHE_READ), - [C(BPU)] = (CACHE_READ), - [C(NODE)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), -}; - #define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ if 
(sys_dirent.d_type == DT_DIR && \ @@ -236,30 +191,6 @@ static const char *tracepoint_id_to_name(u64 config) return buf; } -static int is_cache_op_valid(u8 cache_type, u8 cache_op) -{ - if (hw_cache_stat[cache_type] & COP(cache_op)) - return 1; /* valid */ - else - return 0; /* invalid */ -} - -static char *event_cache_name(u8 cache_type, u8 cache_op, u8 cache_result) -{ - static char name[50]; - - if (cache_result) { - sprintf(name, "%s-%s-%s", hw_cache[cache_type][0], - hw_cache_op[cache_op][0], - hw_cache_result[cache_result][0]); - } else { - sprintf(name, "%s-%s", hw_cache[cache_type][0], - hw_cache_op[cache_op][1]); - } - - return name; -} - const char *event_type(int type) { switch (type) { @@ -287,7 +218,7 @@ const char *event_name(struct perf_evsel *evsel) u64 config = evsel->attr.config; int type = evsel->attr.type; - if (type == PERF_TYPE_RAW || type == PERF_TYPE_HARDWARE) { + if (type == PERF_TYPE_RAW || type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { /* * XXX minimal fix, see comment on perf_evsen__name, this static buffer * will go away together with event_name in the next devel cycle. @@ -316,26 +247,9 @@ const char *__event_name(int type, u64 config) case PERF_TYPE_HARDWARE: return __perf_evsel__hw_name(config); - case PERF_TYPE_HW_CACHE: { - u8 cache_type, cache_op, cache_result; - - cache_type = (config >> 0) & 0xff; - if (cache_type > PERF_COUNT_HW_CACHE_MAX) - return "unknown-ext-hardware-cache-type"; - - cache_op = (config >> 8) & 0xff; - if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX) - return "unknown-ext-hardware-cache-op"; - - cache_result = (config >> 16) & 0xff; - if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX) - return "unknown-ext-hardware-cache-result"; - - if (!is_cache_op_valid(cache_type, cache_op)) - return "invalid-cache"; - - return event_cache_name(cache_type, cache_op, cache_result); - } + case PERF_TYPE_HW_CACHE: + __perf_evsel__hw_cache_name(config, buf, sizeof(buf)); + return buf; case PERF_TYPE_SOFTWARE: if (config < PERF_COUNT_SW_MAX && sw_event_names[config]) @@ -379,13 +293,13 @@ static int add_event(struct list_head **_list, int *idx, return 0; } -static int parse_aliases(char *str, const char *names[][MAX_ALIASES], int size) +static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) { int i, j; int n, longest = -1; for (i = 0; i < size; i++) { - for (j = 0; j < MAX_ALIASES && names[i][j]; j++) { + for (j = 0; j < PERF_EVSEL__MAX_ALIASES && names[i][j]; j++) { n = strlen(names[i][j]); if (n > longest && !strncasecmp(str, names[i][j], n)) longest = n; @@ -410,7 +324,7 @@ int parse_events_add_cache(struct list_head **list, int *idx, * No fallback - if we cannot get a clear cache type * then bail out: */ - cache_type = parse_aliases(type, hw_cache, + cache_type = parse_aliases(type, perf_evsel__hw_cache, PERF_COUNT_HW_CACHE_MAX); if (cache_type == -1) return -EINVAL; @@ -423,18 +337,18 @@ int parse_events_add_cache(struct list_head **list, int *idx, snprintf(name + n, MAX_NAME_LEN - n, "-%s\n", str); if (cache_op == -1) { - cache_op = parse_aliases(str, hw_cache_op, + cache_op = parse_aliases(str, perf_evsel__hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX); if (cache_op >= 0) { - if (!is_cache_op_valid(cache_type, cache_op)) + if (!perf_evsel__is_cache_op_valid(cache_type, cache_op)) return -EINVAL; continue; } } if (cache_result == -1) { - cache_result = parse_aliases(str, hw_cache_result, - PERF_COUNT_HW_CACHE_RESULT_MAX); + cache_result = parse_aliases(str, perf_evsel__hw_cache_result, + 
PERF_COUNT_HW_CACHE_RESULT_MAX); if (cache_result >= 0) continue; } @@ -970,16 +884,17 @@ void print_events_type(u8 type) int print_hwcache_events(const char *event_glob) { unsigned int type, op, i, printed = 0; + char name[64]; for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ - if (!is_cache_op_valid(type, op)) + if (!perf_evsel__is_cache_op_valid(type, op)) continue; for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - char *name = event_cache_name(type, op, i); - + __perf_evsel__hw_cache_type_op_res_name(type, op, i, + name, sizeof(name)); if (event_glob != NULL && !strglobmatch(name, event_glob)) continue; From 335c2f5d25319b208fb8b444e6f3099a806a33bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Jun 2012 14:36:20 -0300 Subject: [PATCH 124/612] perf tools: Reconstruct sw event with modifiers from perf_event_attr [root@sandy ~]# perf record -e task-clock:u -a usleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.482 MB perf.data (~21073 samples) ] [root@sandy ~]# Before: [root@sandy ~]# perf evlist task-clock [root@sandy ~]# After: [root@sandy ~]# perf evlist task-clock:u [root@sandy ~]# Ditto for other tools. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-97ltkmj7v23kyhflltf6iz5n@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 29 +++++++++++++++++++++++++++++ tools/perf/util/evsel.h | 2 ++ tools/perf/util/parse-events.c | 19 +++---------------- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 47f1fe2feab8..2da047331173 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -128,6 +128,31 @@ static int perf_evsel__hw_name(struct perf_evsel *evsel, char *bf, size_t size) return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } +static const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = { + "cpu-clock", + "task-clock", + "page-faults", + "context-switches", + "CPU-migrations", + "minor-faults", + "major-faults", + "alignment-faults", + "emulation-faults", +}; + +const char *__perf_evsel__sw_name(u64 config) +{ + if (config < PERF_COUNT_SW_MAX && perf_evsel__sw_names[config]) + return perf_evsel__sw_names[config]; + return "unknown-software"; +} + +static int perf_evsel__sw_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + int r = scnprintf(bf, size, "%s", __perf_evsel__sw_name(evsel->attr.config)); + return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); +} + const char *perf_evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX] [PERF_EVSEL__MAX_ALIASES] = { { "L1-dcache", "l1-d", "l1d", "L1-data", }, @@ -244,6 +269,10 @@ int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) ret = perf_evsel__hw_cache_name(evsel, bf, size); break; + case PERF_TYPE_SOFTWARE: + ret = perf_evsel__sw_name(evsel, bf, size); + break; + default: /* * FIXME diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 5bf946a05a6b..9ae64d9622bc 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -98,6 +98,8 @@ int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size); const char *__perf_evsel__hw_name(u64 config); +const char *__perf_evsel__sw_name(u64 config); + int 
perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c8f8cf4a6920..641c4ac8a838 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -62,18 +62,6 @@ static struct event_symbol event_symbols[] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -static const char *sw_event_names[PERF_COUNT_SW_MAX] = { - "cpu-clock", - "task-clock", - "page-faults", - "context-switches", - "CPU-migrations", - "minor-faults", - "major-faults", - "alignment-faults", - "emulation-faults", -}; - #define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ if (sys_dirent.d_type == DT_DIR && \ @@ -218,7 +206,8 @@ const char *event_name(struct perf_evsel *evsel) u64 config = evsel->attr.config; int type = evsel->attr.type; - if (type == PERF_TYPE_RAW || type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + if (type == PERF_TYPE_RAW || type == PERF_TYPE_HARDWARE || + type == PERF_TYPE_SOFTWARE || type == PERF_TYPE_HW_CACHE) { /* * XXX minimal fix, see comment on perf_evsen__name, this static buffer * will go away together with event_name in the next devel cycle. @@ -252,9 +241,7 @@ const char *__event_name(int type, u64 config) return buf; case PERF_TYPE_SOFTWARE: - if (config < PERF_COUNT_SW_MAX && sw_event_names[config]) - return sw_event_names[config]; - return "unknown-software"; + return __perf_evsel__sw_name(config); case PERF_TYPE_TRACEPOINT: return tracepoint_id_to_name(config); From a446083604fe2bafe0f46b1e95b22f7b06e3a63f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Jun 2012 10:29:12 -0300 Subject: [PATCH 125/612] perf evsel: Handle all event types in perf_evsel__name Now to convert all event_name users to perf_evsel__name. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-buuz0j0gynseglxa76r01rdn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 24 +++++++++++------------- tools/perf/util/parse-events.c | 21 +++------------------ 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2da047331173..2ddc81271855 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -252,6 +252,11 @@ static int perf_evsel__hw_cache_name(struct perf_evsel *evsel, char *bf, size_t return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); } +static int perf_evsel__tracepoint_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + return scnprintf(bf, size, "%s", evsel->name ?: "unknown tracepoint"); +} + int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) { int ret; @@ -273,20 +278,13 @@ int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) ret = perf_evsel__sw_name(evsel, bf, size); break; + case PERF_TYPE_TRACEPOINT: + ret = perf_evsel__tracepoint_name(evsel, bf, size); + break; + default: - /* - * FIXME - * - * This is the minimal perf_evsel__name so that we can - * reconstruct event names taking into account event modifiers. 
- * - * The old event_name uses it now for raw anr hw events, so that - * we don't drag all the parsing stuff into the python binding. - * - * On the next devel cycle the rest of the event naming will be - * brought here. - */ - return 0; + ret = scnprintf(bf, size, "%s", "unknown attr type"); + break; } return ret; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 641c4ac8a838..d73690b11242 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -203,24 +203,9 @@ const char *event_type(int type) const char *event_name(struct perf_evsel *evsel) { - u64 config = evsel->attr.config; - int type = evsel->attr.type; - - if (type == PERF_TYPE_RAW || type == PERF_TYPE_HARDWARE || - type == PERF_TYPE_SOFTWARE || type == PERF_TYPE_HW_CACHE) { - /* - * XXX minimal fix, see comment on perf_evsen__name, this static buffer - * will go away together with event_name in the next devel cycle. - */ - static char bf[128]; - perf_evsel__name(evsel, bf, sizeof(bf)); - return bf; - } - - if (evsel->name) - return evsel->name; - - return __event_name(type, config); + static char bf[128]; + perf_evsel__name(evsel, bf, sizeof(bf)); + return bf; } const char *__event_name(int type, u64 config) From 7289f83cceb437ca56c77eb45b8b1cda15e2e476 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Jun 2012 12:34:58 -0300 Subject: [PATCH 126/612] perf tools: Move all users of event_name to perf_evsel__name So that we don't use global variables that could make us misreport event names when having a multi window top, for instance. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-mccancovi1u0wdkg8ncth509@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-evlist.c | 2 +- tools/perf/builtin-record.c | 4 ++-- tools/perf/builtin-report.c | 6 +++--- tools/perf/builtin-stat.c | 12 ++++++------ tools/perf/builtin-test.c | 2 +- tools/perf/builtin-top.c | 10 +++++----- tools/perf/ui/browsers/hists.c | 15 ++++----------- tools/perf/ui/gtk/browser.c | 2 +- tools/perf/util/evsel.c | 26 +++++++++++++------------- tools/perf/util/evsel.h | 2 +- tools/perf/util/header.c | 2 +- tools/perf/util/parse-events.c | 7 ------- tools/perf/util/parse-events.h | 1 - tools/perf/util/session.c | 2 +- tools/perf/util/top.c | 2 +- 15 files changed, 40 insertions(+), 55 deletions(-) diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index acd78dc28341..0dd5a058f766 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -60,7 +60,7 @@ static int __cmd_evlist(const char *input_name, struct perf_attr_details *detail list_for_each_entry(pos, &session->evlist->entries, node) { bool first = true; - printf("%s", event_name(pos)); + printf("%s", perf_evsel__name(pos)); if (details->verbose || details->freq) { comma_printf(&first, " sample_freq=%" PRIu64, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f95840d04e4c..f5a6452931e6 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -265,7 +265,7 @@ try_again: if (err == ENOENT) { ui__error("The %s event is not supported.\n", - event_name(pos)); + perf_evsel__name(pos)); exit(EXIT_FAILURE); } @@ -916,7 +916,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) usage_with_options(record_usage, record_options); list_for_each_entry(pos, &evsel_list->entries, node) { - 
if (perf_header__push_event(pos->attr.config, event_name(pos))) + if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos))) goto out_free_fd; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 25249f76329d..ea8ce8e1c0d8 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -230,7 +230,7 @@ static int process_read_event(struct perf_tool *tool, struct perf_report *rep = container_of(tool, struct perf_report, tool); if (rep->show_threads) { - const char *name = evsel ? event_name(evsel) : "unknown"; + const char *name = evsel ? perf_evsel__name(evsel) : "unknown"; perf_read_values_add_value(&rep->show_threads_values, event->read.pid, event->read.tid, event->read.id, @@ -239,7 +239,7 @@ static int process_read_event(struct perf_tool *tool, } dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid, - evsel ? event_name(evsel) : "FAIL", + evsel ? perf_evsel__name(evsel) : "FAIL", event->read.value); return 0; @@ -314,7 +314,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, list_for_each_entry(pos, &evlist->entries, node) { struct hists *hists = &pos->hists; - const char *evname = event_name(pos); + const char *evname = perf_evsel__name(pos); hists__fprintf_nr_sample_events(hists, evname, stdout); hists__fprintf(hists, NULL, false, true, 0, 0, stdout); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 262589991ea4..875bf2675326 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -391,7 +391,7 @@ static int read_counter_aggr(struct perf_evsel *counter) if (verbose) { fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - event_name(counter), count[0], count[1], count[2]); + perf_evsel__name(counter), count[0], count[1], count[2]); } /* @@ -496,7 +496,7 @@ static int run_perf_stat(int argc __used, const char **argv) errno == ENXIO) { if (verbose) ui__warning("%s event is not supported by the kernel.\n", - event_name(counter)); + perf_evsel__name(counter)); counter->supported = false; continue; } @@ -594,7 +594,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) csv_output ? 0 : -4, evsel_list->cpus->map[cpu], csv_sep); - fprintf(output, fmt, cpustr, msecs, csv_sep, event_name(evsel)); + fprintf(output, fmt, cpustr, msecs, csv_sep, perf_evsel__name(evsel)); if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); @@ -792,7 +792,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) else cpu = 0; - fprintf(output, fmt, cpustr, avg, csv_sep, event_name(evsel)); + fprintf(output, fmt, cpustr, avg, csv_sep, perf_evsel__name(evsel)); if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); @@ -908,7 +908,7 @@ static void print_counter_aggr(struct perf_evsel *counter) counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, csv_sep, csv_output ? 0 : -24, - event_name(counter)); + perf_evsel__name(counter)); if (counter->cgrp) fprintf(output, "%s%s", csv_sep, counter->cgrp->name); @@ -961,7 +961,7 @@ static void print_counter(struct perf_evsel *counter) counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, csv_sep, csv_output ? 
0 : -24, - event_name(counter)); + perf_evsel__name(counter)); if (counter->cgrp) fprintf(output, "%s%s", diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 5a8727c08757..5ce30305462b 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -583,7 +583,7 @@ static int test__basic_mmap(void) if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) { pr_debug("expected %d %s events, got %d\n", expected_nr_events[evsel->idx], - event_name(evsel), nr_events[evsel->idx]); + perf_evsel__name(evsel), nr_events[evsel->idx]); goto out_munmap; } } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 6bb0277b7dfe..8090a280578c 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -245,7 +245,7 @@ static void perf_top__show_details(struct perf_top *top) if (notes->src == NULL) goto out_unlock; - printf("Showing %s for %s\n", event_name(top->sym_evsel), symbol->name); + printf("Showing %s for %s\n", perf_evsel__name(top->sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", top->sym_pcnt_filter); more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel->idx, @@ -408,7 +408,7 @@ static void perf_top__print_mapped_keys(struct perf_top *top) fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries); if (top->evlist->nr_entries > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top->sym_evsel)); + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", perf_evsel__name(top->sym_evsel)); fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter); @@ -503,13 +503,13 @@ static void perf_top__handle_keypress(struct perf_top *top, int c) fprintf(stderr, "\nAvailable events:"); list_for_each_entry(top->sym_evsel, &top->evlist->entries, node) - fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, event_name(top->sym_evsel)); + fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, perf_evsel__name(top->sym_evsel)); prompt_integer(&counter, "Enter details event counter"); if (counter >= top->evlist->nr_entries) { top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node); - fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top->sym_evsel)); + fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel)); sleep(1); break; } @@ -960,7 +960,7 @@ try_again: if (err == ENOENT) { ui__error("The %s event is not supported.\n", - event_name(counter)); + perf_evsel__name(counter)); goto out_err; } else if (err == EMFILE) { ui__error("Too many events are opened.\n" diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f556e5f6388b..482f0517b61e 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1367,7 +1367,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, struct perf_evsel *evsel = list_entry(entry, struct perf_evsel, node); bool current_entry = ui_browser__is_current_entry(browser, row); unsigned long nr_events = evsel->hists.stats.nr_events[PERF_RECORD_SAMPLE]; - const char *ev_name = event_name(evsel); + const char *ev_name = perf_evsel__name(evsel); char bf[256], unit; const char *warn = " "; size_t printed; @@ -1435,7 +1435,7 @@ browse_hists: */ if (timer) timer(arg); - ev_name = event_name(pos); + ev_name = perf_evsel__name(pos); key = perf_evsel__hists_browse(pos, nr_events, help, ev_name, true, timer, arg, delay_secs); @@ -1504,17 +1504,11 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, 
ui_helpline__push("Press ESC to exit"); list_for_each_entry(pos, &evlist->entries, node) { - const char *ev_name = event_name(pos); + const char *ev_name = perf_evsel__name(pos); size_t line_len = strlen(ev_name) + 7; if (menu.b.width < line_len) menu.b.width = line_len; - /* - * Cache the evsel name, tracepoints have a _high_ cost per - * event_name() call. - */ - if (pos->name == NULL) - pos->name = strdup(ev_name); } return perf_evsel_menu__run(&menu, evlist->nr_entries, help, timer, @@ -1525,11 +1519,10 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, void(*timer)(void *arg), void *arg, int delay_secs) { - if (evlist->nr_entries == 1) { struct perf_evsel *first = list_entry(evlist->entries.next, struct perf_evsel, node); - const char *ev_name = event_name(first); + const char *ev_name = perf_evsel__name(first); return perf_evsel__hists_browse(first, evlist->nr_entries, help, ev_name, false, timer, arg, delay_secs); diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index fd41e8d10337..ec12e0b4ded6 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -199,7 +199,7 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, list_for_each_entry(pos, &evlist->entries, node) { struct hists *hists = &pos->hists; - const char *evname = event_name(pos); + const char *evname = perf_evsel__name(pos); GtkWidget *scrolled_window; GtkWidget *tab_label; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2ddc81271855..236bdf25db6d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -252,42 +252,42 @@ static int perf_evsel__hw_cache_name(struct perf_evsel *evsel, char *bf, size_t return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); } -static int perf_evsel__tracepoint_name(struct perf_evsel *evsel, char *bf, size_t size) +const char *perf_evsel__name(struct perf_evsel *evsel) { - return scnprintf(bf, size, "%s", evsel->name ?: "unknown tracepoint"); -} + char bf[128]; -int perf_evsel__name(struct perf_evsel *evsel, char *bf, size_t size) -{ - int ret; + if (evsel->name) + return evsel->name; switch (evsel->attr.type) { case PERF_TYPE_RAW: - ret = scnprintf(bf, size, "raw 0x%" PRIx64, evsel->attr.config); + scnprintf(bf, sizeof(bf), "raw 0x%" PRIx64, evsel->attr.config); break; case PERF_TYPE_HARDWARE: - ret = perf_evsel__hw_name(evsel, bf, size); + perf_evsel__hw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_HW_CACHE: - ret = perf_evsel__hw_cache_name(evsel, bf, size); + perf_evsel__hw_cache_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_SOFTWARE: - ret = perf_evsel__sw_name(evsel, bf, size); + perf_evsel__sw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_TRACEPOINT: - ret = perf_evsel__tracepoint_name(evsel, bf, size); + scnprintf(bf, sizeof(bf), "%s", "unknown tracepoint"); break; default: - ret = scnprintf(bf, size, "%s", "unknown attr type"); + scnprintf(bf, sizeof(bf), "%s", "unknown attr type"); break; } - return ret; + evsel->name = strdup(bf); + + return evsel->name ?: "unknown"; } void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts, diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 9ae64d9622bc..c936feb4e0a6 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -100,7 +100,7 @@ int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size); const char *__perf_evsel__hw_name(u64 config); const char *__perf_evsel__sw_name(u64 config); -int perf_evsel__name(struct perf_evsel *evsel, 
char *bf, size_t size); +const char *perf_evsel__name(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 2dd5edf161b7..07c8f3792954 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -641,7 +641,7 @@ static int write_event_desc(int fd, struct perf_header *h __used, /* * write event string as passed on cmdline */ - ret = do_write_string(fd, event_name(attr)); + ret = do_write_string(fd, perf_evsel__name(attr)); if (ret < 0) return ret; /* diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d73690b11242..517e6473982c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -201,13 +201,6 @@ const char *event_type(int type) return "unknown"; } -const char *event_name(struct perf_evsel *evsel) -{ - static char bf[128]; - perf_evsel__name(evsel, bf, sizeof(bf)); - return bf; -} - const char *__event_name(int type, u64 config) { static char buf[32]; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 8cac57ab4ee6..d7eb070937c4 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -26,7 +26,6 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config); extern bool have_tracepoints(struct list_head *evlist); const char *event_type(int type); -const char *event_name(struct perf_evsel *event); extern const char *__event_name(int type, u64 config); extern int parse_events_option(const struct option *opt, const char *str, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2600916efa83..582ee38ed216 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1439,7 +1439,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) ret += hists__fprintf_nr_events(&session->hists, fp); list_for_each_entry(pos, &session->evlist->entries, node) { - ret += fprintf(fp, "%s stats:\n", event_name(pos)); + ret += fprintf(fp, "%s stats:\n", perf_evsel__name(pos)); ret += hists__fprintf_nr_events(&pos->hists, fp); } diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index abe0e8e95068..7eeebcee291c 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -65,7 +65,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) top->freq ? "Hz" : ""); } - ret += SNPRINTF(bf + ret, size - ret, "%s", event_name(top->sym_evsel)); + ret += SNPRINTF(bf + ret, size - ret, "%s", perf_evsel__name(top->sym_evsel)); ret += SNPRINTF(bf + ret, size - ret, "], "); From 5bff01f66db1753abcae03a06b21e56e7e9d9fa9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Jun 2012 13:35:44 -0300 Subject: [PATCH 127/612] perf script: Replace __event_name uses with perf_evsel__name No logic change, just remove one more user of __event_name(). 
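A side note on why this series keeps replacing event_name(): the removed helper returned a pointer into a static buffer, the classic non-reentrant accessor. A stand-alone sketch of the failure mode (hypothetical names, not perf code):

#include <stdio.h>

/* Toy version of a static-buffer accessor like the old event_name() */
static const char *name_static(int id)
{
	static char bf[128];

	snprintf(bf, sizeof(bf), "event-%d", id);
	return bf;
}

int main(void)
{
	const char *a = name_static(1);
	const char *b = name_static(2);

	/* Both pointers alias the same buffer: prints "event-2 event-2" */
	printf("%s %s\n", a, b);
	return 0;
}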
Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-e4f0vuy3283hmzfjjvkgm7fo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8e395a538eb9..8f9f9b6140db 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -137,10 +137,11 @@ static const char *output_field2str(enum perf_output_field field) #define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x) -static int perf_event_attr__check_stype(struct perf_event_attr *attr, - u64 sample_type, const char *sample_msg, - enum perf_output_field field) +static int perf_evsel__check_stype(struct perf_evsel *evsel, + u64 sample_type, const char *sample_msg, + enum perf_output_field field) { + struct perf_event_attr *attr = &evsel->attr; int type = attr->type; const char *evname; @@ -148,7 +149,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr, return 0; if (output[type].user_set) { - evname = __event_name(attr->type, attr->config); + evname = perf_evsel__name(evsel); pr_err("Samples for '%s' event do not have %s attribute set. " "Cannot print '%s' field.\n", evname, sample_msg, output_field2str(field)); @@ -157,7 +158,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr, /* user did not ask for it explicitly so remove from the default list */ output[type].fields &= ~field; - evname = __event_name(attr->type, attr->config); + evname = perf_evsel__name(evsel); pr_debug("Samples for '%s' event do not have %s attribute set. " "Skipping '%s' field.\n", evname, sample_msg, output_field2str(field)); @@ -175,8 +176,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, return -EINVAL; if (PRINT_FIELD(IP)) { - if (perf_event_attr__check_stype(attr, PERF_SAMPLE_IP, "IP", - PERF_OUTPUT_IP)) + if (perf_evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP", + PERF_OUTPUT_IP)) return -EINVAL; if (!no_callchain && @@ -185,8 +186,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, } if (PRINT_FIELD(ADDR) && - perf_event_attr__check_stype(attr, PERF_SAMPLE_ADDR, "ADDR", - PERF_OUTPUT_ADDR)) + perf_evsel__check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR", + PERF_OUTPUT_ADDR)) return -EINVAL; if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { @@ -208,18 +209,18 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && - perf_event_attr__check_stype(attr, PERF_SAMPLE_TID, "TID", - PERF_OUTPUT_TID|PERF_OUTPUT_PID)) + perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", + PERF_OUTPUT_TID|PERF_OUTPUT_PID)) return -EINVAL; if (PRINT_FIELD(TIME) && - perf_event_attr__check_stype(attr, PERF_SAMPLE_TIME, "TIME", - PERF_OUTPUT_TIME)) + perf_evsel__check_stype(evsel, PERF_SAMPLE_TIME, "TIME", + PERF_OUTPUT_TIME)) return -EINVAL; if (PRINT_FIELD(CPU) && - perf_event_attr__check_stype(attr, PERF_SAMPLE_CPU, "CPU", - PERF_OUTPUT_CPU)) + perf_evsel__check_stype(evsel, PERF_SAMPLE_CPU, "CPU", + PERF_OUTPUT_CPU)) return -EINVAL; return 0; @@ -258,9 +259,10 @@ static int perf_session__check_output_opt(struct perf_session *session) static void print_sample_start(struct perf_sample *sample, struct thread *thread, - struct perf_event_attr *attr) + struct perf_evsel *evsel) { int type; + struct perf_event_attr *attr = &evsel->attr; struct event_format *event; const char *evname = NULL; unsigned long secs; @@ -305,7 +307,7 @@ static void print_sample_start(struct perf_sample *sample, if (event) evname = event->name; } else - evname = __event_name(attr->type, attr->config); + evname = perf_evsel__name(evsel); printf("%s: ", evname ? evname : "[unknown]"); } @@ -412,7 +414,7 @@ static void process_event(union perf_event *event __unused, if (output[attr->type].fields == 0) return; - print_sample_start(sample, thread, attr); + print_sample_start(sample, thread, evsel); if (is_bts_event(attr)) { print_sample_bts(event, sample, evsel, machine, thread); From 22c8b84320ecf9ecbb6587d46a259b828e9ece5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Jun 2012 13:55:13 -0300 Subject: [PATCH 128/612] perf tools: Don't access evsel->name directly One needs to use perf_evsel__name() so that, if needed, the name gets synthesized and stored in evsel->name, from where perf_evsel__name() will serve it from then on.
Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-ml7zbenjmri9bghmrea0jm0d@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- tools/perf/util/parse-events-test.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b125e07eb399..9fe77b185338 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1601,7 +1601,7 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool, if (thread == NULL) { pr_debug("problem processing %s event, skipping it.\n", - evsel->name); + perf_evsel__name(evsel)); return -1; } diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index 76b98e2a587d..d0cf7c1ed068 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -418,14 +418,14 @@ static int test__checkevent_pmu_name(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel->name, "krava")); + TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "krava")); /* cpu/config=2/" */ evsel = list_entry(evsel->node.next, struct perf_evsel, node); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); TEST_ASSERT_VAL("wrong config", 2 == evsel->attr.config); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel->name, "raw 0x2")); + TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "raw 0x2")); return 0; } From 9db1763c72b1548626ae9d634015de4f07ef624f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Jun 2012 13:45:00 -0300 Subject: [PATCH 129/612] perf tools: Remove __event_name Not needed anymore, the parsing code can just leave evsel->name as NULL and the first call to perf_evsel__name() will do exactly what was being pre-cached using __event_name(). 
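The lazy-caching accessor this describes is easy to show in miniature. The sketch below uses made-up type and helper names, not the actual perf definitions:

#include <stdio.h>
#include <string.h>

struct counter {
	char *name;		/* left NULL by the parser */
	unsigned int config;
};

static const char *counter__name(struct counter *c)
{
	char bf[128];

	if (c->name)		/* cached: every later call is cheap */
		return c->name;

	/* first call: synthesize the name once and keep it */
	snprintf(bf, sizeof(bf), "raw 0x%x", c->config);
	c->name = strdup(bf);

	return c->name ? c->name : "unknown";
}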
Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-cn2eiijcinnc97buod8cs34m@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 61 ++++------------------------------ tools/perf/util/parse-events.h | 1 - 2 files changed, 6 insertions(+), 56 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 517e6473982c..eacf932a36a0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -161,24 +161,6 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) return NULL; } -#define TP_PATH_LEN (MAX_EVENT_LENGTH * 2 + 1) -static const char *tracepoint_id_to_name(u64 config) -{ - static char buf[TP_PATH_LEN]; - struct tracepoint_path *path; - - path = tracepoint_id_to_path(config); - if (path) { - snprintf(buf, TP_PATH_LEN, "%s:%s", path->system, path->name); - free(path->name); - free(path->system); - free(path); - } else - snprintf(buf, TP_PATH_LEN, "%s:%s", "unknown", "unknown"); - - return buf; -} - const char *event_type(int type) { switch (type) { @@ -201,36 +183,6 @@ const char *event_type(int type) return "unknown"; } -const char *__event_name(int type, u64 config) -{ - static char buf[32]; - - if (type == PERF_TYPE_RAW) { - sprintf(buf, "raw 0x%" PRIx64, config); - return buf; - } - - switch (type) { - case PERF_TYPE_HARDWARE: - return __perf_evsel__hw_name(config); - - case PERF_TYPE_HW_CACHE: - __perf_evsel__hw_cache_name(config, buf, sizeof(buf)); - return buf; - - case PERF_TYPE_SOFTWARE: - return __perf_evsel__sw_name(config); - - case PERF_TYPE_TRACEPOINT: - return tracepoint_id_to_name(config); - - default: - break; - } - - return "unknown"; -} - static int add_event(struct list_head **_list, int *idx, struct perf_event_attr *attr, char *name) { @@ -252,7 +204,8 @@ static int add_event(struct list_head **_list, int *idx, return -ENOMEM; } - evsel->name = strdup(name); + if (name) + evsel->name = strdup(name); list_add_tail(&evsel->node, list); *_list = list; return 0; @@ -545,8 +498,7 @@ int parse_events_add_numeric(struct list_head **list, int *idx, config_attr(&attr, head_config, 1)) return -EINVAL; - return add_event(list, idx, &attr, - (char *) __event_name(type, config)); + return add_event(list, idx, &attr, NULL); } static int parse_events__is_name_term(struct parse_events__term *term) @@ -554,8 +506,7 @@ static int parse_events__is_name_term(struct parse_events__term *term) return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME; } -static char *pmu_event_name(struct perf_event_attr *attr, - struct list_head *head_terms) +static char *pmu_event_name(struct list_head *head_terms) { struct parse_events__term *term; @@ -563,7 +514,7 @@ static char *pmu_event_name(struct perf_event_attr *attr, if (parse_events__is_name_term(term)) return term->val.str; - return (char *) __event_name(PERF_TYPE_RAW, attr->config); + return NULL; } int parse_events_add_pmu(struct list_head **list, int *idx, @@ -588,7 +539,7 @@ int parse_events_add_pmu(struct list_head **list, int *idx, return -EINVAL; return add_event(list, idx, &attr, - pmu_event_name(&attr, head_config)); + pmu_event_name(head_config)); } void parse_events_update_lists(struct list_head *list_event, diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index d7eb070937c4..1784f06e3a6a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -26,7 
+26,6 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config); extern bool have_tracepoints(struct list_head *evlist); const char *event_type(int type); -extern const char *__event_name(int type, u64 config); extern int parse_events_option(const struct option *opt, const char *str, int unset); From 6eef3d9c2bcf52b7a3c18e609f5838c007b989a4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Jun 2012 11:53:37 -0300 Subject: [PATCH 130/612] perf evsel: Reconstruct raw event with modifiers from perf_event_attr I forgot to add the modifiers to raw events too, fix it. Reported-by: Jiri Olsa Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-pi267j1aqqjti9rqh9qy4g58@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 236bdf25db6d..8f0e9dd03775 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -252,6 +252,12 @@ static int perf_evsel__hw_cache_name(struct perf_evsel *evsel, char *bf, size_t return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); } +static int perf_evsel__raw_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + int ret = scnprintf(bf, size, "raw 0x%" PRIx64, evsel->attr.config); + return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret); +} + const char *perf_evsel__name(struct perf_evsel *evsel) { char bf[128]; @@ -261,7 +267,7 @@ const char *perf_evsel__name(struct perf_evsel *evsel) switch (evsel->attr.type) { case PERF_TYPE_RAW: - scnprintf(bf, sizeof(bf), "raw 0x%" PRIx64, evsel->attr.config); + perf_evsel__raw_name(evsel, bf, sizeof(bf)); break; case PERF_TYPE_HARDWARE: From a9c34a9f9c677fcbe06bd3eda8d6caa3487b4a65 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 11 Jun 2012 15:20:03 +0200 Subject: [PATCH 131/612] perf tools: Remove unused evsel parameter from machine__resolve_callchain Removing unused evsel parameter from machine__resolve_callchain function. Plus related header file and callers changes. The evsel parameter is unused since following commit: perf callchain: Make callchain cursors TLS commit 472606458f3e1ced5fe3cc5f04e90a6b5a4732cf Author: Namhyung Kim Date: Thu May 31 14:43:26 2012 +0900 Signed-off-by: Jiri Olsa Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1339420814-7379-9-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 4 ++-- tools/perf/builtin-script.c | 4 ++-- tools/perf/builtin-top.c | 2 +- tools/perf/util/map.h | 2 +- tools/perf/util/session.c | 7 +++---- tools/perf/util/session.h | 4 ++-- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ea8ce8e1c0d8..40b0ffc3ad3b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -69,7 +69,7 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool, if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = machine__resolve_callchain(machine, evsel, al->thread, + err = machine__resolve_callchain(machine, al->thread, sample->callchain, &parent); if (err) return err; @@ -140,7 +140,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, struct hist_entry *he; if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = machine__resolve_callchain(machine, evsel, al->thread, + err = machine__resolve_callchain(machine, al->thread, sample->callchain, &parent); if (err) return err; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8f9f9b6140db..8fecd3b8130a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -389,7 +389,7 @@ static void print_sample_bts(union perf_event *event, printf(" "); else printf("\n"); - perf_event__print_ip(event, sample, machine, evsel, + perf_event__print_ip(event, sample, machine, PRINT_FIELD(SYM), PRINT_FIELD(DSO), PRINT_FIELD(SYMOFFSET)); } @@ -433,7 +433,7 @@ static void process_event(union perf_event *event __unused, printf(" "); else printf("\n"); - perf_event__print_ip(event, sample, machine, evsel, + perf_event__print_ip(event, sample, machine, PRINT_FIELD(SYM), PRINT_FIELD(DSO), PRINT_FIELD(SYMOFFSET)); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 8090a280578c..e3cab5f088f8 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -774,7 +774,7 @@ static void perf_event__process_sample(struct perf_tool *tool, if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = machine__resolve_callchain(machine, evsel, al.thread, + err = machine__resolve_callchain(machine, al.thread, sample->callchain, &parent); if (err) return; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 81371bad4ef0..c14c665d9a25 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -157,7 +157,7 @@ void machine__exit(struct machine *self); void machine__delete(struct machine *self); int machine__resolve_callchain(struct machine *machine, - struct perf_evsel *evsel, struct thread *thread, + struct thread *thread, struct ip_callchain *chain, struct symbol **parent); int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 582ee38ed216..febc0aeb3c66 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -289,7 +289,6 @@ struct branch_info *machine__resolve_bstack(struct machine *self, } int machine__resolve_callchain(struct machine *self, - struct perf_evsel *evsel __used, struct thread *thread, struct 
ip_callchain *chain, struct symbol **parent) @@ -1480,8 +1479,8 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, } void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, - struct machine *machine, struct perf_evsel *evsel, - int print_sym, int print_dso, int print_symoffset) + struct machine *machine, int print_sym, + int print_dso, int print_symoffset) { struct addr_location al; struct callchain_cursor_node *node; @@ -1495,7 +1494,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, if (symbol_conf.use_callchain && sample->callchain) { - if (machine__resolve_callchain(machine, evsel, al.thread, + if (machine__resolve_callchain(machine, al.thread, sample->callchain, NULL) != 0) { if (verbose) error("Failed to resolve callchain. Skipping\n"); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 7a5434c00565..877d78186f2c 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -150,8 +150,8 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, - struct machine *machine, struct perf_evsel *evsel, - int print_sym, int print_dso, int print_symoffset); + struct machine *machine, int print_sym, + int print_dso, int print_symoffset); int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); From dd4f52232c60bcb41205a67d2df2ac0ecdfe3683 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Jun 2012 15:52:42 -0300 Subject: [PATCH 132/612] perf evsel: Make some methods private Now that __event_name is gone, no need to export __perf_evsel__[hs]w_name(). 
Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-rpjnarbt83nu9uowrfatmy12@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 6 +++--- tools/perf/util/evsel.h | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8f0e9dd03775..876f639d69ed 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -78,7 +78,7 @@ static const char *perf_evsel__hw_names[PERF_COUNT_HW_MAX] = { "ref-cycles", }; -const char *__perf_evsel__hw_name(u64 config) +static const char *__perf_evsel__hw_name(u64 config) { if (config < PERF_COUNT_HW_MAX && perf_evsel__hw_names[config]) return perf_evsel__hw_names[config]; @@ -140,7 +140,7 @@ static const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = { "emulation-faults", }; -const char *__perf_evsel__sw_name(u64 config) +static const char *__perf_evsel__sw_name(u64 config) { if (config < PERF_COUNT_SW_MAX && perf_evsel__sw_names[config]) return perf_evsel__sw_names[config]; @@ -219,7 +219,7 @@ int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, perf_evsel__hw_cache_op[op][1]); } -int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size) +static int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size) { u8 op, result, type = (config >> 0) & 0xff; const char *err = "unknown-ext-hardware-cache-type"; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index c936feb4e0a6..67cc5033d192 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -95,11 +95,6 @@ const char *perf_evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] [PERF_EVSEL__MAX_ALIASES]; int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); -int __perf_evsel__hw_cache_name(u64 config, char *bf, size_t size); - -const char *__perf_evsel__hw_name(u64 config); -const char *__perf_evsel__sw_name(u64 config); - const char *perf_evsel__name(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); From c0a58fb2bdf033df433cad9009c7dac4c6b872b0 Mon Sep 17 00:00:00 2001 From: Samuel Liao Date: Tue, 5 Jun 2012 13:14:59 +0800 Subject: [PATCH 133/612] perf annotate: Check null of sym pointer before using it Sym may be NULL, and that will cause perf to crash. 
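The crash pattern is subtle because the dereference hides in an initializer, so it runs before any check. A minimal sketch with hypothetical names:

struct symbol {
	unsigned long start, end;
};

#define symbol__size(s) ((s)->end - (s)->start)

/* Buggy: the initializer dereferences sym before the check */
static long size_buggy(struct symbol *sym)
{
	long size = symbol__size(sym);	/* NULL dereference happens here */

	if (sym == NULL)		/* too late */
		return -1;
	return size;
}

/* Fixed: check first, compute after */
static long size_fixed(struct symbol *sym)
{
	long size;

	if (sym == NULL)
		return -1;
	size = symbol__size(sym);
	return size;
}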
Signed-off-by: Shan Wei Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4FCD95D3.90209@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 34b1c46eaf42..67a2703e666a 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -814,7 +814,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, { struct disasm_line *pos, *n; struct annotation *notes; - const size_t size = symbol__size(sym); + size_t size; struct map_symbol ms = { .map = map, .sym = sym, @@ -834,6 +834,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, if (sym == NULL) return -1; + size = symbol__size(sym); + if (map->dso->annotate_warned) return -1; From 0718467c859f5571dc48d294596f841096f6a47a Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Mon, 18 Jun 2012 15:56:33 -0400 Subject: [PATCH 134/612] x86/nmi: Clean up register_nmi_handler() usage Implement a cleaner and easier to maintain version for the section warning fixes implemented in commit eeaaa96a3a21 ("x86/nmi: Fix section mismatch warnings on 32-bit"). Signed-off-by: Li Zhong Signed-off-by: Don Zickus Cc: Jan Beulich Link: http://lkml.kernel.org/r/1340049393-17771-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 20 +++----------------- arch/x86/kernel/nmi_selftest.c | 7 ++++--- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index dc580c42851c..c0fa356e90de 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -44,28 +44,14 @@ struct nmiaction { const char *name; }; -#define register_nmi_handler(t, fn, fg, n) \ +#define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ - static struct nmiaction fn##_na = { \ + static struct nmiaction init fn##_na = { \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ }; \ - __register_nmi_handler((t), &fn##_na); \ -}) - -/* - * For special handlers that register/unregister in the - * init section only. This should be considered rare. 
- */ -#define register_nmi_handler_initonly(t, fn, fg, n) \ -({ \ - static struct nmiaction fn##_na __initdata = { \ - .handler = (fn), \ - .name = (n), \ - .flags = (fg), \ - }; \ - __register_nmi_handler((t), &fn##_na); \ + __register_nmi_handler((t), &fn##_na); \ }) int __register_nmi_handler(unsigned int, struct nmiaction *); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 149b8d9c6ad4..6d9582ec0324 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); } static void __init cleanup_nmi_testsuite(void) @@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, - NMI_FLAG_FIRST, "nmi_selftest")) { + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { nmi_fail = FAILURE; return; } From e1b6fc55da40bc17e20795901cb786e3619f9be9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:28:45 +0100 Subject: [PATCH 135/612] x86/microcode: Mark microcode_id[] as __initconst It's not being used for other than creating module aliases (i.e. no loadable section has any reference to it). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF1EFD020000780008A65D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc6917180..c383b3f8f397 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -504,7 +504,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { #ifdef MODULE /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = { #ifdef CONFIG_MICROCODE_INTEL { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, #endif From 0d26d1d873a302828e064737746c53a2689e6c0f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:30:20 +0100 Subject: [PATCH 136/612] x86/mm: Mark free_initrd_mem() as __init ... matching various other architectures. 
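The three cleanups above all revolve around init-section annotations. Roughly, __init, __initdata and __initconst are just section attributes, and whatever lands in an init section can be discarded once boot finishes; the real macros live in include/linux/init.h. A simplified user-space sketch of the mechanism (all names invented for illustration):

/* Stand-ins for __init/__initdata: plain section attributes */
#define __my_init	__attribute__((__section__(".my.init.text")))
#define __my_initdata	__attribute__((__section__(".my.init.data")))

static int boot_only_value __my_initdata = 42;

static int __my_init boot_only_setup(void)
{
	/* referenced only during "boot"; the section can be freed later */
	return boot_only_value;
}

int main(void)
{
	return boot_only_setup() == 42 ? 0 : 1;
}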
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF1F5C020000780008A661@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc4e9d84157f..e0e6990723e9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -385,7 +385,7 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, From 0fa0e2f02e8edfbdb5f86d1cab0fa6dc0517489f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:40:04 +0100 Subject: [PATCH 137/612] x86: Move call to print_modules() out of show_regs() Printing the list of loaded modules is really unrelated to what this function is about, and is particularly unnecessary in the context of the SysRQ key handling (gets printed so far over and over). It should really be the caller of the function to decide whether this piece of information is useful (and to avoid redundantly printing it). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF21A4020000780008A67F@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/dumpstack_32.c | 1 - arch/x86/kernel/dumpstack_64.c | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 87d3b5d663cd..ae42418bc50f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; + print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 3a8aced11ae9..1038a417ea53 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -86,7 +86,6 @@ void show_regs(struct pt_regs *regs) { int i; - print_modules(); __show_regs(regs, !user_mode_vm(regs)); pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index c582e9c5bd1a..b653675d5288 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -254,7 +254,6 @@ void show_regs(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - print_modules(); __show_regs(regs, 1); printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); From 2b1b712f050eaf0ac576591281446dc960c0afc5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 20 Jun 2012 21:18:14 -0700 Subject: [PATCH 138/612] x86, reboot: Drop redundant write of reboot_mode We write reboot_mode to BIOS location 0x472 in native_machine_emergency_restart() (reboot.c:542) already, there is no need to then write it again in machine_real_restart(). This means nothing gets written there for MRR_APM, but the APM call is a poweroff call and doesn't use this memory location. Link: http://lkml.kernel.org/n/tip-3i0pfh44c1e3jv5lab0cf7sc@git.kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/reboot.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 6ddb9cd0ced0..6ef559f09acd 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -179,14 +179,6 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); #endif - /* - * Write 0x1234 to absolute memory location 0x472. The BIOS reads - * this on booting to tell it to "Bypass memory test (also warm - * boot)". This seems like a fairly standard thing that gets set by - * REBOOT.COM programs, and the previous reset routine did this - * too. */ - *((unsigned short *)0x472) = reboot_mode; - /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 asm volatile("jmpl *%0" : : From ce7d16a1752accbff1ff7c813594aad0b7168563 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Jun 2012 10:05:56 -0300 Subject: [PATCH 139/612] [media] smiapp-core: fix compilation build error smiapp-core.c:2472:3: error: implicit declaration of function 'kzalloc' [-Werror=implicit-function-declaration] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/smiapp/smiapp-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/video/smiapp/smiapp-core.c b/drivers/media/video/smiapp/smiapp-core.c index e8c93c89265a..9cf5bda35fbe 100644 --- a/drivers/media/video/smiapp/smiapp-core.c +++ b/drivers/media/video/smiapp/smiapp-core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include From b6a509df59c6abd0a2177e3be29642207711145d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Jun 2012 21:30:30 -0300 Subject: [PATCH 140/612] [media] media: pms.c needs linux/slab.h drivers/media/video/pms.c uses kzalloc() and kfree() so it should include to fix build errors and a warning. drivers/media/video/pms.c:1047:2: error: implicit declaration of function 'kzalloc' drivers/media/video/pms.c:1047:6: warning: assignment makes pointer from integer without a cast drivers/media/video/pms.c:1116:2: error: implicit declaration of function 'kfree' Found in mmotm but applies to mainline. Signed-off-by: Randy Dunlap Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/pms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c index 77f9c92186f4..b4c679b3fb0f 100644 --- a/drivers/media/video/pms.c +++ b/drivers/media/video/pms.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include From 9751d7627582fc1cc64625d63bde9528c14f1544 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Jun 2012 10:25:03 -0700 Subject: [PATCH 141/612] x86-64, reboot: Be more paranoid in 64-bit reboot=bios Be a bit more paranoid in the transition back to 16-bit mode. In particular, in case the kernel is residing above the 4 GiB mark, switch to the trampoline GDT, and make the jump after turning off paging a far jump. In theory, none of this should matter, but it is exactly the kind of things that broken SMM or virtualization software could trip up on. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/tip-jopx7y6g6dbcx4tpal8q0jlr@git.kernel.org --- arch/x86/realmode/rm/reboot.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index 6bf8feac5557..f932ea61d1c8 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -22,14 +22,18 @@ ENTRY(machine_real_restart_asm) #ifdef CONFIG_X86_64 + /* Switch to trampoline GDT as it is guaranteed < 4 GiB */ + movl $__KERNEL_DS, %eax + movl %eax, %ds + lgdtl pa_tr_gdt /* Disable paging to drop us out of long mode */ movl %cr0, %eax andl $~X86_CR0_PG, %eax movl %eax, %cr0 - jmp 1f /* "A branch" may be needed here, assume near is OK */ + ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off -1: +GLOBAL(machine_real_restart_paging_off) xorl %eax, %eax xorl %edx, %edx movl $MSR_EFER, %ecx From f8bbfd7d28303967ca4e8597de9bdc9bf8b197e7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Feb 2012 17:07:06 +0100 Subject: [PATCH 142/612] oprofile, perf: Use per-cpu framework This changes oprofile_perf.c to use the per-cpu framework. Using the per-cpu framework should avoid error like the following: arch/arm/oprofile/../../../drivers/oprofile/oprofile_perf.c:28:28: error: variably modified 'perf_events' at file scope Reported-by: William Cohen Cc: Will Deacon Signed-off-by: Robert Richter --- drivers/oprofile/oprofile_perf.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c index efc4b7f308cf..f3cfa0b9adfa 100644 --- a/drivers/oprofile/oprofile_perf.c +++ b/drivers/oprofile/oprofile_perf.c @@ -1,5 +1,6 @@ /* * Copyright 2010 ARM Ltd. + * Copyright 2012 Advanced Micro Devices, Inc., Robert Richter * * Perf-events backend for OProfile. 
*/ @@ -25,7 +26,7 @@ static int oprofile_perf_enabled; static DEFINE_MUTEX(oprofile_perf_mutex); static struct op_counter_config *counter_config; -static struct perf_event **perf_events[NR_CPUS]; +static DEFINE_PER_CPU(struct perf_event **, perf_events); static int num_counters; /* @@ -38,7 +39,7 @@ static void op_overflow_handler(struct perf_event *event, u32 cpu = smp_processor_id(); for (id = 0; id < num_counters; ++id) - if (perf_events[cpu][id] == event) + if (per_cpu(perf_events, cpu)[id] == event) break; if (id != num_counters) @@ -74,7 +75,7 @@ static int op_create_counter(int cpu, int event) { struct perf_event *pevent; - if (!counter_config[event].enabled || perf_events[cpu][event]) + if (!counter_config[event].enabled || per_cpu(perf_events, cpu)[event]) return 0; pevent = perf_event_create_kernel_counter(&counter_config[event].attr, @@ -91,18 +92,18 @@ static int op_create_counter(int cpu, int event) return -EBUSY; } - perf_events[cpu][event] = pevent; + per_cpu(perf_events, cpu)[event] = pevent; return 0; } static void op_destroy_counter(int cpu, int event) { - struct perf_event *pevent = perf_events[cpu][event]; + struct perf_event *pevent = per_cpu(perf_events, cpu)[event]; if (pevent) { perf_event_release_kernel(pevent); - perf_events[cpu][event] = NULL; + per_cpu(perf_events, cpu)[event] = NULL; } } @@ -257,12 +258,12 @@ void oprofile_perf_exit(void) for_each_possible_cpu(cpu) { for (id = 0; id < num_counters; ++id) { - event = perf_events[cpu][id]; + event = per_cpu(perf_events, cpu)[id]; if (event) perf_event_release_kernel(event); } - kfree(perf_events[cpu]); + kfree(per_cpu(perf_events, cpu)); } kfree(counter_config); @@ -277,8 +278,6 @@ int __init oprofile_perf_init(struct oprofile_operations *ops) if (ret) return ret; - memset(&perf_events, 0, sizeof(perf_events)); - num_counters = perf_num_counters(); if (num_counters <= 0) { pr_info("oprofile: no performance counters\n"); @@ -298,9 +297,9 @@ int __init oprofile_perf_init(struct oprofile_operations *ops) } for_each_possible_cpu(cpu) { - perf_events[cpu] = kcalloc(num_counters, + per_cpu(perf_events, cpu) = kcalloc(num_counters, sizeof(struct perf_event *), GFP_KERNEL); - if (!perf_events[cpu]) { + if (!per_cpu(perf_events, cpu)) { pr_info("oprofile: failed to allocate %d perf events " "for cpu %d\n", num_counters, cpu); ret = -ENOMEM; From d9b0cde91c60da0ed5f92cdc3ac878142e6b5f27 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 29 May 2012 14:31:23 -0700 Subject: [PATCH 143/612] x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported On x86-64, the standard ABI requires alignment to 16 bytes. However, this is not actually necessary in the kernel (we don't do SSE except in very controlled ways); and furthermore, the standard kernel entry on x86-64 actually leaves the stack on an odd 8-byte boundary, which means that gcc will generate extra instructions to keep the stack *mis*aligned! gcc 4.8 adds an -mpreferred-stack-boundary=3 option to override this and lets us save some stack space and a handful of instructions. Note that this causes us to pass -mno-sse twice; this is redundant, but necessary since the cc-option test will fail unless -mno-sse is passed on the same command line. [ hpa: rewrote the patch description ] Signed-off-by: H.J. Lu Link: http://lkml.kernel.org/r/CAMe9rOqPfy3JcZRLaUeCjBe9BVY-P6e0uaSbMi5hvS-6WwQueg@mail.gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1f2521434554..b0c5276861ec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -49,6 +49,9 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Use -mpreferred-stack-boundary=3 if supported. + KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) From 357398e96d8c883b010379a7669df43ed0e2e32b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 18:39:27 +0200 Subject: [PATCH 144/612] perf/x86: Fix section mismatch in uncore_pci_init() Fix section mismatch in uncore_pci_init(): WARNING: vmlinux.o(.init.text+0x9246): Section mismatch in reference from the function uncore_pci_init() to the function .devexit.text:uncore_pci_remove() The function __init uncore_pci_init() references a function __devexit uncore_pci_remove(). [...] Signed-off-by: Robert Richter Cc: Cc: Link: http://lkml.kernel.org/r/20120620163927.GI5046@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 28a8413ca199..6f43f9584e3d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1313,7 +1313,7 @@ static int __devinit uncore_pci_add(struct intel_uncore_type *type, return 0; } -static void __devexit uncore_pci_remove(struct pci_dev *pdev) +static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu = box->pmu; From ac1534a55d1e87d59a21c09c570605933b551480 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 21 Jun 2012 14:52:40 +0200 Subject: [PATCH 145/612] iommu/amd: Initialize dma_ops for hotplug and sriov devices When a device is added to the system at runtime the AMD IOMMU driver initializes the necessary data structures to handle translation for it. But it forgets to change the per-device dma_ops to point to the AMD IOMMU driver. So mapping actually never happens and all DMA accesses end in an IO_PAGE_FAULT. Fix this. 
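The shape of the fix, condensed: hotplug reaches the driver through a bus notifier, and the BUS_NOTIFY_ADD_DEVICE case must install the per-device dma_ops in addition to building the translation structures. In the sketch below, setup_dma_domain() is a hypothetical stand-in for the driver's existing domain bookkeeping; the rest mirrors the hunk that follows:

static int device_change_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct device *dev = data;

	switch (action) {
	case BUS_NOTIFY_ADD_DEVICE:
		/* what the driver already did: domain bookkeeping */
		setup_dma_domain(dev);		/* hypothetical helper */

		/* the missing step: route DMA API calls to the IOMMU */
		if (!get_dev_data(dev)->passthrough)
			dev->archdata.dma_ops = &amd_iommu_dma_ops;
		else
			dev->archdata.dma_ops = &nommu_dma_ops;
		break;
	}
	return 0;
}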
Reported-by: Stefan Assmann Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a2e418cba0ff..dfe7d37c82c5 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -83,6 +83,8 @@ static struct iommu_ops amd_iommu_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; +static struct dma_map_ops amd_iommu_dma_ops; + /* * general struct to manage commands send to an IOMMU */ @@ -2267,6 +2269,13 @@ static int device_change_notifier(struct notifier_block *nb, list_add_tail(&dma_domain->list, &iommu_pd_list); spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + dev_data = get_dev_data(dev); + + if (!dev_data->passthrough) + dev->archdata.dma_ops = &amd_iommu_dma_ops; + else + dev->archdata.dma_ops = &nommu_dma_ops; + break; case BUS_NOTIFY_DEL_DEVICE: From 90e614bb4c581eb588ec26f130fcdd65aa047fa8 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 16 May 2012 11:35:04 -0300 Subject: [PATCH 146/612] [media] s5p-fimc: Fix bug in capture node open() When video pipeline initialization fails, the ST_CAPT_BUSY flag needs to be cleared before pm_runtime_put_sync is called. Otherwise the runtime suspend routine tries to suspend the device, rather than just turning it off. Also fix a potential null pointer dereference in fimc_pipeline_shutdown(). Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 2 +- drivers/media/video/s5p-fimc/fimc-mdevice.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 354574591908..7083107c2b37 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -499,10 +499,10 @@ static int fimc_capture_open(struct file *file) if (ret < 0) { dev_err(&fimc->pdev->dev, "Video pipeline initialization failed\n"); + clear_bit(ST_CAPT_BUSY, &fimc->state); pm_runtime_put_sync(&fimc->pdev->dev); fimc->vid_cap.refcnt--; v4l2_fh_release(file); - clear_bit(ST_CAPT_BUSY, &fimc->state); return ret; } ret = fimc_capture_ctrls_create(fimc); diff --git a/drivers/media/video/s5p-fimc/fimc-mdevice.c b/drivers/media/video/s5p-fimc/fimc-mdevice.c index 6753c45631b8..7450dcdafc87 100644 --- a/drivers/media/video/s5p-fimc/fimc-mdevice.c +++ b/drivers/media/video/s5p-fimc/fimc-mdevice.c @@ -193,9 +193,13 @@ int __fimc_pipeline_shutdown(struct fimc_pipeline *p) int fimc_pipeline_shutdown(struct fimc_pipeline *p) { - struct media_entity *me = &p->subdevs[IDX_SENSOR]->entity; + struct media_entity *me; int ret; + if (!p || !p->subdevs[IDX_SENSOR]) + return -EINVAL; + + me = &p->subdevs[IDX_SENSOR]->entity; mutex_lock(&me->parent->graph_mutex); ret = __fimc_pipeline_shutdown(p); mutex_unlock(&me->parent->graph_mutex); From d0da3c3565a1dd3cdfa374038d6ccae4b90fe142 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 16 May 2012 13:08:30 -0300 Subject: [PATCH 147/612] [media] s5p-fimc: Don't create multiple active links to same sink entity The driver is supposed to create an active media link from sensor N (or its corresponding s5p-mipi-csis entity) to FIMC.N by default. Instead, the s5p-mipi-csis.N entity always gets connected by a default active link to FIMC.N, regardless of whether there are parallel bus sensor entities already connected to FIMC.N. Correct this.
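Why a bitmask instead of a single index: with "i == fimc_id" only one sink can ever receive an enabled link per pass, so a CSIS entity claiming FIMC.N silently overrides a parallel bus sensor routed there. A toy, stand-alone model of the link_mask scheme (illustrative names only, not driver code):

#include <stdio.h>

#define LNK_ENABLED 1u

int main(void)
{
	unsigned int nr_sources = 2, nr_sinks = 3, s, i;

	for (s = 0; s < nr_sources; s++) {
		unsigned int link_mask = 1u << s; /* source s owns sink s */

		for (i = 0; i < nr_sinks; i++) {
			unsigned int flags =
				(link_mask & (1u << i)) ? LNK_ENABLED : 0;

			printf("source %u -> sink %u: %s\n", s, i,
			       flags ? "enabled" : "inactive");
		}
	}
	return 0;
}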
Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-mdevice.c | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-mdevice.c b/drivers/media/video/s5p-fimc/fimc-mdevice.c index 7450dcdafc87..dffe4da5eaa0 100644 --- a/drivers/media/video/s5p-fimc/fimc-mdevice.c +++ b/drivers/media/video/s5p-fimc/fimc-mdevice.c @@ -502,12 +502,12 @@ static void fimc_md_unregister_entities(struct fimc_md *fmd) * @source: the source entity to create links to all fimc entities from * @sensor: sensor subdev linked to FIMC[fimc_id] entity, may be null * @pad: the source entity pad index - * @fimc_id: index of the fimc device for which link should be enabled + * @link_mask: bitmask of the fimc devices for which link should be enabled */ static int __fimc_md_create_fimc_sink_links(struct fimc_md *fmd, struct media_entity *source, struct v4l2_subdev *sensor, - int pad, int fimc_id) + int pad, int link_mask) { struct fimc_sensor_info *s_info; struct media_entity *sink; @@ -524,7 +524,7 @@ static int __fimc_md_create_fimc_sink_links(struct fimc_md *fmd, if (!fmd->fimc[i]->variant->has_cam_if) continue; - flags = (i == fimc_id) ? MEDIA_LNK_FL_ENABLED : 0; + flags = ((1 << i) & link_mask) ? MEDIA_LNK_FL_ENABLED : 0; sink = &fmd->fimc[i]->vid_cap.subdev.entity; ret = media_entity_create_link(source, pad, sink, @@ -556,7 +556,10 @@ static int __fimc_md_create_fimc_sink_links(struct fimc_md *fmd, if (!fmd->fimc_lite[i]) continue; - flags = (i == fimc_id) ? MEDIA_LNK_FL_ENABLED : 0; + if (link_mask & (1 << (i + FIMC_MAX_DEVS))) + flags = MEDIA_LNK_FL_ENABLED; + else + flags = 0; sink = &fmd->fimc_lite[i]->subdev.entity; ret = media_entity_create_link(source, pad, sink, @@ -618,9 +621,8 @@ static int fimc_md_create_links(struct fimc_md *fmd) struct s5p_fimc_isp_info *pdata; struct fimc_sensor_info *s_info; struct media_entity *source, *sink; - int i, pad, fimc_id = 0; - int ret = 0; - u32 flags; + int i, pad, fimc_id = 0, ret = 0; + u32 flags, link_mask = 0; for (i = 0; i < fmd->num_sensors; i++) { if (fmd->sensor[i].subdev == NULL) continue; @@ -672,19 +674,20 @@ static int fimc_md_create_links(struct fimc_md *fmd) if (source == NULL) continue; + link_mask = 1 << fimc_id++; ret = __fimc_md_create_fimc_sink_links(fmd, source, sensor, - pad, fimc_id++); + pad, link_mask); } - fimc_id = 0; for (i = 0; i < ARRAY_SIZE(fmd->csis); i++) { if (fmd->csis[i].sd == NULL) continue; source = &fmd->csis[i].sd->entity; pad = CSIS_PAD_SOURCE; + link_mask = 1 << fimc_id++; ret = __fimc_md_create_fimc_sink_links(fmd, source, NULL, - pad, fimc_id++); + pad, link_mask); } /* Create immutable links between each FIMC's subdev and video node */ From d547ab66e275e2f6bf703572e943018ef7c391c5 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 16 May 2012 15:00:26 -0300 Subject: [PATCH 148/612] [media] s5p-fimc: Honour sizeimage in VIDIOC_S_FMT Allow memory buffer size to be increased by means of struct v4l2_plane_pix_format::sizeimage at the VIDIOC_S_FMT ioctl.
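The core of the change is a max() between the driver's computed minimum plane size and what userspace requested, instead of unconditionally overwriting the latter. A small self-contained sketch with made-up numbers:

#include <stdio.h>

static unsigned int plane_sizeimage(unsigned int width, unsigned int height,
				    unsigned int depth_bits,
				    unsigned int requested)
{
	unsigned int minimum = width * height * depth_bits / 8;

	/* honour a larger user-supplied size, never a smaller one */
	return requested > minimum ? requested : minimum;
}

int main(void)
{
	/* 640x480 at 8 bpp needs 307200 bytes; the user asked for 1 MiB */
	printf("%u\n", plane_sizeimage(640, 480, 8, 1 << 20)); /* 1048576 */
	/* a too-small request falls back to the computed minimum */
	printf("%u\n", plane_sizeimage(640, 480, 8, 4096));    /* 307200 */
	return 0;
}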
Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 7 ++++--- drivers/media/video/s5p-fimc/fimc-core.c | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 7083107c2b37..13df7230c8d8 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -350,7 +350,8 @@ static int queue_setup(struct vb2_queue *vq, const struct v4l2_format *pfmt, if (pixm) sizes[i] = max(size, pixm->plane_fmt[i].sizeimage); else - sizes[i] = size; + sizes[i] = max_t(u32, size, frame->payload[i]); + allocators[i] = ctx->fimc_dev->alloc_ctx; } @@ -924,10 +925,10 @@ static int fimc_capture_set_format(struct fimc_dev *fimc, struct v4l2_format *f) pix->width = mf->width; pix->height = mf->height; } + fimc_adjust_mplane_format(ff->fmt, pix->width, pix->height, pix); for (i = 0; i < ff->fmt->colplanes; i++) - ff->payload[i] = - (pix->width * pix->height * ff->fmt->depth[i]) / 8; + ff->payload[i] = pix->plane_fmt[i].sizeimage; set_frame_bounds(ff, pix->width, pix->height); /* Reset the composition rectangle if not yet configured */ diff --git a/drivers/media/video/s5p-fimc/fimc-core.c b/drivers/media/video/s5p-fimc/fimc-core.c index 92fc5a20fb76..65124a24c30f 100644 --- a/drivers/media/video/s5p-fimc/fimc-core.c +++ b/drivers/media/video/s5p-fimc/fimc-core.c @@ -741,8 +741,8 @@ void fimc_adjust_mplane_format(struct fimc_fmt *fmt, u32 width, u32 height, pix->width = width; for (i = 0; i < pix->num_planes; ++i) { - u32 bpl = pix->plane_fmt[i].bytesperline; - u32 *sizeimage = &pix->plane_fmt[i].sizeimage; + struct v4l2_plane_pix_format *plane_fmt = &pix->plane_fmt[i]; + u32 bpl = plane_fmt->bytesperline; if (fmt->colplanes > 1 && (bpl == 0 || bpl < pix->width)) bpl = pix->width; /* Planar */ @@ -754,8 +754,9 @@ void fimc_adjust_mplane_format(struct fimc_fmt *fmt, u32 width, u32 height, if (i == 0) /* Same bytesperline for each plane. */ bytesperline = bpl; - pix->plane_fmt[i].bytesperline = bytesperline; - *sizeimage = (pix->width * pix->height * fmt->depth[i]) / 8; + plane_fmt->bytesperline = bytesperline; + plane_fmt->sizeimage = max((pix->width * pix->height * + fmt->depth[i]) / 8, plane_fmt->sizeimage); } } From 0b4b1f199d367762ddb68cb3303431fa6c84a7d6 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 16 May 2012 15:03:50 -0300 Subject: [PATCH 149/612] [media] s5p-fimc: Remove superfluous checks for buffer type The checks are already done by the v4l2 framework.
Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 13df7230c8d8..0fd12dfbd3db 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -819,9 +819,6 @@ static int fimc_cap_g_fmt_mplane(struct file *file, void *fh, struct fimc_dev *fimc = video_drvdata(file); struct fimc_ctx *ctx = fimc->vid_cap.ctx; - if (f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) - return -EINVAL; - return fimc_fill_format(&ctx->d_frame, f); } @@ -834,9 +831,6 @@ static int fimc_cap_try_fmt_mplane(struct file *file, void *fh, struct v4l2_mbus_framefmt mf; struct fimc_fmt *ffmt = NULL; - if (f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) - return -EINVAL; - if (pix->pixelformat == V4L2_PIX_FMT_JPEG) { fimc_capture_try_format(ctx, &pix->width, &pix->height, NULL, &pix->pixelformat, @@ -888,8 +882,6 @@ static int fimc_capture_set_format(struct fimc_dev *fimc, struct v4l2_format *f) struct fimc_fmt *s_fmt = NULL; int ret, i; - if (f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) - return -EINVAL; if (vb2_is_busy(&fimc->vid_cap.vbq)) return -EBUSY; From e3fc82e8b9f550d28f05600b98bcc8c26beef2ef Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Thu, 17 May 2012 14:22:10 -0300 Subject: [PATCH 150/612] [media] s5p-fimc: Prevent lock-up in multiple sensor systems The camera clocks managed by the driver were improperly reference counted and remained disabled when multiple video nodes were opened simultaneously. It manifested itself with following warning: [12.920000] WARNING: at drivers/media/video/s5p-fimc/fimc-mdevice.c:787 __fimc_md_set_camclk+0x1c0/0x1dc() [13.005000] Modules linked in: [13.005000] Backtrace: [13.040000] [] (dump_backtrace+0x0/0x10c) from [] (dump_stack+0x18/0x1c) [13.070000] r7:00000009 r6:00000313 r5:c02d576c r4:00000000 [13.155000] [] (dump_stack+0x0/0x1c) from [] (warn_slowpath_common+0x54/0x6c) [13.285000] [] (warn_slowpath_common+0x0/0x6c) from [] (warn_slowpath_null+0x24/0x2c) [13.360000] r9:e1981010 r8:00000000 r7:c061d3fc r6:e1981010 r5:e1981030 [13.430000] r4:00000000 [13.430000] [] (warn_slowpath_null+0x0/0x2c) from [] (__fimc_md_set_camclk+0x1c0/0x1dc) [13.550000] [] (__fimc_md_set_camclk+0x0/0x1dc) from [] (fimc_md_set_camclk+0x28/0x2c) [13.630000] [] (fimc_md_set_camclk+0x0/0x2c) from [] (__fimc_pipeline_shutdown+0x34/0x50) [13.705000] [] (__fimc_pipeline_shutdown+0x0/0x50) from [] (fimc_pipeline_shutdown+0x40/0x58) [13.765000] r5:e2391200 r4:e2357704 [13.805000] [] (fimc_pipeline_shutdown+0x0/0x58) from [] (fimc_capture_close+0xcc/0xe4) [13.915000] r5:e1b396c0 r4:e2357410 [13.915000] [] (fimc_capture_close+0x0/0xe4) from [] (v4l2_release+0x5c/0x80) [13.970000] r7:00000010 r6:e1d2d990 r5:e1b396c0 r4:e2394800 [14.000000] [] (v4l2_release+0x0/0x80) from [] (fput+0xc0/0x22c) [14.015000] r5:c157ef30 r4:e1b396c0 [14.015000] [] (fput+0x0/0x22c) from [] (filp_close+0x60/0x80) [14.080000] [] (filp_close+0x0/0x80) from [] (sys_close+0xb8/0xf4) [14.125000] r7:00000001 r6:e1b396c0 r5:c1400340 r4:c1400300 [14.125000] [] (sys_close+0x0/0xf4) from [] (ret_fast_syscall+0x0/0x30) [14.205000] r7:00000006 r6:beee5b94 r5:00000003 r4:b6f64fac Fix this, as well as potential memory leaks due to not calling v4l2_fh_release() on some error paths. 
Also remove some error logs printed for events that aren't critical and are normal conditions for some system configurations. Also check if the device has been properly run-time enabled during video node open. Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 44 +++++++++++---------- drivers/media/video/s5p-fimc/fimc-lite.c | 14 ++++--- drivers/media/video/s5p-fimc/fimc-mdevice.c | 19 +++------ drivers/media/video/s5p-fimc/fimc-mdevice.h | 2 - 4 files changed, 37 insertions(+), 42 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 0fd12dfbd3db..71e483847a17 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -480,37 +480,39 @@ static int fimc_capture_set_default_format(struct fimc_dev *fimc); static int fimc_capture_open(struct file *file) { struct fimc_dev *fimc = video_drvdata(file); - int ret = v4l2_fh_open(file); - - if (ret) - return ret; + int ret; dbg("pid: %d, state: 0x%lx", task_pid_nr(current), fimc->state); - /* Return if the corresponding video mem2mem node is already opened. */ if (fimc_m2m_active(fimc)) return -EBUSY; set_bit(ST_CAPT_BUSY, &fimc->state); - pm_runtime_get_sync(&fimc->pdev->dev); + ret = pm_runtime_get_sync(&fimc->pdev->dev); + if (ret < 0) + return ret; - if (++fimc->vid_cap.refcnt == 1) { - ret = fimc_pipeline_initialize(&fimc->pipeline, - &fimc->vid_cap.vfd->entity, true); - if (ret < 0) { - dev_err(&fimc->pdev->dev, - "Video pipeline initialization failed\n"); - clear_bit(ST_CAPT_BUSY, &fimc->state); - pm_runtime_put_sync(&fimc->pdev->dev); - fimc->vid_cap.refcnt--; - v4l2_fh_release(file); - return ret; - } - ret = fimc_capture_ctrls_create(fimc); + ret = v4l2_fh_open(file); + if (ret) + return ret; - if (!ret && !fimc->vid_cap.user_subdev_api) - ret = fimc_capture_set_default_format(fimc); + if (++fimc->vid_cap.refcnt != 1) + return 0; + + ret = fimc_pipeline_initialize(&fimc->pipeline, + &fimc->vid_cap.vfd->entity, true); + if (ret < 0) { + clear_bit(ST_CAPT_BUSY, &fimc->state); + pm_runtime_put_sync(&fimc->pdev->dev); + fimc->vid_cap.refcnt--; + v4l2_fh_release(file); + return ret; } + ret = fimc_capture_ctrls_create(fimc); + + if (!ret && !fimc->vid_cap.user_subdev_api) + ret = fimc_capture_set_default_format(fimc); + return ret; } diff --git a/drivers/media/video/s5p-fimc/fimc-lite.c b/drivers/media/video/s5p-fimc/fimc-lite.c index 400d701aef04..bbe93e4a8731 100644 --- a/drivers/media/video/s5p-fimc/fimc-lite.c +++ b/drivers/media/video/s5p-fimc/fimc-lite.c @@ -451,21 +451,23 @@ static void fimc_lite_clear_event_counters(struct fimc_lite *fimc) static int fimc_lite_open(struct file *file) { struct fimc_lite *fimc = video_drvdata(file); - int ret = v4l2_fh_open(file); - - if (ret) - return ret; + int ret; set_bit(ST_FLITE_IN_USE, &fimc->state); - pm_runtime_get_sync(&fimc->pdev->dev); + ret = pm_runtime_get_sync(&fimc->pdev->dev); + if (ret < 0) + return ret; if (++fimc->ref_count != 1 || fimc->out_path != FIMC_IO_DMA) + return 0; + + ret = v4l2_fh_open(file); + if (ret < 0) return ret; ret = fimc_pipeline_initialize(&fimc->pipeline, &fimc->vfd->entity, true); if (ret < 0) { - v4l2_err(fimc->vfd, "Video pipeline initialization failed\n"); pm_runtime_put_sync(&fimc->pdev->dev); fimc->ref_count--; v4l2_fh_release(file); diff --git a/drivers/media/video/s5p-fimc/fimc-mdevice.c b/drivers/media/video/s5p-fimc/fimc-mdevice.c index
dffe4da5eaa0..52cef4865423 100644 --- a/drivers/media/video/s5p-fimc/fimc-mdevice.c +++ b/drivers/media/video/s5p-fimc/fimc-mdevice.c @@ -741,8 +741,8 @@ static void fimc_md_put_clocks(struct fimc_md *fmd) } static int __fimc_md_set_camclk(struct fimc_md *fmd, - struct fimc_sensor_info *s_info, - bool on) + struct fimc_sensor_info *s_info, + bool on) { struct s5p_fimc_isp_info *pdata = s_info->pdata; struct fimc_camclk_info *camclk; @@ -751,12 +751,10 @@ static int __fimc_md_set_camclk(struct fimc_md *fmd, if (WARN_ON(pdata->clk_id >= FIMC_MAX_CAMCLKS) || fmd == NULL) return -EINVAL; - if (s_info->clk_on == on) - return 0; camclk = &fmd->camclk[pdata->clk_id]; - dbg("camclk %d, f: %lu, clk: %p, on: %d", - pdata->clk_id, pdata->clk_frequency, camclk, on); + dbg("camclk %d, f: %lu, use_count: %d, on: %d", + pdata->clk_id, pdata->clk_frequency, camclk->use_count, on); if (on) { if (camclk->use_count > 0 && @@ -767,11 +765,9 @@ static int __fimc_md_set_camclk(struct fimc_md *fmd, clk_set_rate(camclk->clock, pdata->clk_frequency); camclk->frequency = pdata->clk_frequency; ret = clk_enable(camclk->clock); + dbg("Enabled camclk %d: f: %lu", pdata->clk_id, + clk_get_rate(camclk->clock)); } - s_info->clk_on = 1; - dbg("Enabled camclk %d: f: %lu", pdata->clk_id, - clk_get_rate(camclk->clock)); - return ret; } @@ -780,7 +776,6 @@ static int __fimc_md_set_camclk(struct fimc_md *fmd, if (--camclk->use_count == 0) { clk_disable(camclk->clock); - s_info->clk_on = 0; dbg("Disabled camclk %d", pdata->clk_id); } return ret; @@ -796,8 +791,6 @@ static int __fimc_md_set_camclk(struct fimc_md *fmd, * devices to which sensors can be attached, either directly or through * the MIPI CSI receiver. The clock is allowed here to be used by * multiple sensors concurrently if they use same frequency. - * The per sensor subdev clk_on attribute helps to synchronize accesses - * to the sclk_cam clocks from the video and media device nodes. * This function should only be called when the graph mutex is held. */ int fimc_md_set_camclk(struct v4l2_subdev *sd, bool on) diff --git a/drivers/media/video/s5p-fimc/fimc-mdevice.h b/drivers/media/video/s5p-fimc/fimc-mdevice.h index 3b8a3492a176..1f5dbaff5442 100644 --- a/drivers/media/video/s5p-fimc/fimc-mdevice.h +++ b/drivers/media/video/s5p-fimc/fimc-mdevice.h @@ -47,7 +47,6 @@ struct fimc_camclk_info { * @pdata: sensor's atrributes passed as media device's platform data * @subdev: image sensor v4l2 subdev * @host: fimc device the sensor is currently linked to - * @clk_on: sclk_cam clock's state associated with this subdev * * This data structure applies to image sensor and the writeback subdevs. */ @@ -55,7 +54,6 @@ struct fimc_sensor_info { struct s5p_fimc_isp_info *pdata; struct v4l2_subdev *subdev; struct fimc_dev *host; - bool clk_on; }; /** From 316efab3e949e4fbdacb0975bf33adbad89115db Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Fri, 18 May 2012 13:31:28 -0300 Subject: [PATCH 151/612] [media] s5p-fimc: Fix fimc-lite system wide suspend procedure Only suspend the video pipeline devices if they were active before the pm.suspend() helper is called. This patch prevents the following error: /# echo mem > /sys/power/state [ 34.965000] PM: Syncing filesystems ... done. [ 35.035000] Freezing user space processes ... (elapsed 0.01 seconds) done. ... 
[ 35.105000] dpm_run_callback(): platform_pm_suspend+0x0/0x5c returns -22 [ 35.105000] PM: Device exynos-fimc-lite.1 failed to suspend: error -22 [ 35.105000] PM: Some devices failed to suspend Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-lite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/s5p-fimc/fimc-lite.c b/drivers/media/video/s5p-fimc/fimc-lite.c index bbe93e4a8731..b0a8ce61ebf2 100644 --- a/drivers/media/video/s5p-fimc/fimc-lite.c +++ b/drivers/media/video/s5p-fimc/fimc-lite.c @@ -1510,7 +1510,7 @@ static int fimc_lite_suspend(struct device *dev) return 0; ret = fimc_lite_stop_capture(fimc, suspend); - if (ret) + if (ret < 0 || !fimc_lite_active(fimc)) return ret; return fimc_pipeline_shutdown(&fimc->pipeline); From 0a198bcd511fcc60dc5daa203d221aafc95e0471 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Tue, 22 May 2012 13:50:14 -0300 Subject: [PATCH 152/612] [media] s5p-fimc: Shorten pixel formats description Shorten pixel format descriptions that exceed 32 characters so they're not being truncated when queried from user space. Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-core.c b/drivers/media/video/s5p-fimc/fimc-core.c index 65124a24c30f..987bff20cc34 100644 --- a/drivers/media/video/s5p-fimc/fimc-core.c +++ b/drivers/media/video/s5p-fimc/fimc-core.c @@ -153,7 +153,7 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contiguous 2-planar, Y/CbCr", + .name = "YUV 4:2:0 non-contig. 2p, Y/CbCr", .fourcc = V4L2_PIX_FMT_NV12M, .color = FIMC_FMT_YCBCR420, .depth = { 8, 4 }, @@ -161,7 +161,7 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 2, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contiguous 3-planar, Y/Cb/Cr", + .name = "YUV 4:2:0 non-contig. 3p, Y/Cb/Cr", .fourcc = V4L2_PIX_FMT_YUV420M, .color = FIMC_FMT_YCBCR420, .depth = { 8, 2, 2 }, @@ -169,7 +169,7 @@ static struct fimc_fmt fimc_formats[] = { .colplanes = 3, .flags = FMT_FLAGS_M2M, }, { - .name = "YUV 4:2:0 non-contiguous 2-planar, Y/CbCr, tiled", + .name = "YUV 4:2:0 non-contig. 2p, tiled", .fourcc = V4L2_PIX_FMT_NV12MT, .color = FIMC_FMT_YCBCR420, .depth = { 8, 4 }, From 8183e7a7e2a4b542bd3044c5e3b0e116b0a7d2a1 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Fri, 25 May 2012 07:04:01 -0300 Subject: [PATCH 153/612] [media] s5p-fimc: Update to the control handler lock changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 77e7c4e624404c6edb5686b3d5f873c6008ed6b0 "v4l: Allow changing control handler lock" changed the lock field of struct v4l2_ctrl_handler to a pointer and this driver wasn't updated properly. 
This patch fixes the following warning: drivers/media/video/s5p-fimc/fimc-core.c: In function ‘fimc_ctrls_activate’: drivers/media/video/s5p-fimc/fimc-core.c:644: warning: passing argument 1 of ‘mutex_lock’ from incompatible pointer type include/linux/mutex.h:152: note: expected ‘struct mutex *’ but argument is of type ‘struct mutex **’ drivers/media/video/s5p-fimc/fimc-core.c:663: warning: passing argument 1 of ‘mutex_unlock’ from incompatible pointer type include/linux/mutex.h:169: note: expected ‘struct mutex *’ but argument is of type ‘struct mutex **’ Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-core.c b/drivers/media/video/s5p-fimc/fimc-core.c index 987bff20cc34..a4646ca1d56f 100644 --- a/drivers/media/video/s5p-fimc/fimc-core.c +++ b/drivers/media/video/s5p-fimc/fimc-core.c @@ -641,7 +641,7 @@ void fimc_ctrls_activate(struct fimc_ctx *ctx, bool active) if (!ctrls->ready) return; - mutex_lock(&ctrls->handler.lock); + mutex_lock(ctrls->handler.lock); v4l2_ctrl_activate(ctrls->rotate, active); v4l2_ctrl_activate(ctrls->hflip, active); v4l2_ctrl_activate(ctrls->vflip, active); @@ -660,7 +660,7 @@ void fimc_ctrls_activate(struct fimc_ctx *ctx, bool active) ctx->hflip = 0; ctx->vflip = 0; } - mutex_unlock(&ctrls->handler.lock); + mutex_unlock(ctrls->handler.lock); } /* Update maximum value of the alpha color control */ From a60a295986edcbca6603cd42b4e38d7c83d87fb6 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 25 May 2012 07:08:05 -0300 Subject: [PATCH 154/612] [media] s5p-fimc: media_entity_pipeline_start() may fail Take into account that media_entity_pipeline_start() may fail. 
[s.nawrocki: rebased onto latest tree] Signed-off-by: Sakari Ailus Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 71e483847a17..7ace324d30cd 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -1045,8 +1045,10 @@ static int fimc_cap_streamon(struct file *file, void *priv, if (fimc_capture_active(fimc)) return -EBUSY; - media_entity_pipeline_start(&p->subdevs[IDX_SENSOR]->entity, + ret = media_entity_pipeline_start(&p->subdevs[IDX_SENSOR]->entity, p->m_pipeline); + if (ret < 0) + return ret; if (fimc->vid_cap.user_subdev_api) { ret = fimc_pipeline_validate(fimc); From a1a5861bd9ca3a6cb7ab89dd836da8cd158986b0 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Fri, 25 May 2012 03:29:38 -0300 Subject: [PATCH 155/612] [media] s5p-fimc: Fix compiler warning in fimc-lite.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch is an update for the changed media_entity_pipeline_start() signature introduced in commit af88be3887c1a0b20d0792c3c237a67c73ef3286, "media: Add link_validate() op to check links to the sink pad". It fixes the following warning: drivers/media/video/s5p-fimc/fimc-lite.c: In function ‘fimc_lite_streamon’: drivers/media/video/s5p-fimc/fimc-lite.c:765:29: warning: ignoring return value of ‘media_entity_pipeline_start’, declared with attribute warn_unused_result [-Wunused-result] Signed-off-by: Sachin Kamat Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-lite.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/video/s5p-fimc/fimc-lite.c b/drivers/media/video/s5p-fimc/fimc-lite.c index b0a8ce61ebf2..4d269b8fa1ef 100644 --- a/drivers/media/video/s5p-fimc/fimc-lite.c +++ b/drivers/media/video/s5p-fimc/fimc-lite.c @@ -764,7 +764,9 @@ static int fimc_lite_streamon(struct file *file, void *priv, if (fimc_lite_active(fimc)) return -EBUSY; - media_entity_pipeline_start(&sensor->entity, p->m_pipeline); + ret = media_entity_pipeline_start(&sensor->entity, p->m_pipeline); + if (ret < 0) + return ret; ret = fimc_pipeline_validate(fimc); if (ret) { From f676fa068819357f716953920b764554fe116b3c Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Fri, 25 May 2012 03:29:40 -0300 Subject: [PATCH 156/612] [media] s5p-fimc: Stop media entity pipeline if fimc_pipeline_validate fails Stops the media entity pipeline which was started earlier if fimc_pipeline_validate fails. 
[s.nawrocki: reworked to not exceed 80 characters line length] Signed-off-by: Sachin Kamat Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-capture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-capture.c b/drivers/media/video/s5p-fimc/fimc-capture.c index 7ace324d30cd..725812aa0c30 100644 --- a/drivers/media/video/s5p-fimc/fimc-capture.c +++ b/drivers/media/video/s5p-fimc/fimc-capture.c @@ -1040,20 +1040,22 @@ static int fimc_cap_streamon(struct file *file, void *priv, { struct fimc_dev *fimc = video_drvdata(file); struct fimc_pipeline *p = &fimc->pipeline; + struct v4l2_subdev *sd = p->subdevs[IDX_SENSOR]; int ret; if (fimc_capture_active(fimc)) return -EBUSY; - ret = media_entity_pipeline_start(&p->subdevs[IDX_SENSOR]->entity, - p->m_pipeline); + ret = media_entity_pipeline_start(&sd->entity, p->m_pipeline); if (ret < 0) return ret; if (fimc->vid_cap.user_subdev_api) { ret = fimc_pipeline_validate(fimc); - if (ret) + if (ret < 0) { + media_entity_pipeline_stop(&sd->entity); return ret; + } } return vb2_streamon(&fimc->vid_cap.vbq, type); } From 11cab711f686893f2696a061dfca30454a624784 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:12:12 -0500 Subject: [PATCH 157/612] x86/uv: Fix the UV BAU destination timeout period Correct the calculation of a destination timeout period, which is used to distinguish between a destination timeout and the situation where all the target software ack resources are full and a request is returned immediately. The problem is that integer arithmetic was overflowing, yielding a very large result. Without this fix destination timeouts are identified as resource 'plugged' events and an ipi method of resource releasing is unnecessarily employed. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131212.GA31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880afa851f..0c48d438cbba 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1811,8 +1811,8 @@ static int calculate_destination_timeout(void) index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; + ts_ns = timeout_base_ns[index]; + ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ From 26ef85770c765bb8b6b6922f8a413872dd8e3979 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:13:30 -0500 Subject: [PATCH 158/612] x86/uv: Implement UV BAU runtime enable and disable control via /proc/sgi_uv/ This patch enables the BAU to be turned on or off dynamically. echo "on" > /proc/sgi_uv/ptc_statistics echo "off" > /proc/sgi_uv/ptc_statistics The system may be booted with or without the nobau option. Whether the system currently has the BAU off can be seen in the /proc file -- normally with the baustats script. Each cpu will have a 1 in the bauoff field if the BAU was turned off, so baustats will give a count of cpus that have it off. 
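As a quick illustration of the interface described above, here is a minimal user-space sketch (an assumed example, not the SGI baustats script; it presumes the per-cpu lines of the /proc file begin with "cpu <n> <bauoff>", matching the ptc_seq_show() format in the diff below):

#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sgi_uv/ptc_statistics";
	char line[1024];
	int cpu, bauoff, off_count = 0;
	FILE *f;

	if (argc > 1) {			/* argv[1] is "on" or "off" */
		f = fopen(path, "w");
		if (!f)
			return 1;
		fprintf(f, "%s\n", argv[1]);
		fclose(f);
	}

	f = fopen(path, "r");
	if (!f)
		return 1;
	/* the header line starts with '#', so only per-cpu lines match */
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "cpu %d %d", &cpu, &bauoff) == 2 && bauoff)
			off_count++;
	fclose(f);
	printf("%d cpus have the BAU off\n", off_count);
	return 0;
}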
Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131330.GB31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 2 + arch/x86/platform/uv/tlb_uv.c | 76 +++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6149b476d9df..847c00b721b2 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -520,6 +520,7 @@ struct ptc_stats { unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ + unsigned long s_enters; /* entries to the driver */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -586,6 +587,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; + short nobau; int baudisabled; int set_bau_off; short cpu; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 0c48d438cbba..1492170cbb5a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -38,6 +38,7 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; +static int nobau_perm; static int baudisabled; static spinlock_t disable_lock; static cycles_t congested_cycles; @@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ + int cpu; + struct bau_control *bcp; + + if (nobau_perm) { + pr_info("BAU not initialized; cannot be turned on\n"); + return; + } + nobau = 0; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 0; + } + pr_info("BAU turned on\n"); + return; +} + +static void +set_bau_off(void) +{ + int cpu; + struct bau_control *bcp; + + nobau = 1; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 1; + } + pr_info("BAU turned off\n"); + return; +} + /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. 
@@ -1079,12 +1114,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; - bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; + stat->s_enters++; + + if (bcp->nobau) + return cpumask; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -1338,29 +1373,32 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec) static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; + struct bau_control *bcp; int cpu; cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); seq_printf(file, - "resetp resett giveup sto bz throt swack recv rtime "); + "resetp resett giveup sto bz throt enters swack recv rtime "); seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, bcp->nobau, stat->s_requestor, + cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, @@ -1369,11 +1407,11 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, stat->s_dtimeout, stat->s_strongnacks); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); + stat->s_busy, stat->s_throttles, stat->s_enters); /* destination side statistics */ seq_printf(file, @@ -1438,6 +1476,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return -EFAULT; optstr[count - 1] = '\0'; + if (!strcmp(optstr, "on")) { + set_bau_on(); + return count; + } else if (!strcmp(optstr, "off")) { + set_bau_off(); + return count; + } + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; @@ -1836,6 +1882,8 @@ static void __init init_per_cpu_tunables(void) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + if (nobau) + bcp->nobau = 1; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -2069,9 +2117,6 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; - if (nobau) - return 0; - for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); @@ -2091,7 +2136,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - nobau = 1; + set_bau_off(); + nobau_perm = 1; return 0; } From 8b6e511e51f7e540c8e71022318ee4cc9a4567a7 Mon Sep 17 00:00:00 
2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:14:59 -0500 Subject: [PATCH 159/612] x86/uv: Work around UV2 BAU hangs On SGI's UV2 the BAU (Broadcast Assist Unit) driver can hang under a heavy load. To cure this: - Disable the UV2 extended status mode (see UV2_EXT_SHFT), as this mode changes BAU behavior in more ways than just delivering an extra bit of status. Revert status to just two meaningful bits, like UV1. - Use no IPI-style resets on UV2. Just give up the request for whatever reason it failed and let it be accomplished with the legacy IPI method. - Use no alternate sending descriptor (the former UV2 workaround bcp->using_desc and handle_uv2_busy() stuff). Just disable the use of the BAU for a period of time in favor of the legacy IPI method when the h/w bug leaves a descriptor busy. -- new tunable: giveup_limit determines the threshold at which a hub is so plugged that it should do all requests with the legacy IPI method for a period of time -- generalize disable_for_congestion() (renamed disable_for_period()) for use whenever a hub should avoid using the BAU for a period of time Also: - Fix find_another_by_swack(), which is part of the UV2 bug workaround - Correct and clarify the statistics (new stats s_overipilimit, s_giveuplimit, s_enters, s_ipifordisabled, s_plugged, s_congested) Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131459.GC31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 28 ++- arch/x86/platform/uv/tlb_uv.c | 385 +++++++++++++++---------------- 2 files changed, 199 insertions(+), 214 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 847c00b721b2..a06983cdc125 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -140,6 +140,9 @@ #define IPI_RESET_LIMIT 1 /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +/* after this # of giveups (fall back to kernel IPI's) disable the use of + the BAU for a period of time */ +#define GIVEUP_LIMIT 100 #define UV_LB_SUBNODEID 0x10 @@ -166,7 +169,6 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 -#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed */ #define CONGESTED_RESPONSE_US 1000 /* 'slow' response in microseconds */ #define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ -#define CONGESTED_PERIOD 30 /* time for the bau to be +#define DISABLED_PERIOD 10 /* time for the bau to be disabled, in seconds */ /* see msg_type: */ #define MSG_NOOP 0 @@ -520,7 +522,12 @@ struct ptc_stats { unsigned long s_uv2_wars; /* uv2 workaround, perm. 
busy */ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ - unsigned long s_enters; /* entries to the driver */ + unsigned long s_overipilimit; /* over the ipi reset limit */ + unsigned long s_giveuplimit; /* disables, over giveup limit*/ + unsigned long s_enters; /* entries to the driver */ + unsigned long s_ipifordisabled; /* fall back to IPI; disabled */ + unsigned long s_plugged; /* plugged by h/w bug*/ + unsigned long s_congested; /* giveup on long wait */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -588,8 +595,7 @@ struct bau_control { int ipi_attempts; int conseccompletes; short nobau; - int baudisabled; - int set_bau_off; + short baudisabled; short cpu; short osnode; short uvhub_cpu; @@ -598,14 +604,16 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; - short using_desc; /* an index, like uvhub_cpu */ - unsigned int inuse_map; + short busy; /* all were busy (war) */ unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; cycles_t send_message; + cycles_t period_end; + cycles_t period_time; spinlock_t uvhub_lock; spinlock_t queue_lock; + spinlock_t disable_lock; /* tunables */ int max_concurr; int max_concurr_const; @@ -616,9 +624,9 @@ struct bau_control { int complete_threshold; int cong_response_us; int cong_reps; - int cong_period; - unsigned long clocks_per_100_usec; - cycles_t period_time; + cycles_t disabled_period; + int period_giveups; + int giveup_limit; long period_requests; struct hub_and_pnode *thp; }; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 1492170cbb5a..71b5d5a07d7b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2011 Cliff Wickman , SGI. + * (c) 2008-2012 Cliff Wickman , SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -39,8 +39,6 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; static int nobau_perm; -static int baudisabled; -static spinlock_t disable_lock; static cycles_t congested_cycles; /* tunables: */ @@ -48,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT; static int max_concurr_const = MAX_BAU_CONCURRENT; static int plugged_delay = PLUGGED_DELAY; static int plugsb4reset = PLUGSB4RESET; +static int giveup_limit = GIVEUP_LIMIT; static int timeoutsb4reset = TIMEOUTSB4RESET; static int ipi_reset_limit = IPI_RESET_LIMIT; static int complete_threshold = COMPLETE_THRESHOLD; static int congested_respns_us = CONGESTED_RESPONSE_US; static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int disabled_period = DISABLED_PERIOD; static struct tunables tunables[] = { {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ @@ -64,7 +63,8 @@ static struct tunables tunables[] = { {&complete_threshold, COMPLETE_THRESHOLD}, {&congested_respns_us, CONGESTED_RESPONSE_US}, {&congested_reps, CONGESTED_REPS}, - {&congested_period, CONGESTED_PERIOD} + {&disabled_period, DISABLED_PERIOD}, + {&giveup_limit, GIVEUP_LIMIT} }; static struct dentry *tunables_dir; @@ -313,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * Both sockets dump their completed count total into * the message's count. 
*/ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + *sp = 0; asp = (struct atomic_short *)&msg->acknowledge_count; msg_ack_count = atom_asr(socket_ack_count, asp); @@ -526,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, } /* - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used. */ static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; - unsigned long descriptor_status2; - descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; - descriptor_status = (descriptor_status << 1) | descriptor_status2; + descriptor_status = + ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; return descriptor_status; } @@ -566,87 +565,11 @@ int normal_busy(struct bau_control *bcp) */ int handle_uv2_busy(struct bau_control *bcp) { - int busy_one = bcp->using_desc; - int normal = bcp->uvhub_cpu; - int selected = -1; - int i; - unsigned long descriptor_status; - unsigned long status; - int mmr_offset; - struct bau_desc *bau_desc_old; - struct bau_desc *bau_desc_new; - struct bau_control *hmaster = bcp->uvhub_master; struct ptc_stats *stat = bcp->statp; - cycles_t ttm; stat->s_uv2_wars++; - spin_lock(&hmaster->uvhub_lock); - /* try for the original first */ - if (busy_one != normal) { - if (!normal_busy(bcp)) - selected = normal; - } - if (selected < 0) { - /* can't use the normal, select an alternate */ - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - descriptor_status = read_lmmr(mmr_offset); - - /* scan available descriptors 32-63 */ - for (i = 0; i < UV_CPUS_PER_AS; i++) { - if ((hmaster->inuse_map & (1 << i)) == 0) { - status = ((descriptor_status >> - (i * UV_ACT_STATUS_SIZE)) & - UV_ACT_STATUS_MASK) << 1; - if (status != UV2H_DESC_BUSY) { - selected = i + UV_CPUS_PER_AS; - break; - } - } - } - } - - if (busy_one != normal) - /* mark the busy alternate as not in-use */ - hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); - - if (selected >= 0) { - /* switch to the selected descriptor */ - if (selected != normal) { - /* set the selected alternate as in-use */ - hmaster->inuse_map |= - (1 << (selected - UV_CPUS_PER_AS)); - if (selected > stat->s_uv2_wars_hw) - stat->s_uv2_wars_hw = selected; - } - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * busy_one); - bcp->using_desc = selected; - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * selected); - *bau_desc_new = *bau_desc_old; - } else { - /* - * All are busy. Wait for the normal one for this cpu to - * free up. 
- */ - stat->s_uv2_war_waits++; - spin_unlock(&hmaster->uvhub_lock); - ttm = get_cycles(); - do { - cpu_relax(); - } while (normal_busy(bcp)); - spin_lock(&hmaster->uvhub_lock); - /* switch to the original descriptor */ - bcp->using_desc = normal; - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); - bcp->using_desc = (ITEMS_PER_DESC * normal); - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * normal); - *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ - } - spin_unlock(&hmaster->uvhub_lock); - return FLUSH_RETRY_BUSYBUG; + bcp->busy = 1; + return FLUSH_GIVEUP; } static int uv2_wait_completion(struct bau_desc *bau_desc, @@ -655,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, { unsigned long descriptor_stat; cycles_t ttm; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; long busy_reps = 0; struct ptc_stats *stat = bcp->statp; @@ -663,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { - /* - * Our software ack messages may be blocked because - * there are no swack resources available. As long - * as none of them has timed out hardware will NACK - * our message and its state will stay IDLE. - */ - if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { + /* + * A h/w bug on the destination side may + * have prevented the message being marked + * pending, thus it doesn't get replied to + * and gets continually nacked until it times + * out with a SOURCE_TIMEOUT. + */ stat->s_stimeout++; return FLUSH_GIVEUP; - } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { - stat->s_strongnacks++; - bcp->conseccompletes = 0; - return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + ttm = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + * Without using the extended status we have to + * deduce from the short time that this was a + * strong nack. + */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + stat->s_plugged++; + /* FLUSH_RETRY_PLUGGED causes hang on boot */ + return FLUSH_GIVEUP; + } stat->s_dtimeout++; bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; + /* FLUSH_RETRY_TIMEOUT causes hang on boot */ + return FLUSH_GIVEUP; } else { busy_reps++; if (busy_reps > 1000000) { @@ -688,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, busy_reps = 0; ttm = get_cycles(); if ((ttm - bcp->send_message) > - (bcp->clocks_per_100_usec)) { + bcp->timeout_interval) return handle_uv2_busy(bcp); - } } /* * descriptor_stat is still BUSY @@ -714,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -793,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc, } /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable. 
*/ -static void disable_for_congestion(struct bau_control *bcp, - struct ptc_stats *stat) +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) { - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); + int tcpu; + struct bau_control *tbcp; + struct bau_control *hmaster; + cycles_t tm1; - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - int tcpu; - struct bau_control *tbcp; - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles(); - bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (!bcp->baudisabled) { stat->s_bau_disabled++; + tm1 = get_cycles(); for_each_present_cpu(tcpu) { tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; + if (tbcp->uvhub_master == hmaster) { + tbcp->baudisabled = 1; + tbcp->set_bau_on_time = + tm1 + bcp->disabled_period; + } } } - - spin_unlock(&disable_lock); + spin_unlock(&hmaster->disable_lock); } static void count_max_concurr(int stat, struct bau_control *bcp, @@ -850,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2, bcp->period_requests++; bcp->period_time += elapsed; if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->cong_reps)) - disable_for_congestion(bcp, stat); + (bcp->period_requests > bcp->cong_reps) && + ((bcp->period_time / bcp->period_requests) > + congested_cycles)) { + stat->s_congested++; + disable_for_period(bcp, stat); + } } } else stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) + else if (completion_status == FLUSH_GIVEUP) { stat->s_giveup++; + if (get_cycles() > bcp->period_end) + bcp->period_giveups = 0; + bcp->period_giveups++; + if (bcp->period_giveups == 1) + bcp->period_end = get_cycles() + bcp->disabled_period; + if (bcp->period_giveups > bcp->giveup_limit) { + disable_for_period(bcp, stat); + stat->s_giveuplimit++; + } + } } /* @@ -903,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. 
*/ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -916,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; - struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) + if (bcp->uvhub_version == 1) { + uv1 = 1; uv1_throttle(hmaster, stat); + } while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); + if (uv1) + uv1_hdr = &bau_desc->header.uv1_hdr; + else + uv2_hdr = &bau_desc->header.uv2_hdr; + do { - bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); - if (bcp->uvhub_version == 1) { - uv1 = 1; - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; - if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -951,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; + index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); - /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; + stat->s_overipilimit++; completion_stat = FLUSH_GIVEUP; break; } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || - (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -990,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) } /* - * The BAU is disabled. When the disabled time period has expired, the cpu - * that disabled it must re-enable it. - * Return 0 if it is re-enabled for all cpus. + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub. 
*/ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) { int tcpu; struct bau_control *tbcp; + struct bau_control *hmaster; - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { + stat->s_bau_reenabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + if (tbcp->uvhub_master == hmaster) { tbcp->baudisabled = 0; tbcp->period_requests = 0; tbcp->period_time = 0; + tbcp->period_giveups = 0; } - return 0; } + spin_unlock(&hmaster->disable_lock); + return 0; } + spin_unlock(&hmaster->disable_lock); return -1; } @@ -1113,6 +1065,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + unsigned long descriptor_status; + unsigned long status; bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; @@ -1121,10 +1075,22 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, if (bcp->nobau) return cpumask; + if (bcp->busy) { + descriptor_status = + read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); + status = ((descriptor_status >> (bcp->uvhub_cpu * + UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; + if (status == UV2H_DESC_BUSY) + return cpumask; + bcp->busy = 0; + } + /* bau was disabled due to slow response */ if (bcp->baudisabled) { - if (check_enable(bcp, stat)) + if (check_enable(bcp, stat)) { + stat->s_ipifordisabled++; return cpumask; + } } /* @@ -1140,7 +1106,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -1153,25 +1119,27 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc)) return NULL; else return cpumask; } /* - * Search the message queue for any 'other' message with the same software - * acknowledge resource bit vector. + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message. */ struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp, unsigned char swack_vec) + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; + unsigned char swack_vec = msg->swack_vec; if (msg_next > bcp->queue_last) msg_next = bcp->queue_first; - while ((msg_next->swack_vec != 0) && (msg_next != msg)) { - if (msg_next->swack_vec == swack_vec) + while (msg_next != msg) { + if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && + (msg_next->swack_vec == swack_vec)) return msg_next; msg_next++; if (msg_next > bcp->queue_last) @@ -1200,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * This message was assigned a swack resource, but no * reserved acknowlegment is pending. * The bug has prevented this message from setting the MMR. 
- * And no other message has used the same sw_ack resource. - * Do the requested shootdown but do not reply to the msg. - * (the 0 means make no acknowledge) */ - bau_process_message(mdp, bcp, 0); - return; - } - - /* - * Some message has set the MMR 'pending' bit; it might have been - * another message. Look for that message. - */ - other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); - if (other_msg) { - /* There is another. Do not ack the current one. */ - bau_process_message(mdp, bcp, 0); /* - * Let the natural processing of that message acknowledge - * it. Don't get the processing of sw_ack's out of order. + * Some message has set the MMR 'pending' bit; it might have + * been another message. Look for that message. */ - return; + other_msg = find_another_by_swack(msg, bcp); + if (other_msg) { + /* + * There is another. Process this one but do not + * ack it. + */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that other message + * acknowledge it. Don't get the processing of sw_ack's + * out of order. + */ + return; + } } /* - * There is no other message using this sw_ack, so it is safe to - * acknowledge it. + * Either the MMR shows this one pending a reply or there is no + * other message using this sw_ack, so it is safe to acknowledge it. */ bau_process_message(mdp, bcp, 1); @@ -1330,7 +1296,8 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_EXT_SHFT); + /* hw bug workaround; do not use extended status */ + mmr_image &= ~(1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1379,24 +1346,26 @@ static int ptc_seq_show(struct seq_file *file, void *data) cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); seq_printf(file, - "resetp resett giveup sto bz throt enters swack recv rtime "); + "rok resetp resett giveup sto bz throt disable "); seq_printf(file, - "all one mult none retry canc nocan reset rcan "); + "enable wars warshw warwaits enters ipidis plugged "); seq_printf(file, - "disable enable wars warshw warwaits\n"); + "ipiover glim cong swack recv rtime all one mult "); + seq_printf(file, + "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, bcp->nobau, stat->s_requestor, cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, @@ -1407,25 +1376,28 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, stat->s_dtimeout, stat->s_strongnacks); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles, stat->s_enters); + stat->s_busy, stat->s_throttles); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_bau_disabled, 
stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits, stat->s_enters, + stat->s_ipifordisabled, stat->s_plugged, + stat->s_overipilimit, stat->s_giveuplimit, + stat->s_congested); /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld %ld %ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled, - stat->s_uv2_wars, stat->s_uv2_wars_hw, - stat->s_uv2_war_waits); } return 0; } @@ -1439,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, char *buf; int ret; - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_concur plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", + "max_concur plugged_delay plugsb4reset timeoutsb4reset", + "ipi_reset_limit complete_threshold congested_response_us", + "congested_reps disabled_period giveup_limit", max_concurr, plugged_delay, plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_respns_us, congested_reps, congested_period); + congested_respns_us, congested_reps, disabled_period, + giveup_limit); if (!buf) return -ENOMEM; @@ -1616,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user, bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; } return count; } @@ -1745,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * fairness chaining multilevel count replied_to */ } else { + /* + * BIOS uses legacy mode, but UV2 hardware always + * uses native mode for selective broadcasts. 
+ */ uv2_hdr = &bd2->header.uv2_hdr; uv2_hdr->swack_flag = 1; uv2_hdr->base_dest_nasid = @@ -1896,10 +1874,11 @@ static void __init init_per_cpu_tunables(void) bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; - bcp->clocks_per_100_usec = usec_2_cycles(100); + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; spin_lock_init(&bcp->queue_lock); spin_lock_init(&bcp->uvhub_lock); + spin_lock_init(&bcp->disable_lock); } } @@ -2020,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; - bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -2123,7 +2101,6 @@ static int __init uv_bau_init(void) } nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); uv_base_pnode = 0x7fffffff; From 35a468732289372aab506d7cf73f98f0379888ae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 21 Jun 2012 17:52:52 +0900 Subject: [PATCH 160/612] perf evsel: Fix a build failure on cross compilation The commit c410431cefefd ("perf tools: Reconstruct event with modifiers from perf_event_attr") added the line, but it's broken since it needs to go up 3 directories to get to the kernel root directory, not 2. However, host gcc contains /usr/local/include in its search path, so that it can find the perf_event.h in /usr/include. This is why we didn't notice the problem yet. But when I tried to cross compile, it appears like: CC util/evsel.o util/evsel.c:18:44: error: ../../include/linux/perf_event.h: No such file or directory make: *** [util/evsel.o] Error 1 Looking at the source, it isn't needed at all as evsel.h already includes perf_event.h. So simply removing it solves the problem. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340268772-5737-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 876f639d69ed..3d1f6968f175 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -15,7 +15,6 @@ #include "cpumap.h" #include "thread_map.h" #include "target.h" -#include "../../include/linux/perf_event.h" #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define GROUP_FD(group_fd, cpu) (*(int *)xyarray__entry(group_fd, cpu, 0)) From 7a25b2d32b9cb0b813d56ee6109acf90f3c9f1e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 21 Jun 2012 12:25:16 +0200 Subject: [PATCH 161/612] perf test: Fix parse events test to follow proper raw event name The following commit changed raw event names to carry the event modifier: perf evsel: Reconstruct raw event with modifiers from perf_event_attr commit 6eef3d9c2bcf52b7a3c18e609f5838c007b989a4 Author: Arnaldo Carvalho de Melo The perf_evsel__name function now returns a ':mod' suffix for raw events, so we need to follow that in the current tests. All tests pass now for the 'perf test parse' suite. 
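As a hedged illustration of the naming scheme the updated test expects (a standalone sketch, not the actual perf_evsel__name() implementation; the modifier handling is reduced to the single ':u' case the test exercises):

#include <stdio.h>

int main(void)
{
	unsigned long long config = 2;			/* cpu/config=2/u */
	int exclude_kernel = 1, exclude_hv = 1;		/* user space only */
	char name[64];
	int n;

	n = snprintf(name, sizeof(name), "raw 0x%llx", config);
	if (exclude_kernel && exclude_hv)		/* append the modifier */
		snprintf(name + n, sizeof(name) - n, ":u");
	printf("%s\n", name);				/* prints "raw 0x2:u" */
	return 0;
}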
Signed-off-by: Jiri Olsa Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340274316-5161-1-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-test.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index 229af6da33a2..a0f61a2a6835 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -413,19 +413,20 @@ static int test__checkevent_pmu_name(struct perf_evlist *evlist) { struct perf_evsel *evsel; - /* cpu/config=1,name=krava1/u */ + /* cpu/config=1,name=krava/u */ evsel = list_entry(evlist->entries.next, struct perf_evsel, node); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config); TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "krava")); - /* cpu/config=2/" */ + /* cpu/config=2/u" */ evsel = list_entry(evsel->node.next, struct perf_evsel, node); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); TEST_ASSERT_VAL("wrong config", 2 == evsel->attr.config); - TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "raw 0x2")); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "raw 0x2:u")); return 0; } From 4069d6f86bebce1a1e3456ef721511b4b81958f8 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Sat, 9 Jun 2012 13:18:08 +0900 Subject: [PATCH 162/612] sony-laptop: use an enum for SNC event types Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 210d4ae547c2..615ab06b6cd3 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -1172,6 +1172,10 @@ static int sony_nc_hotkeys_decode(u32 event, unsigned int handle) /* * ACPI callbacks */ +enum event_types { + HOTKEY = 1, + KILLSWITCH +}; static void sony_nc_notify(struct acpi_device *device, u32 event) { u32 real_ev = event; @@ -1196,7 +1200,7 @@ static void sony_nc_notify(struct acpi_device *device, u32 event) /* hotkey event */ case 0x0100: case 0x0127: - ev_type = 1; + ev_type = HOTKEY; real_ev = sony_nc_hotkeys_decode(event, handle); if (real_ev > 0) @@ -1216,7 +1220,7 @@ static void sony_nc_notify(struct acpi_device *device, u32 event) * update the rfkill device status when the * switch is moved. */ - ev_type = 2; + ev_type = KILLSWITCH; sony_call_snc_handle(handle, 0x0100, &result); real_ev = result & 0x03; @@ -1238,7 +1242,7 @@ static void sony_nc_notify(struct acpi_device *device, u32 event) } else { /* old style event */ - ev_type = 1; + ev_type = HOTKEY; sony_laptop_report_input_event(real_ev); } From bb384b5295323ed58260aeaff22d8bbe32988396 Mon Sep 17 00:00:00 2001 From: Marco Chiappero Date: Sat, 9 Jun 2012 13:18:09 +0900 Subject: [PATCH 163/612] sony-laptop: notify userspace of GFX switch position changes Some Vaios come with both integrated and discrete graphics, plus a switch for choosing one of the two. When the switch position is changed, a notification is generated. 
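As a hedged sketch of how a consumer might decode the value this event reports (the helper and names below are invented for the example; the 0/1 meaning follows the comment added in the handler below):

/* illustrative decode of the low byte carried by the GFX switch event */
enum gfx_position { GFX_INTEGRATED = 0, GFX_DISCRETE = 1 };

static const char *gfx_position_name(unsigned int event_value)
{
	switch (event_value & 0xff) {
	case GFX_DISCRETE:
		return "discrete";
	case GFX_INTEGRATED:
		return "integrated";
	default:
		return "unknown";
	}
}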
Signed-off-by: Marco Chiappero Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 615ab06b6cd3..4f42e5661bf5 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -1174,7 +1174,8 @@ static int sony_nc_hotkeys_decode(u32 event, unsigned int handle) */ enum event_types { HOTKEY = 1, - KILLSWITCH + KILLSWITCH, + GFX_SWITCH }; static void sony_nc_notify(struct acpi_device *device, u32 event) { @@ -1230,6 +1231,24 @@ static void sony_nc_notify(struct acpi_device *device, u32 event) break; + case 0x0128: + case 0x0146: + /* Hybrid GFX switching */ + sony_call_snc_handle(handle, 0x0000, &result); + dprintk("GFX switch event received (reason: %s)\n", + (result & 0x01) ? + "switch change" : "unknown"); + + /* verify the switch state + * 1: discrete GFX + * 0: integrated GFX + */ + sony_call_snc_handle(handle, 0x0100, &result); + + ev_type = GFX_SWITCH; + real_ev = result & 0xff; + break; + default: dprintk("Unknown event 0x%x for handle 0x%x\n", event, handle); From 15aa5c75468a103cdee1a0e0ec26aad979bf71a5 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Mon, 11 Jun 2012 07:18:31 +0900 Subject: [PATCH 164/612] sony-laptop: store battery care limits on batteries Some models offer the option to store the limits on the battery (firmware?). Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 41 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 4f42e5661bf5..f045e3e59cd6 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -1916,32 +1916,33 @@ static ssize_t sony_nc_battery_care_limit_store(struct device *dev, * bits 4,5: store the limit into the EC * bits 6,7: store the limit into the battery */ + cmd = 0; - /* - * handle 0x0115 should allow storing on battery too; - * handle 0x0136 same as 0x0115 + health status; - * handle 0x013f, same as 0x0136 but no storing on the battery - * - * Store only inside the EC for now, regardless the handle number - */ - if (value == 0) - /* disable limits */ - cmd = 0x0; + if (value > 0) { + if (value <= 50) + cmd = 0x20; - else if (value <= 50) - cmd = 0x21; + else if (value <= 80) + cmd = 0x10; - else if (value <= 80) - cmd = 0x11; + else if (value <= 100) + cmd = 0x30; - else if (value <= 100) - cmd = 0x31; + else + return -EINVAL; - else - return -EINVAL; + /* + * handle 0x0115 should allow storing on battery too; + * handle 0x0136 same as 0x0115 + health status; + * handle 0x013f, same as 0x0136 but no storing on the battery + */ + if (bcare_ctl->handle != 0x013f) + cmd = cmd | (cmd << 2); - if (sony_call_snc_handle(bcare_ctl->handle, (cmd << 0x10) | 0x0100, - &result)) + cmd = (cmd | 0x1) << 0x10; + } + + if (sony_call_snc_handle(bcare_ctl->handle, cmd | 0x0100, &result)) return -EIO; return count; From 014fc8fbece33d42e2aa92d289fffa213a159321 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Sat, 9 Jun 2012 13:18:11 +0900 Subject: [PATCH 165/612] sony-laptop: add lid backlight support for handle 0x143 Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git 
a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index f045e3e59cd6..a94e13fe1f42 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -1010,6 +1010,7 @@ static ssize_t sony_nc_sysfs_store(struct device *dev, struct sony_backlight_props { struct backlight_device *dev; int handle; + int cmd_base; u8 offset; u8 maxlvl; }; @@ -1037,7 +1038,7 @@ static int sony_nc_get_brightness_ng(struct backlight_device *bd) struct sony_backlight_props *sdev = (struct sony_backlight_props *)bl_get_data(bd); - sony_call_snc_handle(sdev->handle, 0x0200, &result); + sony_call_snc_handle(sdev->handle, sdev->cmd_base + 0x100, &result); return (result & 0xff) - sdev->offset; } @@ -1049,7 +1050,8 @@ static int sony_nc_update_status_ng(struct backlight_device *bd) (struct sony_backlight_props *)bl_get_data(bd); value = bd->props.brightness + sdev->offset; - if (sony_call_snc_handle(sdev->handle, 0x0100 | (value << 16), &result)) + if (sony_call_snc_handle(sdev->handle, sdev->cmd_base | (value << 0x10), + &result)) return -EIO; return value; @@ -2496,6 +2498,7 @@ static void sony_nc_backlight_ng_read_limits(int handle, { u64 offset; int i; + int lvl_table_len = 0; u8 min = 0xff, max = 0x00; unsigned char buffer[32] = { 0 }; @@ -2515,11 +2518,21 @@ if (i < 0) return; + switch (handle) { + case 0x012f: + case 0x0137: + lvl_table_len = 9; + break; + case 0x143: + lvl_table_len = 16; + break; + } + /* the buffer lists brightness levels available, brightness levels are * from position 0 to 8 in the array, other values are used by ALS * control. */ - for (i = 0; i < 9 && i < ARRAY_SIZE(buffer); i++) { + for (i = 0; i < lvl_table_len && i < ARRAY_SIZE(buffer); i++) { dprintk("Brightness level: %d\n", buffer[i]); @@ -2546,14 +2559,22 @@ static void sony_nc_backlight_setup(void) if (sony_find_snc_handle(0x12f) != -1) { ops = &sony_backlight_ng_ops; + sony_bl_props.cmd_base = 0x0100; sony_nc_backlight_ng_read_limits(0x12f, &sony_bl_props); max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; } else if (sony_find_snc_handle(0x137) != -1) { ops = &sony_backlight_ng_ops; + sony_bl_props.cmd_base = 0x0100; sony_nc_backlight_ng_read_limits(0x137, &sony_bl_props); max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; + } else if (sony_find_snc_handle(0x143) != -1) { + ops = &sony_backlight_ng_ops; + sony_bl_props.cmd_base = 0x3000; + sony_nc_backlight_ng_read_limits(0x143, &sony_bl_props); + max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; + } else if (ACPI_SUCCESS(acpi_get_handle(sony_nc_acpi_handle, "GBRT", &unused))) { ops = &sony_backlight_ops; From ca3c2c706de39b3400e57254dce054bf7350efa2 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Sat, 9 Jun 2012 13:18:12 +0900 Subject: [PATCH 166/612] sony-laptop: input initialization should be done before SNC SNC needs input devices, so better have those ready before starting to handle events. 
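As a hedged, generic sketch of the ordering idiom this change restores (the stub functions are invented for the example and are not the driver's code): initialize the dependency first, and unwind in reverse order on failure.

#include <stdio.h>

static int setup_input(void) { return 0; }	/* stub: must come first */
static int setup_snc(void) { return -1; }	/* stub: pretend SNC fails */
static void cleanup_input(void) { printf("input removed\n"); }

static int example_probe(void)
{
	int err;

	err = setup_input();		/* event handling depends on this */
	if (err)
		return err;
	err = setup_snc();
	if (err)
		goto out_input;
	return 0;

out_input:				/* unwind in reverse order */
	cleanup_input();
	return err;
}

int main(void)
{
	return example_probe() ? 1 : 0;
}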
Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index a94e13fe1f42..89ff6d845f34 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2642,6 +2642,12 @@ static int sony_nc_add(struct acpi_device *device) } } + result = sony_laptop_setup_input(device); + if (result) { + pr_err("Unable to create input devices\n"); + goto outplatform; + } + if (ACPI_SUCCESS(acpi_get_handle(sony_nc_acpi_handle, "ECON", &handle))) { int arg = 1; @@ -2659,12 +2665,6 @@ static int sony_nc_add(struct acpi_device *device) } /* setup input devices and helper fifo */ - result = sony_laptop_setup_input(device); - if (result) { - pr_err("Unable to create input devices\n"); - goto outsnc; - } - if (acpi_video_backlight_support()) { pr_info("brightness ignored, must be controlled by ACPI video driver\n"); } else { @@ -2712,22 +2712,21 @@ static int sony_nc_add(struct acpi_device *device) return 0; - out_sysfs: +out_sysfs: for (item = sony_nc_values; item->name; ++item) { device_remove_file(&sony_pf_device->dev, &item->devattr); } sony_nc_backlight_cleanup(); - - sony_laptop_remove_input(); - - outsnc: sony_nc_function_cleanup(sony_pf_device); sony_nc_handles_cleanup(sony_pf_device); - outpresent: +outplatform: + sony_laptop_remove_input(); + +outpresent: sony_pf_remove(); - outwalk: +outwalk: sony_nc_rfkill_cleanup(); return result; } From c7a2918373983b32db3ca35823d930641747e26f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Jun 2012 13:18:13 +0900 Subject: [PATCH 167/612] sony-laptop: fix sony_nc_sysfs_store() We made this an unsigned long and it causes a bug on 64 bit big endian systems when we try to pass the value to sony_nc_int_call(). Also value has to be signed because validate() returns negative error codes. Signed-off-by: Dan Carpenter Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 89ff6d845f34..78e6389d2cae 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -973,7 +973,7 @@ static ssize_t sony_nc_sysfs_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - unsigned long value = 0; + int value; int ret = 0; struct sony_nc_value *item = container_of(attr, struct sony_nc_value, devattr); @@ -984,7 +984,7 @@ static ssize_t sony_nc_sysfs_store(struct device *dev, if (count > 31) return -EINVAL; - if (kstrtoul(buffer, 10, &value)) + if (kstrtoint(buffer, 10, &value)) return -EINVAL; if (item->validate) @@ -994,7 +994,7 @@ static ssize_t sony_nc_sysfs_store(struct device *dev, return value; ret = sony_nc_int_call(sony_nc_acpi_handle, *item->acpiset, - (int *)&value, NULL); + &value, NULL); if (ret < 0) return -EIO; From 56f4a9f76d8ce7c7cef92905c909aad0c7e5c9db Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Jun 2012 13:18:14 +0900 Subject: [PATCH 168/612] sony-laptop: fix a couple signedness bugs This needs to be signed to handle negative error codes. Remove a redundant check, read_limits is always called with a valid handle. 
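The failure mode, reduced to a minimal sketch (illustrative only, mirroring the sony_nc_thermal_mode_show() hunk below): with an unsigned type the error check is dead code, because the comparison can never be true.

	unsigned int mode = sony_nc_thermal_mode_get();	/* may return -EIO */
	if (mode < 0)		/* always false for an unsigned type */
		return mode;	/* the error path is never taken */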
Signed-off-by: Dan Carpenter Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 78e6389d2cae..b9499b65c1cd 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2139,7 +2139,7 @@ static ssize_t sony_nc_thermal_mode_show(struct device *dev, struct device_attribute *attr, char *buffer) { ssize_t count = 0; - unsigned int mode = sony_nc_thermal_mode_get(); + int mode = sony_nc_thermal_mode_get(); if (mode < 0) return mode; @@ -2507,8 +2507,6 @@ static void sony_nc_backlight_ng_read_limits(int handle, props->maxlvl = 0xff; offset = sony_find_snc_handle(handle); - if (offset < 0) - return; /* try to read the boundaries from ACPI tables, if we fail the above * defaults should be reasonable From a1071a5abf1f4d4a7f8324e21b8b32b1342013c7 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Thu, 14 Jun 2012 06:36:02 +0900 Subject: [PATCH 169/612] sony-laptop: correct find_snc_handle failure checks Since bab7084c745bf4d75b760728387f375fd34dc683, find_snc_handle returns -EINVAL, not -1. Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index b9499b65c1cd..d456ff0c73b7 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2555,19 +2555,19 @@ static void sony_nc_backlight_setup(void) const struct backlight_ops *ops = NULL; struct backlight_properties props; - if (sony_find_snc_handle(0x12f) != -1) { + if (sony_find_snc_handle(0x12f) >= 0) { ops = &sony_backlight_ng_ops; sony_bl_props.cmd_base = 0x0100; sony_nc_backlight_ng_read_limits(0x12f, &sony_bl_props); max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; - } else if (sony_find_snc_handle(0x137) != -1) { + } else if (sony_find_snc_handle(0x137) >= 0) { ops = &sony_backlight_ng_ops; + sony_bl_props.cmd_base = 0x0100; sony_nc_backlight_ng_read_limits(0x137, &sony_bl_props); max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; - } else if (sony_find_snc_handle(0x143) != -1) { + } else if (sony_find_snc_handle(0x143) >= 0) { ops = &sony_backlight_ng_ops; sony_bl_props.cmd_base = 0x3000; sony_nc_backlight_ng_read_limits(0x143, &sony_bl_props); From 57f9616b79549e772cf4dd3aa1d2df5b6c8acdfa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Jun 2012 19:28:50 +0300 Subject: [PATCH 170/612] ideapad: uninitialized data in ideapad_acpi_add() Only part of "cfg" gets initialized: read_method_int() stores a 32-bit int, while "cfg" is an unsigned long, so the remaining bits are left uninitialized. It probably doesn't cause a problem given that this is platform-specific code and doesn't have to worry about endianness etc. But it's sort of messy.
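A minimal sketch of the problem (illustrative only, using the names from the diff below):

	unsigned long cfg;	/* 8 bytes on a 64-bit kernel */

	/* read_method_int() writes a 32-bit int through this pointer,
	 * so only 4 of the 8 bytes are initialized */
	read_method_int(adevice->handle, "_CFG", (int *)&cfg);

	if (test_bit(0, &cfg))	/* reads the whole long, half of it garbage */
		ideapad_register_rfkill(adevice, 0);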
Signed-off-by: Dan Carpenter Signed-off-by: Matthew Garrett --- drivers/platform/x86/ideapad-laptop.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 4f20f8dd3d7c..17f6dfd8dbfb 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -694,10 +694,10 @@ MODULE_DEVICE_TABLE(acpi, ideapad_device_ids); static int __devinit ideapad_acpi_add(struct acpi_device *adevice) { int ret, i; - unsigned long cfg; + int cfg; struct ideapad_private *priv; - if (read_method_int(adevice->handle, "_CFG", (int *)&cfg)) + if (read_method_int(adevice->handle, "_CFG", &cfg)) return -ENODEV; priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -721,7 +721,7 @@ static int __devinit ideapad_acpi_add(struct acpi_device *adevice) goto input_failed; for (i = 0; i < IDEAPAD_RFKILL_DEV_NUM; i++) { - if (test_bit(ideapad_rfk_data[i].cfgbit, &cfg)) + if (test_bit(ideapad_rfk_data[i].cfgbit, &priv->cfg)) ideapad_register_rfkill(adevice, i); else priv->rfk[i] = NULL; From 88ca518b0bb4161e5f20f8a1d9cc477cae294e54 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 25 Jun 2012 15:07:17 +0200 Subject: [PATCH 171/612] intel_ips: blacklist HP ProBook laptops The intel_ips driver endlessly spews the warning message "ME failed to update for more than 1s, likely hung" every second on HP ProBook laptops with IronLake. As this has never worked, better to blacklist the driver for now. Cc: Signed-off-by: Takashi Iwai Signed-off-by: Matthew Garrett --- drivers/platform/x86/intel_ips.c | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c index 0ffdb3cde2bb..9af4257d4901 100644 --- a/drivers/platform/x86/intel_ips.c +++ b/drivers/platform/x86/intel_ips.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -1485,6 +1486,24 @@ static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = { MODULE_DEVICE_TABLE(pci, ips_id_table); +static int ips_blacklist_callback(const struct dmi_system_id *id) +{ + pr_info("Blacklisted intel_ips for %s\n", id->ident); + return 1; +} + +static const struct dmi_system_id ips_blacklist[] = { + { + .callback = ips_blacklist_callback, + .ident = "HP ProBook", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP ProBook"), + }, + }, + { } /* terminating entry */ +}; + static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) { u64 platform_info; @@ -1494,6 +1513,9 @@ static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) u16 htshi, trc, trc_required_mask; u8 tse; + if (dmi_check_system(ips_blacklist)) + return -ENODEV; + ips = kzalloc(sizeof(struct ips_driver), GFP_KERNEL); if (!ips) return -ENOMEM; From 9e303f228c24d529bf479196d290eedc2a04cd5e Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sat, 16 Jun 2012 22:01:25 +0300 Subject: [PATCH 172/612] gpio/omap: fix irq loss while in idle with debounce on It seems that currently the GPIO module is not working correctly during idle when debounce is enabled - the system almost never responds to button presses (observed on OMAP3530 ES2.1 and OMAP3630 ES1.2 pandora boards). Even though wakeups are probably working, it seems that the GPIO module itself is unable to detect input events and generate interrupts. OMAP35x TRM also states that: "If the debounce clock is inactive, the debounce cell gates all input signals and thus cannot be used."
So whenever we are disabling debounce clocks (for PM or other reasons), be sure the module's debounce feature is disabled too. Cc: Kevin Hilman Signed-off-by: Grazvydas Ignotas Signed-off-by: Kevin Hilman --- drivers/gpio/gpio-omap.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index c4ed1722734c..ff213e70fa5a 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -174,12 +174,22 @@ static inline void _gpio_dbck_enable(struct gpio_bank *bank) if (bank->dbck_enable_mask && !bank->dbck_enabled) { clk_enable(bank->dbck); bank->dbck_enabled = true; + + __raw_writel(bank->dbck_enable_mask, + bank->base + bank->regs->debounce_en); } } static inline void _gpio_dbck_disable(struct gpio_bank *bank) { if (bank->dbck_enable_mask && bank->dbck_enabled) { + /* + * Disable debounce before cutting it's clock. If debounce is + * enabled but the clock is not, GPIO module seems to be unable + * to detect events and generate interrupts at least on OMAP3. + */ + __raw_writel(0, bank->base + bank->regs->debounce_en); + clk_disable(bank->dbck); bank->dbck_enabled = false; } From da3789628f88684d3f0fb4e6a6bc086c395ac3cb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 27 Jun 2012 13:08:42 -0300 Subject: [PATCH 173/612] perf tools: Stop using a global trace events description list The pevent thing is per perf.data file, so I made it stop being static and become a perf_session member: tools processing perf.data files use perf_session, and _there_ we read the trace events description into session->pevent, then change everywhere to stop using that single global pevent variable and use the per-session one. Note that it _doesn't_ fall back to trace__event_id, as we're not interested at all in what is present in /sys/kernel/debug/tracing/events on the workstation doing the analysis, just in what is in the perf.data file. This patch also introduces perf_session__set_tracepoints_handlers, the perf.data/session way to associate handlers with tracepoint events by resolving their IDs using the event descriptions stored in a perf.data file. Make 'perf sched' use it.
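For illustration, a sketch of the new association API in use (tracepoint and handler names taken from the builtin-sched.c hunks below); IDs are resolved against the event descriptions stored in the perf.data file, never against the host's tracing directory:

	static const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_process_exit",  process_sched_exit_event, },
		{ "sched:sched_migrate_task",  process_sched_migrate_task_event, },
	};

	err = perf_session__set_tracepoints_handlers(session, handlers);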
Reported-by: Dmitry Antipov Tested-by: Dmitry Antipov Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: linaro-dev@lists.linaro.org Cc: patches@linaro.org Link: http://lkml.kernel.org/r/20120625232016.GA28525@infradead.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 37 ++++--- tools/perf/builtin-lock.c | 4 +- tools/perf/builtin-sched.c | 36 ++++--- tools/perf/builtin-script.c | 66 ++++++++----- tools/perf/util/evlist.c | 4 +- tools/perf/util/evlist.h | 3 + tools/perf/util/header.c | 31 +++--- .../util/scripting-engines/trace-event-perl.c | 28 +++--- .../scripting-engines/trace-event-python.c | 21 ++-- tools/perf/util/session.c | 56 +++++++++++ tools/perf/util/session.h | 10 ++ tools/perf/util/trace-event-parse.c | 58 +++++------ tools/perf/util/trace-event-read.c | 97 ++++++++++--------- tools/perf/util/trace-event-scripting.c | 7 +- tools/perf/util/trace-event.h | 38 ++++---- 15 files changed, 314 insertions(+), 182 deletions(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 547af48deb4f..ce35015f2dc6 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -57,6 +57,11 @@ static unsigned long nr_allocs, nr_cross_allocs; #define PATH_SYS_NODE "/sys/devices/system/node" +struct perf_kmem { + struct perf_tool tool; + struct perf_session *session; +}; + static void init_cpunode_map(void) { FILE *fp; @@ -278,14 +283,16 @@ static void process_free_event(void *data, s_alloc->alloc_cpu = -1; } -static void process_raw_event(union perf_event *raw_event __used, void *data, +static void process_raw_event(struct perf_tool *tool, + union perf_event *raw_event __used, void *data, int cpu, u64 timestamp, struct thread *thread) { + struct perf_kmem *kmem = container_of(tool, struct perf_kmem, tool); struct event_format *event; int type; - type = trace_parse_common_type(data); - event = trace_find_event(type); + type = trace_parse_common_type(kmem->session->pevent, data); + event = pevent_find_event(kmem->session->pevent, type); if (!strcmp(event->name, "kmalloc") || !strcmp(event->name, "kmem_cache_alloc")) { @@ -306,7 +313,7 @@ static void process_raw_event(union perf_event *raw_event __used, void *data, } } -static int process_sample_event(struct perf_tool *tool __used, +static int process_sample_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, @@ -322,16 +329,18 @@ static int process_sample_event(struct perf_tool *tool __used, dump_printf(" ... 
thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, sample->raw_data, sample->cpu, + process_raw_event(tool, event, sample->raw_data, sample->cpu, sample->time, thread); return 0; } -static struct perf_tool perf_kmem = { - .sample = process_sample_event, - .comm = perf_event__process_comm, - .ordered_samples = true, +static struct perf_kmem perf_kmem = { + .tool = { + .sample = process_sample_event, + .comm = perf_event__process_comm, + .ordered_samples = true, + }, }; static double fragmentation(unsigned long n_req, unsigned long n_alloc) @@ -486,11 +495,15 @@ static void sort_result(void) static int __cmd_kmem(void) { int err = -EINVAL; - struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &perf_kmem); + struct perf_session *session; + + session = perf_session__new(input_name, O_RDONLY, 0, false, + &perf_kmem.tool); if (session == NULL) return -ENOMEM; + perf_kmem.session = session; + if (perf_session__create_kernel_maps(session) < 0) goto out_delete; @@ -498,7 +511,7 @@ static int __cmd_kmem(void) goto out_delete; setup_pager(); - err = perf_session__process_events(session, &perf_kmem); + err = perf_session__process_events(session, &perf_kmem.tool); if (err != 0) goto out_delete; sort_result(); diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd53319de20d..b3c428548868 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -724,8 +724,8 @@ process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread) struct event_format *event; int type; - type = trace_parse_common_type(data); - event = trace_find_event(type); + type = trace_parse_common_type(session->pevent, data); + event = pevent_find_event(session->pevent, type); if (!strcmp(event->name, "lock_acquire")) process_lock_acquire_event(data, event, cpu, timestamp, thread); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 9fe77b185338..7a9ad2b1ee76 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -43,6 +43,11 @@ static u64 sleep_measurement_overhead; static unsigned long nr_tasks; +struct perf_sched { + struct perf_tool tool; + struct perf_session *session; +}; + struct sched_atom; struct task_desc { @@ -1597,6 +1602,8 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool, struct perf_evsel *evsel, struct machine *machine) { + struct perf_sched *sched = container_of(tool, struct perf_sched, tool); + struct pevent *pevent = sched->session->pevent; struct thread *thread = machine__findnew_thread(machine, sample->pid); if (thread == NULL) { @@ -1612,7 +1619,8 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool, tracepoint_handler f = evsel->handler.func; if (evsel->handler.data == NULL) - evsel->handler.data = trace_find_event(evsel->attr.config); + evsel->handler.data = pevent_find_event(pevent, + evsel->attr.config); f(tool, evsel->handler.data, sample, machine, thread); } @@ -1620,12 +1628,14 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool, return 0; } -static struct perf_tool perf_sched = { - .sample = perf_sched__process_tracepoint_sample, - .comm = perf_event__process_comm, - .lost = perf_event__process_lost, - .fork = perf_event__process_task, - .ordered_samples = true, +static struct perf_sched perf_sched = { + .tool = { + .sample = perf_sched__process_tracepoint_sample, + .comm = perf_event__process_comm, + .lost = perf_event__process_lost, + .fork = perf_event__process_task, + .ordered_samples = true, 
+ }, }; static void read_events(bool destroy, struct perf_session **psession) @@ -1640,16 +1650,20 @@ static void read_events(bool destroy, struct perf_session **psession) { "sched:sched_process_exit", process_sched_exit_event, }, { "sched:sched_migrate_task", process_sched_migrate_task_event, }, }; - struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &perf_sched); + struct perf_session *session; + + session = perf_session__new(input_name, O_RDONLY, 0, false, + &perf_sched.tool); if (session == NULL) die("No Memory"); - err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers); + perf_sched.session = session; + + err = perf_session__set_tracepoints_handlers(session, handlers); assert(err == 0); if (perf_session__has_traces(session, "record -R")) { - err = perf_session__process_events(session, &perf_sched); + err = perf_session__process_events(session, &perf_sched.tool); if (err) die("Failed to process events, error %d", err); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8fecd3b8130a..1e60ab70b2b1 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -28,6 +28,11 @@ static bool system_wide; static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +struct perf_script { + struct perf_tool tool; + struct perf_session *session; +}; + enum perf_output_field { PERF_OUTPUT_COMM = 1U << 0, PERF_OUTPUT_TID = 1U << 1, @@ -257,7 +262,8 @@ static int perf_session__check_output_opt(struct perf_session *session) return 0; } -static void print_sample_start(struct perf_sample *sample, +static void print_sample_start(struct pevent *pevent, + struct perf_sample *sample, struct thread *thread, struct perf_evsel *evsel) { @@ -302,8 +308,14 @@ static void print_sample_start(struct perf_sample *sample, if (PRINT_FIELD(EVNAME)) { if (attr->type == PERF_TYPE_TRACEPOINT) { - type = trace_parse_common_type(sample->raw_data); - event = trace_find_event(type); + /* + * XXX Do we really need this here? 
+ * perf_evlist__set_tracepoint_names should have done + * this already + */ + type = trace_parse_common_type(pevent, + sample->raw_data); + event = pevent_find_event(pevent, type); if (event) evname = event->name; } else @@ -404,6 +416,7 @@ static void print_sample_bts(union perf_event *event, } static void process_event(union perf_event *event __unused, + struct pevent *pevent, struct perf_sample *sample, struct perf_evsel *evsel, struct machine *machine, @@ -414,7 +427,7 @@ static void process_event(union perf_event *event __unused, if (output[attr->type].fields == 0) return; - print_sample_start(sample, thread, evsel); + print_sample_start(pevent, sample, thread, evsel); if (is_bts_event(attr)) { print_sample_bts(event, sample, evsel, machine, thread); @@ -422,7 +435,7 @@ static void process_event(union perf_event *event __unused, } if (PRINT_FIELD(TRACE)) - print_trace_event(sample->cpu, sample->raw_data, + print_trace_event(pevent, sample->cpu, sample->raw_data, sample->raw_size); if (PRINT_FIELD(ADDR)) @@ -453,7 +466,8 @@ static int default_stop_script(void) return 0; } -static int default_generate_script(const char *outfile __unused) +static int default_generate_script(struct pevent *pevent __unused, + const char *outfile __unused) { return 0; } @@ -491,6 +505,7 @@ static int process_sample_event(struct perf_tool *tool __used, struct machine *machine) { struct addr_location al; + struct perf_script *scr = container_of(tool, struct perf_script, tool); struct thread *thread = machine__findnew_thread(machine, event->ip.tid); if (thread == NULL) { @@ -522,24 +537,27 @@ static int process_sample_event(struct perf_tool *tool __used, if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) return 0; - scripting_ops->process_event(event, sample, evsel, machine, thread); + scripting_ops->process_event(event, scr->session->pevent, + sample, evsel, machine, thread); evsel->hists.stats.total_period += sample->period; return 0; } -static struct perf_tool perf_script = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .comm = perf_event__process_comm, - .exit = perf_event__process_task, - .fork = perf_event__process_task, - .attr = perf_event__process_attr, - .event_type = perf_event__process_event_type, - .tracing_data = perf_event__process_tracing_data, - .build_id = perf_event__process_build_id, - .ordered_samples = true, - .ordering_requires_timestamps = true, +static struct perf_script perf_script = { + .tool = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .exit = perf_event__process_task, + .fork = perf_event__process_task, + .attr = perf_event__process_attr, + .event_type = perf_event__process_event_type, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, + .ordered_samples = true, + .ordering_requires_timestamps = true, + }, }; extern volatile int session_done; @@ -555,7 +573,7 @@ static int __cmd_script(struct perf_session *session) signal(SIGINT, sig_handler); - ret = perf_session__process_events(session, &perf_script); + ret = perf_session__process_events(session, &perf_script.tool); if (debug_mode) pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); @@ -1337,10 +1355,13 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) if (!script_name) setup_pager(); - session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_script); + session = perf_session__new(input_name, O_RDONLY, 0, false, + &perf_script.tool); if (session == 
NULL) return -ENOMEM; + perf_script.session = session; + if (cpu_list) { if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap)) return -1; @@ -1386,7 +1407,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) return -1; } - err = scripting_ops->generate_script("perf-script"); + err = scripting_ops->generate_script(session->pevent, + "perf-script"); goto out; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 7400fb3fc50c..f74e9560350e 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -224,8 +224,8 @@ out_free_attrs: return err; } -static struct perf_evsel * - perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id) +struct perf_evsel * +perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id) { struct perf_evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 989bee9624c2..40d4d3cdced0 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -73,6 +73,9 @@ int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, #define perf_evlist__set_tracepoints_handlers_array(evlist, array) \ perf_evlist__set_tracepoints_handlers(evlist, array, ARRAY_SIZE(array)) +struct perf_evsel * +perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id); + void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index a5e2015319ee..5a47aba46759 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1474,15 +1474,15 @@ out: static int process_tracing_data(struct perf_file_section *section __unused, struct perf_header *ph __unused, - int feat __unused, int fd) + int feat __unused, int fd, void *data) { - trace_report(fd, false); + trace_report(fd, data, false); return 0; } static int process_build_id(struct perf_file_section *section, struct perf_header *ph, - int feat __unused, int fd) + int feat __unused, int fd, void *data __used) { if (perf_header__read_build_ids(ph, fd, section->offset, section->size)) pr_debug("Failed to read buildids, continuing...\n"); @@ -1493,7 +1493,7 @@ struct feature_ops { int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist); void (*print)(struct perf_header *h, int fd, FILE *fp); int (*process)(struct perf_file_section *section, - struct perf_header *h, int feat, int fd); + struct perf_header *h, int feat, int fd, void *data); const char *name; bool full_only; }; @@ -1988,7 +1988,7 @@ int perf_file_header__read(struct perf_file_header *header, static int perf_file_section__process(struct perf_file_section *section, struct perf_header *ph, - int feat, int fd, void *data __used) + int feat, int fd, void *data) { if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) { pr_debug("Failed to lseek to %" PRIu64 " offset for feature " @@ -2004,7 +2004,7 @@ static int perf_file_section__process(struct perf_file_section *section, if (!feat_ops[feat].process) return 0; - return feat_ops[feat].process(section, ph, feat, fd); + return feat_ops[feat].process(section, ph, feat, fd, data); } static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, @@ -2093,9 +2093,11 @@ static int read_attr(int fd, struct perf_header *ph, return ret <= 0 ? 
-1 : 0; } -static int perf_evsel__set_tracepoint_name(struct perf_evsel *evsel) +static int perf_evsel__set_tracepoint_name(struct perf_evsel *evsel, + struct pevent *pevent) { - struct event_format *event = trace_find_event(evsel->attr.config); + struct event_format *event = pevent_find_event(pevent, + evsel->attr.config); char bf[128]; if (event == NULL) @@ -2109,13 +2111,14 @@ static int perf_evsel__set_tracepoint_name(struct perf_evsel *evsel) return 0; } -static int perf_evlist__set_tracepoint_names(struct perf_evlist *evlist) +static int perf_evlist__set_tracepoint_names(struct perf_evlist *evlist, + struct pevent *pevent) { struct perf_evsel *pos; list_for_each_entry(pos, &evlist->entries, node) { if (pos->attr.type == PERF_TYPE_TRACEPOINT && - perf_evsel__set_tracepoint_name(pos)) + perf_evsel__set_tracepoint_name(pos, pevent)) return -1; } @@ -2198,12 +2201,12 @@ int perf_session__read_header(struct perf_session *session, int fd) event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type); } - perf_header__process_sections(header, fd, NULL, + perf_header__process_sections(header, fd, &session->pevent, perf_file_section__process); lseek(fd, header->data_offset, SEEK_SET); - if (perf_evlist__set_tracepoint_names(session->evlist)) + if (perf_evlist__set_tracepoint_names(session->evlist, session->pevent)) goto out_delete_evlist; header->frozen = 1; @@ -2419,8 +2422,8 @@ int perf_event__process_tracing_data(union perf_event *event, lseek(session->fd, offset + sizeof(struct tracing_data_event), SEEK_SET); - size_read = trace_report(session->fd, session->repipe); - + size_read = trace_report(session->fd, &session->pevent, + session->repipe); padding = ALIGN(size_read, sizeof(u64)) - size_read; if (read(session->fd, buf, padding) < 0) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 4c1b3d72a1d2..b3620fe12763 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -233,7 +233,8 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } -static inline struct event_format *find_cache_event(int type) +static inline +struct event_format *find_cache_event(struct pevent *pevent, int type) { static char ev_name[256]; struct event_format *event; @@ -241,7 +242,7 @@ static inline struct event_format *find_cache_event(int type) if (events[type]) return events[type]; - events[type] = event = trace_find_event(type); + events[type] = event = pevent_find_event(pevent, type); if (!event) return NULL; @@ -252,7 +253,8 @@ static inline struct event_format *find_cache_event(int type) return event; } -static void perl_process_tracepoint(union perf_event *pevent __unused, +static void perl_process_tracepoint(union perf_event *perf_event __unused, + struct pevent *pevent, struct perf_sample *sample, struct perf_evsel *evsel, struct machine *machine __unused, @@ -275,13 +277,13 @@ static void perl_process_tracepoint(union perf_event *pevent __unused, if (evsel->attr.type != PERF_TYPE_TRACEPOINT) return; - type = trace_parse_common_type(data); + type = trace_parse_common_type(pevent, data); - event = find_cache_event(type); + event = find_cache_event(pevent, type); if (!event) die("ug! 
no event found for type %d", type); - pid = trace_parse_common_pid(data); + pid = trace_parse_common_pid(pevent, data); sprintf(handler, "%s::%s", event->system, event->name); @@ -314,7 +316,8 @@ static void perl_process_tracepoint(union perf_event *pevent __unused, offset = field->offset; XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0))); } else { /* FIELD_IS_NUMERIC */ - val = read_size(data + field->offset, field->size); + val = read_size(pevent, data + field->offset, + field->size); if (field->flags & FIELD_IS_SIGNED) { XPUSHs(sv_2mortal(newSViv(val))); } else { @@ -368,14 +371,15 @@ static void perl_process_event_generic(union perf_event *pevent __unused, LEAVE; } -static void perl_process_event(union perf_event *pevent, +static void perl_process_event(union perf_event *event, + struct pevent *pevent, struct perf_sample *sample, struct perf_evsel *evsel, struct machine *machine, struct thread *thread) { - perl_process_tracepoint(pevent, sample, evsel, machine, thread); - perl_process_event_generic(pevent, sample, evsel, machine, thread); + perl_process_tracepoint(event, pevent, sample, evsel, machine, thread); + perl_process_event_generic(event, sample, evsel, machine, thread); } static void run_start_sub(void) @@ -448,7 +452,7 @@ static int perl_stop_script(void) return 0; } -static int perl_generate_script(const char *outfile) +static int perl_generate_script(struct pevent *pevent, const char *outfile) { struct event_format *event = NULL; struct format_field *f; @@ -495,7 +499,7 @@ static int perl_generate_script(const char *outfile) fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n"); fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n"); - while ((event = trace_find_next_event(event))) { + while ((event = trace_find_next_event(pevent, event))) { fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name); fprintf(ofp, "\tmy ("); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index acb9795286c4..a8ca2f8179a9 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -190,7 +190,8 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } -static inline struct event_format *find_cache_event(int type) +static inline +struct event_format *find_cache_event(struct pevent *pevent, int type) { static char ev_name[256]; struct event_format *event; @@ -198,7 +199,7 @@ static inline struct event_format *find_cache_event(int type) if (events[type]) return events[type]; - events[type] = event = trace_find_event(type); + events[type] = event = pevent_find_event(pevent, type); if (!event) return NULL; @@ -209,7 +210,8 @@ static inline struct event_format *find_cache_event(int type) return event; } -static void python_process_event(union perf_event *pevent __unused, +static void python_process_event(union perf_event *perf_event __unused, + struct pevent *pevent, struct perf_sample *sample, struct perf_evsel *evsel __unused, struct machine *machine __unused, @@ -233,13 +235,13 @@ static void python_process_event(union perf_event *pevent __unused, if (!t) Py_FatalError("couldn't create Python tuple"); - type = trace_parse_common_type(data); + type = trace_parse_common_type(pevent, data); - event = find_cache_event(type); + event = find_cache_event(pevent, type); if (!event) die("ug! 
no event found for type %d", type); - pid = trace_parse_common_pid(data); + pid = trace_parse_common_pid(pevent, data); sprintf(handler_name, "%s__%s", event->system, event->name); @@ -284,7 +286,8 @@ static void python_process_event(union perf_event *pevent __unused, offset = field->offset; obj = PyString_FromString((char *)data + offset); } else { /* FIELD_IS_NUMERIC */ - val = read_size(data + field->offset, field->size); + val = read_size(pevent, data + field->offset, + field->size); if (field->flags & FIELD_IS_SIGNED) { if ((long long)val >= LONG_MIN && (long long)val <= LONG_MAX) @@ -438,7 +441,7 @@ out: return err; } -static int python_generate_script(const char *outfile) +static int python_generate_script(struct pevent *pevent, const char *outfile) { struct event_format *event = NULL; struct format_field *f; @@ -487,7 +490,7 @@ static int python_generate_script(const char *outfile) fprintf(ofp, "def trace_end():\n"); fprintf(ofp, "\tprint \"in trace_end\"\n\n"); - while ((event = trace_find_next_event(event))) { + while ((event = trace_find_next_event(pevent, event))) { fprintf(ofp, "def %s__%s(", event->system, event->name); fprintf(ofp, "event_name, "); fprintf(ofp, "context, "); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 6b305fbcc986..f5baff1495e6 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -14,6 +14,7 @@ #include "sort.h" #include "util.h" #include "cpumap.h" +#include "event-parse.h" static int perf_session__open(struct perf_session *self, bool force) { @@ -1610,3 +1611,58 @@ void perf_session__fprintf_info(struct perf_session *session, FILE *fp, perf_header__fprintf_info(session, fp, full); fprintf(fp, "# ========\n#\n"); } + + +int __perf_session__set_tracepoints_handlers(struct perf_session *session, + const struct perf_evsel_str_handler *assocs, + size_t nr_assocs) +{ + struct perf_evlist *evlist = session->evlist; + struct event_format *format; + struct perf_evsel *evsel; + char *tracepoint, *name; + size_t i; + int err; + + for (i = 0; i < nr_assocs; i++) { + err = -ENOMEM; + tracepoint = strdup(assocs[i].name); + if (tracepoint == NULL) + goto out; + + err = -ENOENT; + name = strchr(tracepoint, ':'); + if (name == NULL) + goto out_free; + + *name++ = '\0'; + format = pevent_find_event_by_name(session->pevent, + tracepoint, name); + if (format == NULL) { + /* + * Adding a handler for an event not in the session, + * just ignore it. + */ + goto next; + } + + evsel = perf_evlist__find_tracepoint_by_id(evlist, format->id); + if (evsel == NULL) + goto next; + + err = -EEXIST; + if (evsel->handler.func != NULL) + goto out_free; + evsel->handler.func = assocs[i].handler; +next: + free(tracepoint); + } + + err = 0; +out: + return err; + +out_free: + free(tracepoint); + goto out; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index c71a1a7b05ed..7c435bde6eb0 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -33,6 +33,7 @@ struct perf_session { struct machine host_machine; struct rb_root machines; struct perf_evlist *evlist; + struct pevent *pevent; /* * FIXME: Need to split this up further, we need global * stats + per event stats. 
'perf diff' also needs @@ -158,4 +159,13 @@ int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); void perf_session__fprintf_info(struct perf_session *s, FILE *fp, bool full); + +struct perf_evsel_str_handler; + +int __perf_session__set_tracepoints_handlers(struct perf_session *session, + const struct perf_evsel_str_handler *assocs, + size_t nr_assocs); + +#define perf_session__set_tracepoints_handlers(session, array) \ + __perf_session__set_tracepoints_handlers(session, array, ARRAY_SIZE(array)) #endif /* __PERF_SESSION_H */ diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index df2fddbf0cd2..a51bd86f4d09 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -32,29 +32,25 @@ int header_page_size_size; int header_page_ts_size; int header_page_data_offset; -struct pevent *perf_pevent; -static struct pevent *pevent; - bool latency_format; -int read_trace_init(int file_bigendian, int host_bigendian) +struct pevent *read_trace_init(int file_bigendian, int host_bigendian) { - if (pevent) - return 0; + struct pevent *pevent = pevent_alloc(); - perf_pevent = pevent_alloc(); - pevent = perf_pevent; + if (pevent != NULL) { + pevent_set_flag(pevent, PEVENT_NSEC_OUTPUT); + pevent_set_file_bigendian(pevent, file_bigendian); + pevent_set_host_bigendian(pevent, host_bigendian); + } - pevent_set_flag(pevent, PEVENT_NSEC_OUTPUT); - pevent_set_file_bigendian(pevent, file_bigendian); - pevent_set_host_bigendian(pevent, host_bigendian); - - return 0; + return pevent; } static int get_common_field(struct scripting_context *context, int *offset, int *size, const char *type) { + struct pevent *pevent = context->pevent; struct event_format *event; struct format_field *field; @@ -150,7 +146,7 @@ void *raw_field_ptr(struct event_format *event, const char *name, void *data) return data + field->offset; } -int trace_parse_common_type(void *data) +int trace_parse_common_type(struct pevent *pevent, void *data) { struct pevent_record record; @@ -158,7 +154,7 @@ int trace_parse_common_type(void *data) return pevent_data_type(pevent, &record); } -int trace_parse_common_pid(void *data) +int trace_parse_common_pid(struct pevent *pevent, void *data) { struct pevent_record record; @@ -166,27 +162,21 @@ int trace_parse_common_pid(void *data) return pevent_data_pid(pevent, &record); } -unsigned long long read_size(void *ptr, int size) +unsigned long long read_size(struct pevent *pevent, void *ptr, int size) { return pevent_read_number(pevent, ptr, size); } -struct event_format *trace_find_event(int type) -{ - return pevent_find_event(pevent, type); -} - - -void print_trace_event(int cpu, void *data, int size) +void print_trace_event(struct pevent *pevent, int cpu, void *data, int size) { struct event_format *event; struct pevent_record record; struct trace_seq s; int type; - type = trace_parse_common_type(data); + type = trace_parse_common_type(pevent, data); - event = trace_find_event(type); + event = pevent_find_event(pevent, type); if (!event) { warning("ug! 
no event found for type %d", type); return; @@ -203,8 +193,8 @@ void print_trace_event(int cpu, void *data, int size) printf("\n"); } -void print_event(int cpu, void *data, int size, unsigned long long nsecs, - char *comm) +void print_event(struct pevent *pevent, int cpu, void *data, int size, + unsigned long long nsecs, char *comm) { struct pevent_record record; struct trace_seq s; @@ -227,7 +217,8 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs, printf("\n"); } -void parse_proc_kallsyms(char *file, unsigned int size __unused) +void parse_proc_kallsyms(struct pevent *pevent, + char *file, unsigned int size __unused) { unsigned long long addr; char *func; @@ -258,7 +249,8 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused) } } -void parse_ftrace_printk(char *file, unsigned int size __unused) +void parse_ftrace_printk(struct pevent *pevent, + char *file, unsigned int size __unused) { unsigned long long addr; char *printk; @@ -282,17 +274,19 @@ void parse_ftrace_printk(char *file, unsigned int size __unused) } } -int parse_ftrace_file(char *buf, unsigned long size) +int parse_ftrace_file(struct pevent *pevent, char *buf, unsigned long size) { return pevent_parse_event(pevent, buf, size, "ftrace"); } -int parse_event_file(char *buf, unsigned long size, char *sys) +int parse_event_file(struct pevent *pevent, + char *buf, unsigned long size, char *sys) { return pevent_parse_event(pevent, buf, size, sys); } -struct event_format *trace_find_next_event(struct event_format *event) +struct event_format *trace_find_next_event(struct pevent *pevent, + struct event_format *event) { static int idx; diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index f097e0dd6c5c..719ed74a8565 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -114,20 +114,20 @@ static void skip(int size) }; } -static unsigned int read4(void) +static unsigned int read4(struct pevent *pevent) { unsigned int data; read_or_die(&data, 4); - return __data2host4(perf_pevent, data); + return __data2host4(pevent, data); } -static unsigned long long read8(void) +static unsigned long long read8(struct pevent *pevent) { unsigned long long data; read_or_die(&data, 8); - return __data2host8(perf_pevent, data); + return __data2host8(pevent, data); } static char *read_string(void) @@ -168,12 +168,12 @@ static char *read_string(void) return str; } -static void read_proc_kallsyms(void) +static void read_proc_kallsyms(struct pevent *pevent) { unsigned int size; char *buf; - size = read4(); + size = read4(pevent); if (!size) return; @@ -181,29 +181,29 @@ static void read_proc_kallsyms(void) read_or_die(buf, size); buf[size] = '\0'; - parse_proc_kallsyms(buf, size); + parse_proc_kallsyms(pevent, buf, size); free(buf); } -static void read_ftrace_printk(void) +static void read_ftrace_printk(struct pevent *pevent) { unsigned int size; char *buf; - size = read4(); + size = read4(pevent); if (!size) return; buf = malloc_or_die(size); read_or_die(buf, size); - parse_ftrace_printk(buf, size); + parse_ftrace_printk(pevent, buf, size); free(buf); } -static void read_header_files(void) +static void read_header_files(struct pevent *pevent) { unsigned long long size; char *header_event; @@ -214,7 +214,7 @@ static void read_header_files(void) if (memcmp(buf, "header_page", 12) != 0) die("did not read header page"); - size = read8(); + size = read8(pevent); skip(size); /* @@ -227,47 +227,48 @@ static void read_header_files(void) if (memcmp(buf, 
"header_event", 13) != 0) die("did not read header event"); - size = read8(); + size = read8(pevent); header_event = malloc_or_die(size); read_or_die(header_event, size); free(header_event); } -static void read_ftrace_file(unsigned long long size) +static void read_ftrace_file(struct pevent *pevent, unsigned long long size) { char *buf; buf = malloc_or_die(size); read_or_die(buf, size); - parse_ftrace_file(buf, size); + parse_ftrace_file(pevent, buf, size); free(buf); } -static void read_event_file(char *sys, unsigned long long size) +static void read_event_file(struct pevent *pevent, char *sys, + unsigned long long size) { char *buf; buf = malloc_or_die(size); read_or_die(buf, size); - parse_event_file(buf, size, sys); + parse_event_file(pevent, buf, size, sys); free(buf); } -static void read_ftrace_files(void) +static void read_ftrace_files(struct pevent *pevent) { unsigned long long size; int count; int i; - count = read4(); + count = read4(pevent); for (i = 0; i < count; i++) { - size = read8(); - read_ftrace_file(size); + size = read8(pevent); + read_ftrace_file(pevent, size); } } -static void read_event_files(void) +static void read_event_files(struct pevent *pevent) { unsigned long long size; char *sys; @@ -275,15 +276,15 @@ static void read_event_files(void) int count; int i,x; - systems = read4(); + systems = read4(pevent); for (i = 0; i < systems; i++) { sys = read_string(); - count = read4(); + count = read4(pevent); for (x=0; x < count; x++) { - size = read8(); - read_event_file(sys, size); + size = read8(pevent); + read_event_file(pevent, sys, size); } } } @@ -377,7 +378,7 @@ static int calc_index(void *ptr, int cpu) return (unsigned long)ptr - (unsigned long)cpu_data[cpu].page; } -struct pevent_record *trace_peek_data(int cpu) +struct pevent_record *trace_peek_data(struct pevent *pevent, int cpu) { struct pevent_record *data; void *page = cpu_data[cpu].page; @@ -399,15 +400,15 @@ struct pevent_record *trace_peek_data(int cpu) /* FIXME: handle header page */ if (header_page_ts_size != 8) die("expected a long long type for timestamp"); - cpu_data[cpu].timestamp = data2host8(perf_pevent, ptr); + cpu_data[cpu].timestamp = data2host8(pevent, ptr); ptr += 8; switch (header_page_size_size) { case 4: - cpu_data[cpu].page_size = data2host4(perf_pevent, ptr); + cpu_data[cpu].page_size = data2host4(pevent, ptr); ptr += 4; break; case 8: - cpu_data[cpu].page_size = data2host8(perf_pevent, ptr); + cpu_data[cpu].page_size = data2host8(pevent, ptr); ptr += 8; break; default: @@ -421,10 +422,10 @@ read_again: if (idx >= cpu_data[cpu].page_size) { get_next_page(cpu); - return trace_peek_data(cpu); + return trace_peek_data(pevent, cpu); } - type_len_ts = data2host4(perf_pevent, ptr); + type_len_ts = data2host4(pevent, ptr); ptr += 4; type_len = type_len4host(type_len_ts); @@ -434,14 +435,14 @@ read_again: case RINGBUF_TYPE_PADDING: if (!delta) die("error, hit unexpected end of page"); - length = data2host4(perf_pevent, ptr); + length = data2host4(pevent, ptr); ptr += 4; length *= 4; ptr += length; goto read_again; case RINGBUF_TYPE_TIME_EXTEND: - extend = data2host4(perf_pevent, ptr); + extend = data2host4(pevent, ptr); ptr += 4; extend <<= TS_SHIFT; extend += delta; @@ -452,7 +453,7 @@ read_again: ptr += 12; break; case 0: - length = data2host4(perf_pevent, ptr); + length = data2host4(pevent, ptr); ptr += 4; die("here! 
length=%d", length); break; @@ -477,17 +478,17 @@ read_again: return data; } -struct pevent_record *trace_read_data(int cpu) +struct pevent_record *trace_read_data(struct pevent *pevent, int cpu) { struct pevent_record *data; - data = trace_peek_data(cpu); + data = trace_peek_data(pevent, cpu); cpu_data[cpu].next = NULL; return data; } -ssize_t trace_report(int fd, bool __repipe) +ssize_t trace_report(int fd, struct pevent **ppevent, bool __repipe) { char buf[BUFSIZ]; char test[] = { 23, 8, 68 }; @@ -519,30 +520,32 @@ ssize_t trace_report(int fd, bool __repipe) file_bigendian = buf[0]; host_bigendian = bigendian(); - read_trace_init(file_bigendian, host_bigendian); + *ppevent = read_trace_init(file_bigendian, host_bigendian); + if (*ppevent == NULL) + die("read_trace_init failed"); read_or_die(buf, 1); long_size = buf[0]; - page_size = read4(); + page_size = read4(*ppevent); - read_header_files(); + read_header_files(*ppevent); - read_ftrace_files(); - read_event_files(); - read_proc_kallsyms(); - read_ftrace_printk(); + read_ftrace_files(*ppevent); + read_event_files(*ppevent); + read_proc_kallsyms(*ppevent); + read_ftrace_printk(*ppevent); size = calc_data_size - 1; calc_data_size = 0; repipe = false; if (show_funcs) { - pevent_print_funcs(perf_pevent); + pevent_print_funcs(*ppevent); return size; } if (show_printk) { - pevent_print_printk(perf_pevent); + pevent_print_printk(*ppevent); return size; } diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c index 18ae6c1831d3..474aa7a7df43 100644 --- a/tools/perf/util/trace-event-scripting.c +++ b/tools/perf/util/trace-event-scripting.c @@ -36,6 +36,7 @@ static int stop_script_unsupported(void) } static void process_event_unsupported(union perf_event *event __unused, + struct pevent *pevent __unused, struct perf_sample *sample __unused, struct perf_evsel *evsel __unused, struct machine *machine __unused, @@ -61,7 +62,8 @@ static int python_start_script_unsupported(const char *script __unused, return -1; } -static int python_generate_script_unsupported(const char *outfile __unused) +static int python_generate_script_unsupported(struct pevent *pevent __unused, + const char *outfile __unused) { print_python_unsupported_msg(); @@ -122,7 +124,8 @@ static int perl_start_script_unsupported(const char *script __unused, return -1; } -static int perl_generate_script_unsupported(const char *outfile __unused) +static int perl_generate_script_unsupported(struct pevent *pevent __unused, + const char *outfile __unused) { print_perl_unsupported_msg(); diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index 639852ac1117..8fef1d6687b7 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -8,6 +8,7 @@ struct machine; struct perf_sample; union perf_event; +struct perf_tool; struct thread; extern int header_page_size_size; @@ -29,35 +30,36 @@ enum { int bigendian(void); -int read_trace_init(int file_bigendian, int host_bigendian); -void print_trace_event(int cpu, void *data, int size); +struct pevent *read_trace_init(int file_bigendian, int host_bigendian); +void print_trace_event(struct pevent *pevent, int cpu, void *data, int size); -void print_event(int cpu, void *data, int size, unsigned long long nsecs, - char *comm); +void print_event(struct pevent *pevent, int cpu, void *data, int size, + unsigned long long nsecs, char *comm); -int parse_ftrace_file(char *buf, unsigned long size); -int parse_event_file(char *buf, unsigned long size, char *sys); +int 
parse_ftrace_file(struct pevent *pevent, char *buf, unsigned long size); +int parse_event_file(struct pevent *pevent, + char *buf, unsigned long size, char *sys); -struct pevent_record *trace_peek_data(int cpu); -struct event_format *trace_find_event(int type); +struct pevent_record *trace_peek_data(struct pevent *pevent, int cpu); unsigned long long raw_field_value(struct event_format *event, const char *name, void *data); void *raw_field_ptr(struct event_format *event, const char *name, void *data); -void parse_proc_kallsyms(char *file, unsigned int size __unused); -void parse_ftrace_printk(char *file, unsigned int size __unused); +void parse_proc_kallsyms(struct pevent *pevent, char *file, unsigned int size); +void parse_ftrace_printk(struct pevent *pevent, char *file, unsigned int size); -ssize_t trace_report(int fd, bool repipe); +ssize_t trace_report(int fd, struct pevent **pevent, bool repipe); -int trace_parse_common_type(void *data); -int trace_parse_common_pid(void *data); +int trace_parse_common_type(struct pevent *pevent, void *data); +int trace_parse_common_pid(struct pevent *pevent, void *data); -struct event_format *trace_find_next_event(struct event_format *event); -unsigned long long read_size(void *ptr, int size); +struct event_format *trace_find_next_event(struct pevent *pevent, + struct event_format *event); +unsigned long long read_size(struct pevent *pevent, void *ptr, int size); unsigned long long eval_flag(const char *flag); -struct pevent_record *trace_read_data(int cpu); +struct pevent_record *trace_read_data(struct pevent *pevent, int cpu); int read_tracing_data(int fd, struct list_head *pattrs); struct tracing_data { @@ -77,11 +79,12 @@ struct scripting_ops { int (*start_script) (const char *script, int argc, const char **argv); int (*stop_script) (void); void (*process_event) (union perf_event *event, + struct pevent *pevent, struct perf_sample *sample, struct perf_evsel *evsel, struct machine *machine, struct thread *thread); - int (*generate_script) (const char *outfile); + int (*generate_script) (struct pevent *pevent, const char *outfile); }; int script_spec_register(const char *spec, struct scripting_ops *ops); @@ -90,6 +93,7 @@ void setup_perl_scripting(void); void setup_python_scripting(void); struct scripting_context { + struct pevent *pevent; void *event_data; }; From 209bd9e3e14712d74c8bebb028afda905d689f1c Mon Sep 17 00:00:00 2001 From: "Pierre-Loup A. Griffais" Date: Fri, 22 Jun 2012 11:38:13 -0700 Subject: [PATCH 174/612] perf symbols: Follow .gnu_debuglink section to find separate symbols The .gnu_debuglink section is specified to contain the filename of the debug info file, as well as a CRC that can be used to validate it. This doesn't currently use the checksum and relies on the usual build-id matching for validation. This provides more context: http://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html Signed-off-by: Pierre-Loup A. 
Griffais Reported-by: Mike Sartain Tested-by: Mike Sartain Cc: Ingo Molnar Cc: Linus Torvalds Cc: Mike Sartain Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE4BB95.3080309@nvidia.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 65 +++++++++++++++++++++++++++++++++++++++- tools/perf/util/symbol.h | 1 + 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 3e2e5ea0f03f..994f4ffdcd05 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1590,11 +1590,62 @@ out: return err; } +static int filename__read_debuglink(const char *filename, + char *debuglink, size_t size) +{ + int fd, err = -1; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr; + Elf_Data *data; + Elf_Scn *sec; + Elf_Kind ek; + + fd = open(filename, O_RDONLY); + if (fd < 0) + goto out; + + elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); + if (elf == NULL) { + pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename); + goto out_close; + } + + ek = elf_kind(elf); + if (ek != ELF_K_ELF) + goto out_close; + + if (gelf_getehdr(elf, &ehdr) == NULL) { + pr_err("%s: cannot get elf header.\n", __func__); + goto out_close; + } + + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".gnu_debuglink", NULL); + if (sec == NULL) + goto out_close; + + data = elf_getdata(sec, NULL); + if (data == NULL) + goto out_close; + + /* the start of this section is a zero-terminated string */ + strncpy(debuglink, data->d_buf, size); + + elf_end(elf); + +out_close: + close(fd); +out: + return err; +} + char dso__symtab_origin(const struct dso *dso) { static const char origin[] = { [SYMTAB__KALLSYMS] = 'k', [SYMTAB__JAVA_JIT] = 'j', + [SYMTAB__DEBUGLINK] = 'l', [SYMTAB__BUILD_ID_CACHE] = 'B', [SYMTAB__FEDORA_DEBUGINFO] = 'f', [SYMTAB__UBUNTU_DEBUGINFO] = 'u', @@ -1662,10 +1713,22 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) */ want_symtab = 1; restart: - for (dso->symtab_type = SYMTAB__BUILD_ID_CACHE; + for (dso->symtab_type = SYMTAB__DEBUGLINK; dso->symtab_type != SYMTAB__NOT_FOUND; dso->symtab_type++) { switch (dso->symtab_type) { + case SYMTAB__DEBUGLINK: { + char *debuglink; + strncpy(name, dso->long_name, size); + debuglink = name + dso->long_name_len; + while (debuglink != name && *debuglink != '/') + debuglink--; + if (*debuglink == '/') + debuglink++; + filename__read_debuglink(dso->long_name, debuglink, + size - (debuglink - name)); + } + break; case SYMTAB__BUILD_ID_CACHE: /* skip the locally configured cache if a symfs is given */ if (symbol_conf.symfs[0] || diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index af0752b1aca1..a884b99017f0 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -257,6 +257,7 @@ enum symtab_type { SYMTAB__KALLSYMS = 0, SYMTAB__GUEST_KALLSYMS, SYMTAB__JAVA_JIT, + SYMTAB__DEBUGLINK, SYMTAB__BUILD_ID_CACHE, SYMTAB__FEDORA_DEBUGINFO, SYMTAB__UBUNTU_DEBUGINFO, From 08942f6d5d992e9486b07653fd87ea8182a22fa0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 20 Jun 2012 15:08:06 +0900 Subject: [PATCH 175/612] perf bench: Documentation update The current perf-bench documentation has a couple of typos and even lacks the entire description of the mem subsystem. Fix it.
Reported-by: Ingo Molnar Signed-off-by: Namhyung Kim Acked-by: Hitoshi Mitake Cc: Hitoshi Mitake Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340172486-17805-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-bench.txt | 78 ++++++++++++++++++++++++- tools/perf/bench/mem-memcpy.c | 4 +- tools/perf/bench/mem-memset.c | 8 +-- tools/perf/builtin-bench.c | 4 +- 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index a3dbadb26ef5..f3c716a4cad3 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -12,7 +12,7 @@ SYNOPSIS DESCRIPTION ----------- -This 'perf bench' command is general framework for benchmark suites. +This 'perf bench' command is a general framework for benchmark suites. COMMON OPTIONS -------------- @@ -45,14 +45,20 @@ SUBSYSTEM 'sched':: Scheduler and IPC mechanisms. +'mem':: + Memory access performance. + +'all':: + All benchmark subsystems. + SUITES FOR 'sched' ~~~~~~~~~~~~~~~~~~ *messaging*:: Suite for evaluating performance of scheduler and IPC mechanisms. Based on hackbench by Rusty Russell. -Options of *pipe* -^^^^^^^^^^^^^^^^^ +Options of *messaging* +^^^^^^^^^^^^^^^^^^^^^^ -p:: --pipe:: Use pipe() instead of socketpair() @@ -115,6 +121,72 @@ Example of *pipe* 59004 ops/sec --------------------- +SUITES FOR 'mem' +~~~~~~~~~~~~~~~~ +*memcpy*:: +Suite for evaluating performance of simple memory copy in various ways. + +Options of *memcpy* +^^^^^^^^^^^^^^^^^^^ +-l:: +--length:: +Specify length of memory to copy (default: 1MB). +Available units are B, KB, MB, GB and TB (case insensitive). + +-r:: +--routine:: +Specify routine to copy (default: default). +Available routines are depend on the architecture. +On x86-64, x86-64-unrolled, x86-64-movsq and x86-64-movsb are supported. + +-i:: +--iterations:: +Repeat memcpy invocation this number of times. + +-c:: +--clock:: +Use perf's cpu-cycles event instead of gettimeofday syscall. + +-o:: +--only-prefault:: +Show only the result with page faults before memcpy. + +-n:: +--no-prefault:: +Show only the result without page faults before memcpy. + +*memset*:: +Suite for evaluating performance of simple memory set in various ways. + +Options of *memset* +^^^^^^^^^^^^^^^^^^^ +-l:: +--length:: +Specify length of memory to set (default: 1MB). +Available units are B, KB, MB, GB and TB (case insensitive). + +-r:: +--routine:: +Specify routine to set (default: default). +Available routines are depend on the architecture. +On x86-64, x86-64-unrolled, x86-64-stosq and x86-64-stosb are supported. + +-i:: +--iterations:: +Repeat memset invocation this number of times. + +-c:: +--clock:: +Use perf's cpu-cycles event instead of gettimeofday syscall. + +-o:: +--only-prefault:: +Show only the result with page faults before memset. + +-n:: +--no-prefault:: +Show only the result without page faults before memset. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 71557225bf92..d990365cafa0 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -32,13 +32,13 @@ static bool no_prefault; static const struct option options[] = { OPT_STRING('l', "length", &length_str, "1MB", "Specify length of memory to copy. 
" - "available unit: B, MB, GB (upper and lower)"), + "Available units: B, KB, MB, GB and TB (upper and lower)"), OPT_STRING('r', "routine", &routine, "default", "Specify routine to copy"), OPT_INTEGER('i', "iterations", &iterations, "repeat memcpy() invocation this number of times"), OPT_BOOLEAN('c', "clock", &use_clock, - "Use CPU clock for measuring"), + "Use cycles event instead of gettimeofday() for measuring"), OPT_BOOLEAN('o', "only-prefault", &only_prefault, "Show only the result with page faults before memcpy()"), OPT_BOOLEAN('n', "no-prefault", &no_prefault, diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c index e9079185bd72..bf0d5f552017 100644 --- a/tools/perf/bench/mem-memset.c +++ b/tools/perf/bench/mem-memset.c @@ -31,14 +31,14 @@ static bool no_prefault; static const struct option options[] = { OPT_STRING('l', "length", &length_str, "1MB", - "Specify length of memory to copy. " - "available unit: B, MB, GB (upper and lower)"), + "Specify length of memory to set. " + "Available units: B, KB, MB, GB and TB (upper and lower)"), OPT_STRING('r', "routine", &routine, "default", - "Specify routine to copy"), + "Specify routine to set"), OPT_INTEGER('i', "iterations", &iterations, "repeat memset() invocation this number of times"), OPT_BOOLEAN('c', "clock", &use_clock, - "Use CPU clock for measuring"), + "Use cycles event instead of gettimeofday() for measuring"), OPT_BOOLEAN('o', "only-prefault", &only_prefault, "Show only the result with page faults before memset()"), OPT_BOOLEAN('n', "no-prefault", &no_prefault, diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index b0e74ab2d7a2..1f3100216448 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -33,7 +33,7 @@ struct bench_suite { }; \ /* sentinel: easy for help */ -#define suite_all { "all", "test all suite (pseudo suite)", NULL } +#define suite_all { "all", "Test all benchmark suites", NULL } static struct bench_suite sched_suites[] = { { "messaging", @@ -75,7 +75,7 @@ static struct bench_subsys subsystems[] = { "memory access performance", mem_suites }, { "all", /* sentinel: easy for help */ - "test all subsystem (pseudo subsystem)", + "all benchmark subsystem", NULL }, { NULL, NULL, From 300aa941650e98966ad85847527537df5b11a87e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 11 Jun 2012 13:48:41 -0600 Subject: [PATCH 176/612] perf report: Delay sample_type checks in pipe mode The pipeline: perf record -a -g -o - sleep 5 |perf inject -v -b | perf report -g -i - generates the warning: Selected -g but no callchain data. Did you call 'perf record' without -g? The problem is that the header data is not written to the pipe, so the sample_type has not been available when perf_report__setup_sample_type is called. For pipe mode, record dumps the sample type as part of the synthesized events stream -- perf_event__synthesize_attrs(). Handle this be detecting pipe mode and not doing early sanity checks on sample_type. 
Signed-off-by: David Ahern Tested-by: Tim Chen Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/1339444121-26236-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 40b0ffc3ad3b..69b1c1185159 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -245,11 +245,12 @@ static int process_read_event(struct perf_tool *tool, return 0; } +/* For pipe mode, sample_type is not currently set */ static int perf_report__setup_sample_type(struct perf_report *rep) { struct perf_session *self = rep->session; - if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) { + if (!self->fd_pipe && !(self->sample_type & PERF_SAMPLE_CALLCHAIN)) { if (sort__has_parent) { ui__error("Selected --sort parent, but no " "callchain data. Did you call " @@ -272,7 +273,8 @@ static int perf_report__setup_sample_type(struct perf_report *rep) } if (sort__branch_mode == 1) { - if (!(self->sample_type & PERF_SAMPLE_BRANCH_STACK)) { + if (!self->fd_pipe && + !(self->sample_type & PERF_SAMPLE_BRANCH_STACK)) { ui__error("Selected -b but no branch data. " "Did you call perf record without -b?\n"); return -1; From d9873ab79376d5c0112ed09e14783067dc65e808 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 11 Jun 2012 19:13:32 -0600 Subject: [PATCH 177/612] perf tools: Trivial build fix References to OUTPUT should not be followed by a '/'. When a build output directory is not specified for this case you get: gcc -o builtin-annotate.o -c ... -I/util ... which is wrong. Signed-off-by: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/1339463612-30937-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index d698c118a602..75d74e5db8d5 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -155,7 +155,7 @@ endif ### --- END CONFIGURATION SECTION --- -BASIC_CFLAGS = -Iutil/include -Iarch/$(ARCH)/include -I$(OUTPUT)/util -I$(TRACE_EVENT_DIR) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE +BASIC_CFLAGS = -Iutil/include -Iarch/$(ARCH)/include -I$(OUTPUT)util -I$(TRACE_EVENT_DIR) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE BASIC_LDFLAGS = # Guard against environment variables From 0be61ebc18b919dddbdbcd1c4f42513c310ecf59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Jun 2012 09:28:16 -0400 Subject: [PATCH 178/612] tracing/selftest: Add a WARN_ON() if a tracer test fails Add a WARN_ON() output on test failures so that they are easier to detect in automated tests. Although, the WARN_ON() will not print if the test causes the system to crash, obviously. 
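For illustration (not part of the patch), an automated harness can then catch a failing selftest by scanning the kernel log; the file/line and offsets below are hypothetical:

  $ dmesg | grep -B1 'WARNING:'
  Testing tracer function: FAILED!
  WARNING: at kernel/trace/trace.c:833 register_tracer+0x1a2/0x230()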
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..748f6401edf6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ From b102f1d0f1cd0bb5ec82e5aeb1e33502d6ad6710 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Jun 2012 09:41:39 +0900 Subject: [PATCH 179/612] tracing/kvm: Use __print_hex() for kvm_emulate_insn tracepoint The kvm_emulate_insn tracepoint used __print_insn() for printing its instructions. However it makes the format of the event hard to parse as it reveals TP internals. Fortunately, kernel provides __print_hex for almost same purpose, we can use it instead of open coding it. The user-space can be changed to parse it later. That means raw kernel tracing will not be affected by this change: # cd /sys/kernel/debug/tracing/ # cat events/kvm/kvm_emulate_insn/format name: kvm_emulate_insn ID: 29 format: ... print fmt: "%x:%llx:%s (%s)%s", REC->csbase, REC->rip, __print_hex(REC->insn, REC->len), \ __print_symbolic(REC->flags, { 0, "real" }, { (1 << 0) | (1 << 1), "vm16" }, \ { (1 << 0), "prot16" }, { (1 << 0) | (1 << 2), "prot32" }, { (1 << 0) | (1 << 3), "prot64" }), \ REC->failed ? " failed" : "" # echo 1 > events/kvm/kvm_emulate_insn/enable # cat trace # tracer: nop # # entries-in-buffer/entries-written: 2183/2183 #P:12 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | qemu-kvm-1782 [002] ...1 140.931636: kvm_emulate_insn: 0:c102fa25:89 10 (prot32) qemu-kvm-1781 [004] ...1 140.931637: kvm_emulate_insn: 0:c102fa25:89 10 (prot32) Link: http://lkml.kernel.org/n/tip-wfw6y3b9ugtey8snaow9nmg5@git.kernel.org Link: http://lkml.kernel.org/r/1340757701-10711-2-git-send-email-namhyung@kernel.org Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Namhyung Kim Cc: kvm@vger.kernel.org Acked-by: Avi Kivity Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- arch/x86/kvm/trace.h | 12 +----------- include/trace/ftrace.h | 1 + 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14c..62d02e3c3ed6 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -710,16 +710,6 @@ TRACE_EVENT(kvm_skinit, __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) @@ -786,7 +776,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), + __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? 
" failed" : "" diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 769724944fc6..c6bc2faaf261 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -571,6 +571,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_flags #undef __print_symbolic +#undef __print_hex #undef __get_dynamic_array #undef __get_str From 6d158a813efcd09661c23f16ddf7e2ff834cb20c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Jun 2012 20:46:14 -0400 Subject: [PATCH 180/612] tracing: Remove NR_CPUS array from trace_iterator Replace the NR_CPUS array of buffer_iter from the trace_iterator with an allocated array. This will just create an array of possible CPUS instead of the max number specified. The use of NR_CPUS in that array caused allocation failures for machines that were tight on memory. This did not cause any failures to the system itself (no crashes), but caused unnecessary failures for reading the trace files. Added a helper function called 'trace_buffer_iter()' that returns the buffer_iter item or NULL if it is not defined or the array was not allocated. Some routines do not require the array (tracing_open_pipe() for one). Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 27 ++++++++++++++++++--------- kernel/trace/trace.h | 8 ++++++++ kernel/trace/trace_functions_graph.c | 2 +- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1aff18346c71..af961d6f7ab1 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -65,7 +65,7 @@ struct trace_iterator { void *private; int cpu_file; struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; + struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; /* trace_seq for __print_flags() and __print_symbolic() etc. 
*/ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 748f6401edf6..b2af14e94c28 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1710,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1720,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1858,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2207,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2223,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2383,6 +2388,8 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. 
@@ -2443,6 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2483,6 +2491,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) From a5fb833172eca69136e9ee1ada778e404086ab8a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jun 2012 13:35:04 -0400 Subject: [PATCH 181/612] ring-buffer: Fix uninitialized read_stamp The ring buffer reader page is used to swap a page from the writable ring buffer. If the writer happens to be on that page, it ends up on the reader page, but will simply move off of it, back into the writable ring buffer as writes are added. The time stamp passed back to the readers is stored in the cpu_buffer per CPU descriptor. This stamp is updated when a swap of the reader page takes place, and it reads the current stamp from the page taken from the writable ring buffer. Every time a writer goes to a new page, it updates the time stamp of that page. The problem happens if a reader reads a page from an empty per CPU ring buffer. If the buffer is empty, the swap still takes place, placing the writer at the start of the reader page. If a write happens at a later time, it updates the page's time stamp and continues. But the problem is that the read_stamp does not get updated, because the page was already swapped. The solution is to not swap the page if the ring buffer happens to be empty. This also removes the side effect that writes left on the reader page never get updated, because the writer never gets back on the reader page without a swap. That is, suppose a read happens on an empty buffer and then no reads happen for a while. If a swap took place, and the writer were to start writing a lot of data (function tracer), it would start overflowing the ring buffer and overwriting the older data. But because the writer never goes back onto the reader page, the data left on the reader page never gets overwritten. This causes the reader to see really old data, followed by a jump to newer data. 
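In sketch form (condensed from the diff that follows, not the complete function), the swap path in rb_get_reader_page() now bails out before touching the reader page when there is nothing to read:

	if (cpu_buffer->commit_page == cpu_buffer->reader_page)
		goto out;	/* writer is still on the reader page */
	if (rb_num_of_entries(cpu_buffer) == 0)
		goto out;	/* empty: skip the swap, so read_stamp cannot go stale */
	/* ... otherwise perform the swap, which refreshes the read stamp ... */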
Link: http://lkml.kernel.org/r/1340060577-9112-1-git-send-email-dhsharp@google.com Google-Bug-Id: 6410455 Reported-by: David Sharp tested-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..82a3e0c56b1d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. */ From f526a4ce2643e2636c091cf2bf0ec1442110c5b7 Mon Sep 17 00:00:00 2001 From: Konstantin Stepanyuk Date: Wed, 27 Jun 2012 15:52:14 -0600 Subject: [PATCH 182/612] tools lib traceevent: Fix clean target in Makefile Dependency files were not cleaned up. Add missing space to fix the issue. Signed-off-by: Konstantin Stepanyuk Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1340833934-18783-1-git-send-email-konstantin.stepanyuk@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index 3d69aa9ff51e..423f4b81ecce 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -290,7 +290,7 @@ install_lib: all_cmd install_plugins install_python install: install_lib clean: - $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES).*.d + $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d $(RM) tags TAGS endif # skip-makefile From 6545e3a8f0666b60b26202a5271a94f7cd9601a8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jun 2012 17:10:14 +0900 Subject: [PATCH 183/612] tools lib traceevent: Teach [ce]tags about libtraceeevent error codes As we use a macro trick to sync each error codes with its description string, teach [ce]tags to process them properly. This patch modifies the libtraceevent's Makefile not a kernel one. Suggested-by: Steven Rostedt Signed-off-by: Namhyung Kim Link: http://lkml.kernel.org/n/tip-3101nsbg52glxdqih291qj74@git.kernel.org Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1340352615-20737-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index 423f4b81ecce..34a577e7d554 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -270,11 +270,13 @@ endif tags: force $(RM) tags - find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px + find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \ + --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' TAGS: force $(RM) TAGS - find . -name '*.[ch]' | xargs etags + find . 
-name '*.[ch]' | xargs etags \ + --regex='/_PE(\([^,)]*\).*/PEVENT_ERRNO__\1/' define do_install $(print_install) \ From 860df5833e461beba4bf9f3304a7919da98fd789 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jun 2012 14:37:36 +0900 Subject: [PATCH 184/612] tools lib traceevent: Make dependency files regeneratable Ingo reported that libtraceevent doesn't clean out dependency (.d) files, and that this can cause a build error when the libgcc package is upgraded: comet:~/tip/tools/perf> make -j SUBDIR ../lib/traceevent/ make[1]: *** No rule to make target `/usr/lib/gcc/x86_64-redhat-linux/4.7.0/include/stddef.h', needed by `event-parse.o'. Stop. make: *** [../lib/traceevent//libtraceevent.a] Error 2 So this patch makes the .d files also depend on the source and header files, so that they can be regenerated as needed. NOTE: This code is copied from the GNU make manual (4.14 Generating Prerequisites Automatically). Reported-by: Ingo Molnar Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Ingo Molnar Cc: Jiri Olsa Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340343462-15556-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index 34a577e7d554..46c2f6b7b123 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -250,8 +250,12 @@ endef all_objs := $(sort $(ALL_OBJS)) all_deps := $(all_objs:%.o=.%.d) +# let .d file also depends on the source and header files define check_deps - $(CC) -M $(CFLAGS) $< > $@; + @set -e; $(RM) $@; \ + $(CC) -M $(CFLAGS) $< > $@.$$$$; \ + sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ + $(RM) $@.$$$$ endef $(gui_deps): ks_version.h From 600da3cfe19496485c5d8d52ff703590a0bd53f6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jun 2012 17:10:15 +0900 Subject: [PATCH 185/612] tools lib traceevent: Check string is really printable When libtraceevent parses format fields, it assumes that an array of 1 byte is a string, but that's not always true. The kvm_emulate_insn event contains a 15-byte u8 array, insn, holding (binary) instructions. 
Thus when it's printed, it'll have broken output like below: kvm_emulate_insn: [FAILED TO PARSE] rip=3238197797 csbase=0 len=2 \ insn=<89>P^]& flags=5 failed=0 With this patch: kvm_emulate_insn: [FAILED TO PARSE] rip=3238197797 csbase=0 len=2 \ insn=ARRAY[89, 10, 5d, c3, 8d, b4, 26, 00, 00, 00, 00, 55, 89, e5, 3e] flags=5 failed=0 Suggested-by: Steven Rostedt Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1340352615-20737-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 554828219c33..b203a50c0f22 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3589,6 +3589,16 @@ static void print_mac_arg(struct trace_seq *s, int mac, void *data, int size, trace_seq_printf(s, fmt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]); } +static int is_printable_array(char *p, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len && p[i]; i++) + if (!isprint(p[i])) + return 0; + return 1; +} + static void print_event_fields(struct trace_seq *s, void *data, int size, struct event_format *event) { @@ -3608,7 +3618,8 @@ static void print_event_fields(struct trace_seq *s, void *data, int size, len = offset >> 16; offset &= 0xffff; } - if (field->flags & FIELD_IS_STRING) { + if (field->flags & FIELD_IS_STRING && + is_printable_array(data + offset, len)) { trace_seq_printf(s, "%s", (char *)data + offset); } else { trace_seq_puts(s, "ARRAY["); @@ -3619,6 +3630,7 @@ static void print_event_fields(struct trace_seq *s, void *data, int size, *((unsigned char *)data + offset + i)); } trace_seq_putc(s, ']'); + field->flags &= ~FIELD_IS_STRING; } } else { val = pevent_read_number(event->pevent, data + field->offset, From b700807196ac8d87e00fed9fda80ab89b7f56db6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Jun 2012 09:41:40 +0900 Subject: [PATCH 186/612] tools lib traceevent: Use local variable 'field' Use a local variable 'field' to reduce typing. It is also needed by a later patch to keep lines within 80 columns. 
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1340757701-10711-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index b203a50c0f22..63d02be5480f 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3214,6 +3214,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, { struct pevent *pevent = event->pevent; struct print_flag_sym *flag; + struct format_field *field; unsigned long long val, fval; unsigned long addr; char *str; @@ -3228,27 +3229,29 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, print_str_to_seq(s, format, len_arg, arg->atom.atom); return; case PRINT_FIELD: - if (!arg->field.field) { - arg->field.field = pevent_find_any_field(event, arg->field.name); - if (!arg->field.field) + field = arg->field.field; + if (!field) { + field = pevent_find_any_field(event, arg->field.name); + if (!field) die("field %s not found", arg->field.name); + arg->field.field = field; } /* Zero sized fields, mean the rest of the data */ - len = arg->field.field->size ? : size - arg->field.field->offset; + len = field->size ? : size - field->offset; /* * Some events pass in pointers. If this is not an array * and the size is the same as long_size, assume that it * is a pointer. */ - if (!(arg->field.field->flags & FIELD_IS_ARRAY) && - arg->field.field->size == pevent->long_size) { - addr = *(unsigned long *)(data + arg->field.field->offset); + if (!(field->flags & FIELD_IS_ARRAY) && + field->size == pevent->long_size) { + addr = *(unsigned long *)(data + field->offset); trace_seq_printf(s, "%lx", addr); break; } str = malloc_or_die(len + 1); - memcpy(str, data + arg->field.field->offset, len); + memcpy(str, data + field->offset, len); str[len] = 0; print_str_to_seq(s, format, len_arg, str); free(str); From e080e6f1c863242ff709046d0486d09c46dc484a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Jun 2012 09:41:41 +0900 Subject: [PATCH 187/612] tools lib traceevent: Add support for __print_hex() Since the __print_hex() function is used in print fmt now, add corresponding parser routines. This makes the output of perf script on the kvm_emulate_insn event not to fail any more. before: kvm_emulate_insn: [FAILED TO PARSE] rip=3238197797 ... 
after: kvm_emulate_insn: 0:c102fa25:89 10 (prot32) Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1340757701-10711-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 75 ++++++++++++++++++- tools/lib/traceevent/event-parse.h | 7 ++ .../util/scripting-engines/trace-event-perl.c | 4 + .../scripting-engines/trace-event-python.c | 4 + 4 files changed, 89 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 63d02be5480f..b1abd39923d6 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -697,6 +697,10 @@ static void free_arg(struct print_arg *arg) free_arg(arg->symbol.field); free_flag_sym(arg->symbol.symbols); break; + case PRINT_HEX: + free_arg(arg->hex.field); + free_arg(arg->hex.size); + break; case PRINT_TYPE: free(arg->typecast.type); free_arg(arg->typecast.item); @@ -2259,6 +2263,45 @@ process_symbols(struct event_format *event, struct print_arg *arg, char **tok) return EVENT_ERROR; } +static enum event_type +process_hex(struct event_format *event, struct print_arg *arg, char **tok) +{ + struct print_arg *field; + enum event_type type; + char *token; + + memset(arg, 0, sizeof(*arg)); + arg->type = PRINT_HEX; + + field = alloc_arg(); + type = process_arg(event, field, &token); + + if (test_type_token(type, token, EVENT_DELIM, ",")) + goto out_free; + + arg->hex.field = field; + + free_token(token); + + field = alloc_arg(); + type = process_arg(event, field, &token); + + if (test_type_token(type, token, EVENT_DELIM, ")")) + goto out_free; + + arg->hex.size = field; + + free_token(token); + type = read_token_item(tok); + return type; + + out_free: + free_arg(field); + free_token(token); + *tok = NULL; + return EVENT_ERROR; +} + static enum event_type process_dynamic_array(struct event_format *event, struct print_arg *arg, char **tok) { @@ -2488,6 +2531,10 @@ process_function(struct event_format *event, struct print_arg *arg, is_symbolic_field = 1; return process_symbols(event, arg, tok); } + if (strcmp(token, "__print_hex") == 0) { + free_token(token); + return process_hex(event, arg, tok); + } if (strcmp(token, "__get_str") == 0) { free_token(token); return process_str(event, arg, tok); @@ -2995,6 +3042,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg break; case PRINT_FLAGS: case PRINT_SYMBOL: + case PRINT_HEX: break; case PRINT_TYPE: val = eval_num_arg(data, size, event, arg->typecast.item); @@ -3218,8 +3266,9 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, unsigned long long val, fval; unsigned long addr; char *str; + unsigned char *hex; int print; - int len; + int i, len; switch (arg->type) { case PRINT_NULL: @@ -3284,6 +3333,23 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, } } break; + case PRINT_HEX: + field = arg->hex.field->field.field; + if (!field) { + str = arg->hex.field->field.name; + field = pevent_find_any_field(event, str); + if (!field) + die("field %s not found", str); + arg->hex.field->field.field = field; + } + hex = data + field->offset; + len = eval_num_arg(data, size, event, arg->hex.size); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(s, ' '); + trace_seq_printf(s, "%02x", hex[i]); + } + break; case PRINT_TYPE: break; @@ -4294,6 +4360,13 @@ static void print_args(struct print_arg *args) trace_seq_destroy(&s); 
printf(")"); break; + case PRINT_HEX: + printf("__print_hex("); + print_args(args->hex.field); + printf(", "); + print_args(args->hex.size); + printf(")"); + break; case PRINT_STRING: case PRINT_BSTRING: printf("__get_str(%s)", args->string.string); diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index ac997bc7b592..5772ad8cb386 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -226,6 +226,11 @@ struct print_arg_symbol { struct print_flag_sym *symbols; }; +struct print_arg_hex { + struct print_arg *field; + struct print_arg *size; +}; + struct print_arg_dynarray { struct format_field *field; struct print_arg *index; @@ -253,6 +258,7 @@ enum print_arg_type { PRINT_FIELD, PRINT_FLAGS, PRINT_SYMBOL, + PRINT_HEX, PRINT_TYPE, PRINT_STRING, PRINT_BSTRING, @@ -270,6 +276,7 @@ struct print_arg { struct print_arg_typecast typecast; struct print_arg_flags flags; struct print_arg_symbol symbol; + struct print_arg_hex hex; struct print_arg_func func; struct print_arg_string string; struct print_arg_op op; diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b3620fe12763..02dfa19a467f 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -209,6 +209,10 @@ static void define_event_symbols(struct event_format *event, define_symbolic_values(args->symbol.symbols, ev_name, cur_field_name); break; + case PRINT_HEX: + define_event_symbols(event, ev_name, args->hex.field); + define_event_symbols(event, ev_name, args->hex.size); + break; case PRINT_BSTRING: case PRINT_DYNAMIC_ARRAY: case PRINT_STRING: diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index a8ca2f8179a9..ce4d1b0c3862 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -166,6 +166,10 @@ static void define_event_symbols(struct event_format *event, define_values(PRINT_SYMBOL, args->symbol.symbols, ev_name, cur_field_name); break; + case PRINT_HEX: + define_event_symbols(event, ev_name, args->hex.field); + define_event_symbols(event, ev_name, args->hex.size); + break; case PRINT_STRING: break; case PRINT_TYPE: From 50d8f9e393c5a1a8864fda75e3a9f2b800497a61 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Jun 2012 15:28:53 +0900 Subject: [PATCH 188/612] tools lib traceevent: Replace malloc_or_die to plain malloc in alloc_event() Because the only caller of the alloc_event() (pevent_parse_event) checks return value properly, it can be changed to use plain malloc. 
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1339396133-9839-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index b1abd39923d6..83f0a8add177 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -616,7 +616,9 @@ static struct event_format *alloc_event(void) { struct event_format *event; - event = malloc_or_die(sizeof(*event)); + event = malloc(sizeof(*event)); + if (!event) + return NULL; memset(event, 0, sizeof(*event)); return event; From 7582732f57e547929d4c65ad8e38f3446e0538d8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 29 Jun 2012 09:22:54 +0200 Subject: [PATCH 189/612] perf tools: Fix hw breakpoint's type modifier parsing Fixing the hw breakpoint's type modifier parsing to allow all possible combinations of 'rwx' characters. Adding automated tests to the parsing test suite. Reported-by: Jovi Zhang Original-patch-by: Namhyung Kim Signed-off-by: Jiri Olsa Cc: Jovi Zhang Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120629072254.GA940@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-test.c | 37 +++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 16 ++++++++++--- tools/perf/util/parse-events.l | 2 +- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index a0f61a2a6835..de81fe1f9329 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -181,6 +181,22 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist) return 0; } +static int test__checkevent_breakpoint_rw(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", + PERF_TYPE_BREAKPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 0 == evsel->attr.config); + TEST_ASSERT_VAL("wrong bp_type", + (HW_BREAKPOINT_R|HW_BREAKPOINT_W) == evsel->attr.bp_type); + TEST_ASSERT_VAL("wrong bp_len", + HW_BREAKPOINT_LEN_4 == evsel->attr.bp_len); + return 0; +} + static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist) { struct perf_evsel *evsel = list_entry(evlist->entries.next, @@ -352,6 +368,19 @@ static int test__checkevent_breakpoint_w_modifier(struct perf_evlist *evlist) return test__checkevent_breakpoint_w(evlist); } +static int test__checkevent_breakpoint_rw_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + + return test__checkevent_breakpoint_rw(evlist); +} + static int test__checkevent_pmu(struct perf_evlist *evlist) { @@ -585,6 +614,14 @@ static struct test__event_st test__events[] = { .name = "instructions:H", .check = test__checkevent_exclude_guest_modifier, }, + [26] = { + .name = "mem:0:rw", + .check = 
test__checkevent_breakpoint_rw, + }, + [27] = { + .name = "mem:0:rw:kp", + .check = test__checkevent_breakpoint_rw_modifier, + }, }; #define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st)) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 0cc27da30ddb..7ae76af709f1 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -383,21 +383,31 @@ parse_breakpoint_type(const char *type, struct perf_event_attr *attr) if (!type || !type[i]) break; +#define CHECK_SET_TYPE(bit) \ +do { \ + if (attr->bp_type & bit) \ + return -EINVAL; \ + else \ + attr->bp_type |= bit; \ +} while (0) + switch (type[i]) { case 'r': - attr->bp_type |= HW_BREAKPOINT_R; + CHECK_SET_TYPE(HW_BREAKPOINT_R); break; case 'w': - attr->bp_type |= HW_BREAKPOINT_W; + CHECK_SET_TYPE(HW_BREAKPOINT_W); break; case 'x': - attr->bp_type |= HW_BREAKPOINT_X; + CHECK_SET_TYPE(HW_BREAKPOINT_X); break; default: return -EINVAL; } } +#undef CHECK_SET_TYPE + if (!attr->bp_type) /* Default */ attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 488362e14133..a06689474210 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -76,7 +76,7 @@ num_hex 0x[a-fA-F0-9]+ num_raw_hex [a-fA-F0-9]+ name [a-zA-Z_*?][a-zA-Z0-9_*?]* modifier_event [ukhpGH]{1,8} -modifier_bp [rwx] +modifier_bp [rwx]{1,3} %% From 287e74aa3db097469bdca401f33f00ef20dc710d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 28 Jun 2012 23:18:49 +0200 Subject: [PATCH 190/612] perf evsel: Handle hw breakpoints event names in perf_evsel__name() Adding hw breakpoint events hook in the perf_evsel__name function, to display event names properly all over the perf tools. Updated hw breakpoints events tests. 
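For illustration, the name synthesis this adds, with attribute values taken from the updated tests below (the snippet itself is not part of the patch):

	attr.type    = PERF_TYPE_BREAKPOINT;
	attr.bp_addr = 0x0;
	attr.bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
	/* perf_evsel__name() now yields "mem:0x0:rw", plus any
	   modifier suffix, e.g. "mem:0x0:rw:kp" */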
Signed-off-by: Jiri Olsa Cc: Jovi Zhang Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340918329-3012-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 30 +++++++++++++++++++++++++++++ tools/perf/util/parse-events-test.c | 10 ++++++++++ tools/perf/util/parse-events.c | 4 +--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3d1f6968f175..e81771364867 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -15,6 +15,7 @@ #include "cpumap.h" #include "thread_map.h" #include "target.h" +#include "../../../include/linux/hw_breakpoint.h" #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define GROUP_FD(group_fd, cpu) (*(int *)xyarray__entry(group_fd, cpu, 0)) @@ -152,6 +153,31 @@ static int perf_evsel__sw_name(struct perf_evsel *evsel, char *bf, size_t size) return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); } +static int __perf_evsel__bp_name(char *bf, size_t size, u64 addr, u64 type) +{ + int r; + + r = scnprintf(bf, size, "mem:0x%" PRIx64 ":", addr); + + if (type & HW_BREAKPOINT_R) + r += scnprintf(bf + r, size - r, "r"); + + if (type & HW_BREAKPOINT_W) + r += scnprintf(bf + r, size - r, "w"); + + if (type & HW_BREAKPOINT_X) + r += scnprintf(bf + r, size - r, "x"); + + return r; +} + +static int perf_evsel__bp_name(struct perf_evsel *evsel, char *bf, size_t size) +{ + struct perf_event_attr *attr = &evsel->attr; + int r = __perf_evsel__bp_name(bf, size, attr->bp_addr, attr->bp_type); + return r + perf_evsel__add_modifiers(evsel, bf + r, size - r); +} + const char *perf_evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX] [PERF_EVSEL__MAX_ALIASES] = { { "L1-dcache", "l1-d", "l1d", "L1-data", }, @@ -285,6 +311,10 @@ const char *perf_evsel__name(struct perf_evsel *evsel) scnprintf(bf, sizeof(bf), "%s", "unknown tracepoint"); break; + case PERF_TYPE_BREAKPOINT: + perf_evsel__bp_name(evsel, bf, sizeof(bf)); + break; + default: scnprintf(bf, sizeof(bf), "%s", "unknown attr type"); break; diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index de81fe1f9329..dd0c306a0698 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -325,6 +325,8 @@ static int test__checkevent_breakpoint_modifier(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "mem:0x0:rw:u")); return test__checkevent_breakpoint(evlist); } @@ -338,6 +340,8 @@ static int test__checkevent_breakpoint_x_modifier(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "mem:0x0:x:k")); return test__checkevent_breakpoint_x(evlist); } @@ -351,6 +355,8 @@ static int test__checkevent_breakpoint_r_modifier(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "mem:0x0:r:hp")); return 
test__checkevent_breakpoint_r(evlist); } @@ -364,6 +370,8 @@ static int test__checkevent_breakpoint_w_modifier(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "mem:0x0:w:up")); return test__checkevent_breakpoint_w(evlist); } @@ -377,6 +385,8 @@ static int test__checkevent_breakpoint_rw_modifier(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(perf_evsel__name(evsel), "mem:0x0:rw:kp")); return test__checkevent_breakpoint_rw(evlist); } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 7ae76af709f1..1dc44dc69133 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -418,7 +418,6 @@ int parse_events_add_breakpoint(struct list_head **list, int *idx, void *ptr, char *type) { struct perf_event_attr attr; - char name[MAX_NAME_LEN]; memset(&attr, 0, sizeof(attr)); attr.bp_addr = (unsigned long) ptr; @@ -437,8 +436,7 @@ int parse_events_add_breakpoint(struct list_head **list, int *idx, attr.type = PERF_TYPE_BREAKPOINT; - snprintf(name, MAX_NAME_LEN, "mem:%p:%s", ptr, type ? type : "rw"); - return add_event(list, idx, &attr, name); + return add_event(list, idx, &attr, NULL); } static int config_term(struct perf_event_attr *attr, From 9bc8f9fe2c6e3778202c76ef85ef291567c00cb8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 14 Jun 2012 22:38:37 +0200 Subject: [PATCH 191/612] perf tools: Fix generation of pmu list The internal pmu list was never used. With each perf_pmu__find() call the pmu structure was created anew by parsing sysfs. Besides this, it caused memory leaks. We now keep all pmus by adding them to the list. Also, pmu_lookup() should return pmus that do not expose the format specifier in sysfs. We need a valid internal pmu list in a later patch to iterate over all pmus that exist in the system. Signed-off-by: Robert Richter Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1339706321-8802-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 74d0948ec368..67715a42cd6d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -72,7 +72,7 @@ static int pmu_format(char *name, struct list_head *format) "%s/bus/event_source/devices/%s/format", sysfs, name); if (stat(path, &st) < 0) - return -1; + return 0; /* no error if format does not exist */ if (pmu_format_parse(path, format)) return -1; @@ -252,6 +252,7 @@ static struct perf_pmu *pmu_lookup(char *name) list_splice(&aliases, &pmu->aliases); pmu->name = strdup(name); pmu->type = type; + list_add_tail(&pmu->list, &pmus); return pmu; } From 1388d715dd7d0f494c93dfdef6ab26719218b868 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 19 Jun 2012 17:48:05 +0200 Subject: [PATCH 192/612] perf symbols: Add '.note' check into search for NOTE section Adding the '.note' section name to be checked when looking for the notes section. The '.note' name is used by the kernel VDSO. 
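For reference, such a VDSO image carries its notes in a plain '.note' section, which can be checked with readelf (output shape illustrative and abridged):

  $ readelf -S vdso.so | grep -i note
    [ 3] .note             NOTE             ...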
Signed-off-by: Jiri Olsa Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1340120894-9465-15-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 994f4ffdcd05..50958bbeb26a 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1478,14 +1478,31 @@ static int elf_read_build_id(Elf *elf, void *bf, size_t size) goto out; } - sec = elf_section_by_name(elf, &ehdr, &shdr, - ".note.gnu.build-id", NULL); - if (sec == NULL) { + /* + * Check following sections for notes: + * '.note.gnu.build-id' + * '.notes' + * '.note' (VDSO specific) + */ + do { + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".note.gnu.build-id", NULL); + if (sec) + break; + sec = elf_section_by_name(elf, &ehdr, &shdr, ".notes", NULL); - if (sec == NULL) - goto out; - } + if (sec) + break; + + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".note", NULL); + if (sec) + break; + + return err; + + } while (0); data = elf_getdata(sec, NULL); if (data == NULL) From 339ce005091b156c2af4c016c6ba9c1f87cd826a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 19 Jun 2012 17:48:11 +0200 Subject: [PATCH 193/612] perf tools: Adding round_up/round_down macros Adding round_up and round_down macros. They will be used in upcoming patches. Signed-off-by: Jiri Olsa Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1340120894-9465-21-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/include/linux/kernel.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/util/include/linux/kernel.h b/tools/perf/util/include/linux/kernel.h index 1eb804fd3fbf..b6842c1d02a8 100644 --- a/tools/perf/util/include/linux/kernel.h +++ b/tools/perf/util/include/linux/kernel.h @@ -108,4 +108,14 @@ int eprintf(int level, #define pr_debug3(fmt, ...) pr_debugN(3, pr_fmt(fmt), ##__VA_ARGS__) #define pr_debug4(fmt, ...) pr_debugN(4, pr_fmt(fmt), ##__VA_ARGS__) +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + #endif From 44b99462d9d776522e174d6c531ce5ccef309e26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 22 Jun 2012 11:50:05 -0700 Subject: [PATCH 194/612] ring-buffer: Fix crash due to uninitialized new_pages list head The new_pages list head in the cpu_buffer is not initialized. When adding pages to the ring buffer, if the memory allocation fails in ring_buffer_resize, the clean up handler tries to free up the allocated pages from all the cpu buffers. The panic is caused by referencing the uninitialized new_pages list head. 
Initializing the new_pages list head in rb_allocate_cpu_buffer fixes this. Link: http://lkml.kernel.org/r/1340391005-10880-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..ba39cbabdc9f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) From 48fdc72f23ad9a9956e524a47843135d0bbc3317 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 29 Jun 2012 12:31:41 -0700 Subject: [PATCH 195/612] ring-buffer: Fix accounting of entries when removing pages When removing pages from the ring buffer, its state is not reset. This means that the counters need to be correctly updated to account for the pages removed. Update the overrun counter to reflect the removed events from the pages. Link: http://lkml.kernel.org/r/1340998301-1715-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba39cbabdc9f..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1347,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } From 954e482bde20b0e208fd4d34ef26e10afd194600 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 24 May 2012 18:19:45 -0700 Subject: [PATCH 196/612] x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature According to the Intel 64 and IA-32 SDM and Optimization Reference Manual, beginning with Ivybridge, REP string operations using MOVSB and STOSB can provide both flexible and high-performance string operations in cases like memory copy. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/STOSB). If the CPU erms feature is detected, patch copy_user_generic with the enhanced fast string version of copy_user_generic. A few new macros are defined to reduce duplicate code in ALTERNATIVE and ALTERNATIVE_2. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1337908785-14015-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/alternative.h | 74 ++++++++++++++++++++++++------ arch/x86/include/asm/uaccess_64.h | 11 ++++- arch/x86/kernel/x8664_ksyms_64.c | 1 + 3 files changed, 70 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc158..70780689599a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".previous" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + DISCARD_ENTRY(2) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".previous" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -139,6 +170,19 @@ static inline int alternatives_text_reserved(void *start, void *end) asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) +/* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ + output, input...) 
\ + asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + "call %P[new2]", feature2) \ + : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c6..d8def8b3dba0 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - alternative_call(copy_user_generic_unrolled, + /* + * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * Otherwise, use copy_user_generic_unrolled. + */ + alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d074..6020f6f5927c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); From c9fc3f778a6a215ace14ee556067c73982b6d40f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 Jun 2012 14:07:16 +0200 Subject: [PATCH 197/612] x86, microcode: Sanitize per-cpu microcode reloading interface Microcode reloading in a per-core manner is a very bad idea for both major x86 vendors. And the thing is, we have such interface with which we can end up with different microcode versions applied on different cores of an otherwise homogeneous wrt (family,model,stepping) system. So turn off the possibility of doing that per core and allow it only system-wide. This is a minimal fix which we'd like to see in stable too thus the more-or-less arbitrary decision to allow system-wide reloading only on the BSP: $ echo 1 > /sys/devices/system/cpu/cpu0/microcode/reload ... and disable the interface on the other cores: $ echo 1 > /sys/devices/system/cpu/cpu23/microcode/reload -bash: echo: write error: Invalid argument Also, allowing the reload only from one CPU (the BSP in that case) doesn't allow the reload procedure to degenerate into an O(n^2) deal when triggering reloads from all /sys/devices/system/cpu/cpuX/microcode/reload sysfs nodes simultaneously. A more generic fix will follow. Cc: Henrique de Moraes Holschuh Cc: Peter Zijlstra Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1340280437-7718-2-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin Cc: --- arch/x86/kernel/microcode_core.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc6917180..24b852b61be3 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -298,19 +298,31 @@ static ssize_t reload_store(struct device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; + + /* allow reload only from the BSP */ + if (boot_cpu_data.cpu_index != dev->id) + return -EINVAL; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + put_online_cpus(); if (!ret) ret = size; From 3d8986bc7f309483ee09c7a02888bab09072c19b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 Jun 2012 14:07:17 +0200 Subject: [PATCH 198/612] x86, microcode: Make reload interface per system The reload interface should be per-system so that a full system ucode reload happens (on each core) when doing echo 1 > /sys/devices/system/cpu/microcode/reload Move it to the cpu subsys directory instead of it being per-cpu. Cc: Henrique de Moraes Holschuh Cc: Peter Zijlstra Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1340280437-7718-3-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_core.c | 36 +++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 24b852b61be3..947e4c64b1d8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -301,10 +301,6 @@ static ssize_t reload_store(struct device *dev, int cpu; ssize_t ret = 0, tmp_ret; - /* allow reload only from the BSP */ - if (boot_cpu_data.cpu_index != dev->id) - return -EINVAL; - ret = kstrtoul(buf, 0, &val); if (ret) return ret; @@ -351,7 +347,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &dev_attr_reload.attr, &dev_attr_version.attr, &dev_attr_processor_flags.attr, NULL @@ -528,6 +523,16 @@ static const struct x86_cpu_id microcode_id[] = { MODULE_DEVICE_TABLE(x86cpu, microcode_id); #endif +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); @@ -559,9 +564,17 @@ static int __init microcode_init(void) if (error) goto out_pdev; + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + error = microcode_dev_init(); if (error) - goto out_driver; + goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -571,7 +584,11 @@ static int __init microcode_init(void) return 0; -out_driver: + out_ucode_group: + 
sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: get_online_cpus(); mutex_lock(&microcode_mutex); @@ -580,7 +597,7 @@ out_driver: mutex_unlock(&microcode_mutex); put_online_cpus(); -out_pdev: + out_pdev: platform_device_unregister(microcode_pdev); return error; @@ -596,6 +613,9 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); unregister_syscore_ops(&mc_syscore_ops); + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + get_online_cpus(); mutex_lock(&microcode_mutex); From 64941d8930e619ef37227f88c6ee17e2624c65d2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 2 Jul 2012 15:06:22 +0900 Subject: [PATCH 199/612] sh: Fix up se7721 GPIOLIB=y build warnings. Presently the SH7720/21 serial code uses asm/gpio.h to get at the CPU GPIO port definitions, but in the case of GPIOLIB=y this also includes references to generic GPIOLIB routines that we don't have any function declarations for, tripping up on -Werror=implicit-function-declaration with newer gcc versions: CC arch/sh/kernel/cpu/sh3/serial-sh7720.o In file included from include/linux/sh_pfc.h:14:0, from arch/sh/include/asm/gpio.h:23, from arch/sh/kernel/cpu/sh3/serial-sh7720.c:5: include/asm-generic/gpio.h: In function 'gpio_get_value_cansleep': include/asm-generic/gpio.h:220:2: error: implicit declaration of function '__gpio_get_value' [-Werror=implicit-function-declaration] include/asm-generic/gpio.h: In function 'gpio_set_value_cansleep': include/asm-generic/gpio.h:226:2: error: implicit declaration of function '__gpio_set_value' [-Werror=implicit-function-declaration] In file included from arch/sh/include/asm/gpio.h:23:0, from arch/sh/kernel/cpu/sh3/serial-sh7720.c:5: include/linux/sh_pfc.h: At top level: include/linux/sh_pfc.h:121:19: error: field 'chip' has incomplete type Switch to using the cpu/ version for the port definitions explicitly. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh3/serial-sh7720.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh3/serial-sh7720.c b/arch/sh/kernel/cpu/sh3/serial-sh7720.c index 8832c526cdf9..c4a0336660dd 100644 --- a/arch/sh/kernel/cpu/sh3/serial-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/serial-sh7720.c @@ -2,7 +2,7 @@ #include #include #include -#include <asm/gpio.h> +#include <cpu/gpio.h> static void sh7720_sci_init_pins(struct uart_port *port, unsigned int cflag) { From 8f53dc724a83a0082184fa27df80c25c7df47340 Mon Sep 17 00:00:00 2001 From: Hiroshi DOYU Date: Wed, 27 Jun 2012 12:54:01 +0300 Subject: [PATCH 200/612] iommu/tegra: smmu: Fix unsleepable memory allocation alloc_pdir() is called in smmu_iommu_domain_init() with spin_lock held. Memory allocations in it have to be atomic/unsleepable.
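A minimal sketch of the constraint being fixed (illustrative only; the function and names below are invented, not the driver's code): a CPU holding a spinlock is in atomic context, so any allocation it performs must use GFP_ATOMIC, because GFP_KERNEL is allowed to sleep for memory reclaim.

#include <linux/slab.h>
#include <linux/spinlock.h>

/* Illustrative only: allocate a table while a spinlock is held. */
static int setup_table_locked(spinlock_t *lock, void **table, size_t size)
{
	void *p;

	spin_lock(lock);		/* atomic context from here on */
	p = kzalloc(size, GFP_ATOMIC);	/* GFP_KERNEL here could sleep */
	if (!p) {
		spin_unlock(lock);
		return -ENOMEM;
	}
	*table = p;
	spin_unlock(lock);
	return 0;
}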
Signed-off-by: Hiroshi DOYU Reported-by: Chris Wright Acked-by: Chris Wright Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-smmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index ecd679043d77..3f3d09d560ea 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -550,13 +550,13 @@ static int alloc_pdir(struct smmu_as *as) return 0; as->pte_count = devm_kzalloc(smmu->dev, - sizeof(as->pte_count[0]) * SMMU_PDIR_COUNT, GFP_KERNEL); + sizeof(as->pte_count[0]) * SMMU_PDIR_COUNT, GFP_ATOMIC); if (!as->pte_count) { dev_err(smmu->dev, "failed to allocate smmu_device PTE cunters\n"); return -ENOMEM; } - as->pdir_page = alloc_page(GFP_KERNEL | __GFP_DMA); + as->pdir_page = alloc_page(GFP_ATOMIC | __GFP_DMA); if (!as->pdir_page) { dev_err(smmu->dev, "failed to allocate smmu_device page directory\n"); From 68ee6d22376411f8ec668413f1b632a34192a807 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 27 Jun 2012 12:08:55 +0300 Subject: [PATCH 201/612] dma-debug: debugfs_create_bool() takes a u32 pointer Even though it has "bool" in the name, you have to pass a u32 pointer to debugfs_create_bool(). Otherwise you get memory corruption in write_file_bool(). Fortunately in this case the corruption happens in an alignment hole between variables so it doesn't cause any problems. Signed-off-by: Dan Carpenter Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 518aea714d21..66ce41489133 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -78,7 +78,7 @@ static LIST_HEAD(free_entries); static DEFINE_SPINLOCK(free_entries_lock); /* Global disable flag - will be set in case of an error */ -static bool global_disable __read_mostly; +static u32 global_disable __read_mostly; /* Global error count */ static u32 error_count; @@ -657,7 +657,7 @@ static int dma_debug_fs_init(void) global_disable_dent = debugfs_create_bool("disabled", 0444, dma_debug_dent, - (u32 *)&global_disable); + &global_disable); if (!global_disable_dent) goto out_err; From 3775d4818d72081e2afa2aed2442a2b9ecfc5eab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 27 Jun 2012 12:09:18 +0300 Subject: [PATCH 202/612] iommu/amd: fix type bug in flush code write_file_bool() modifies 32 bits of data, so "amd_iommu_unmap_flush" needs to be 32 bits as well or we'll corrupt memory. Fortunately it looks like the data is aligned with a gap after the declaration so this is harmless in production.
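Both of these fixes enforce the same rule, sketched here for illustration (the flag and file names are invented; the debugfs_create_bool() signature is the 3.x one these patches target): the backing variable must be a full u32, because write_file_bool() stores 32 bits through the pointer it is given.

#include <linux/debugfs.h>

static u32 my_feature_disabled;	/* u32, not bool: 32 bits get written */

static struct dentry *my_debugfs_init(struct dentry *parent)
{
	/* Passing (u32 *)&some_bool here would clobber whatever bytes
	 * happen to follow the one-byte bool in memory. */
	return debugfs_create_bool("disabled", 0444, parent,
				   &my_feature_disabled);
}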
Signed-off-by: Dan Carpenter Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/amd_iommu_init.c | 2 +- drivers/iommu/amd_iommu_types.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index dfe7d37c82c5..625626391f2d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -404,7 +404,7 @@ static void amd_iommu_stats_init(void) return; de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, - (u32 *)&amd_iommu_unmap_flush); + &amd_iommu_unmap_flush); amd_iommu_stats_add(&compl_wait); amd_iommu_stats_add(&cnt_map_single); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c04ddca7f12f..a33612f3206f 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -129,7 +129,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ +u32 amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 24355559a2ad..c1b1d489817e 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -652,7 +652,7 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap; * If true, the addresses will be flushed on unmap time, not when * they are reused */ -extern bool amd_iommu_unmap_flush; +extern u32 amd_iommu_unmap_flush; /* Smallest number of PASIDs supported by any IOMMU in the system */ extern u32 amd_iommu_max_pasids; From 76a8349dfdb775d387e9767db3092e410403138a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 14 Jun 2012 12:36:17 -0600 Subject: [PATCH 203/612] perf script: Fix format regression due to libtraceevent merge Consider the commands: perf record -e sched:sched_switch -fo /tmp/perf.data -a -- sleep 1 perf script -i /tmp/perf.data In v3.4 the output has the form (lines wrapped here) perf 29214 [005] 821043.582596: sched_switch: prev_comm=perf prev_pid=29214 prev_prio=120 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 In 3.5 that same line has become: perf 29214 [005] 821043.582596: sched_switch: <...>-29214 [005] 0.000000000: sched_switch: prev_comm=perf prev_pid=29214 prev_prio=120 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 Note the duplicates in the output -- pid, cpu, event name. With this patch the v3.4 output is restored: perf 29214 [005] 821043.582596: sched_switch: prev_comm=perf prev_pid=29214 prev_prio=120 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 v3: Remove that pesky newline too. Output now matches v3.4 (pre-libtraceevent). v2: Change print_trace_event function local to perf per Steve's comments.
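In sketch form, the idea of the fix (the library calls are exactly those in the diff below; the wrapper function is invented, and perf's real code also fills in the record from the sampled data): perf's own layer already printed the comm/pid/cpu/timestamp header, so only the parsed event payload should be appended.

/* Assumes libtraceevent's event-parse.h, as used by perf. */
static void print_payload_only(struct event_format *event,
			       struct pevent_record *record)
{
	struct trace_seq s;

	trace_seq_init(&s);
	/* pevent_print_event() would re-emit the header perf already
	 * printed; pevent_event_info() emits only the field data. */
	pevent_event_info(&s, event, record);
	trace_seq_do_printf(&s);	/* the caller owns the newline */
}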
Signed-off-by: David Ahern Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1339698977-68962-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/trace-event-parse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index df2fddbf0cd2..5dd3b5ec8411 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -198,9 +198,8 @@ void print_trace_event(int cpu, void *data, int size) record.data = data; trace_seq_init(&s); - pevent_print_event(pevent, &s, &record); + pevent_event_info(&s, event, &record); trace_seq_do_printf(&s); - printf("\n"); } void print_event(int cpu, void *data, int size, unsigned long long nsecs, From 207b5792696206663a38e525b9793644895bad3b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 1 Jul 2012 16:11:37 -0600 Subject: [PATCH 204/612] perf kvm: Fix regression with guest machine creation Commit 743eb868657bdb1b26c7b24077ca21c67c82c777 reworked when the machines were created. Prior to this commit guest machines could be created in perf_event__process_kernel_mmap() while processing kernel MMAP events. This commit assumes that the machines exist by the time perf_session_deliver_event is called (e.g., during processing of build id events) - which is not always correct. One example is the use of default guest args (--guestkallsyms and --guestmodules) for short times where no samples hit within a guest module. For this case no build id is added to the file header. No build id == no machine created. That leads to the next example -- the use of no-buildid (-B) on the record for all perf-kvm invocations. 
In both cases perf report dies with a SEGFAULT of the form: (gdb) bt 0 0x000000000046dd7b in machine__mmap_name (self=0x0, bf=0x7fffffffbd20 "q\021", size=4096) at util/map.c:715 1 0x0000000000444161 in perf_event__process_kernel_mmap (tool=0x7fffffffdd80, event=0x7ffff7fb4120, machine=0x0) at util/event.c:562 2 0x0000000000444642 in perf_event__process_mmap (tool=0x7fffffffdd80, event=0x7ffff7fb4120, sample=0x7fffffffd210, machine=0x0) at util/event.c:668 3 0x0000000000470e0b in perf_session_deliver_event (session=0x915ca0, event=0x7ffff7fb4120, sample=0x7fffffffd210, tool=0x7fffffffdd80, file_offset=8480) at util/session.c:979 4 0x000000000047032e in flush_sample_queue (s=0x915ca0, tool=0x7fffffffdd80) at util/session.c:679 5 0x0000000000471c8d in __perf_session__process_events (session=0x915ca0, data_offset=400, data_size=150448, file_size=150848, tool= 0x7fffffffdd80) at util/session.c:1363 6 0x0000000000471d42 in perf_session__process_events (self=0x915ca0, tool=0x7fffffffdd80) at util/session.c:1379 7 0x000000000042484a in __cmd_report (rep=0x7fffffffdd80) at builtin-report.c:368 8 0x0000000000425bf1 in cmd_report (argc=0, argv=0x915b00, prefix=0x0) at builtin-report.c:756 9 0x0000000000438505 in __cmd_report (argc=4, argv=0x7fffffffe260) at builtin-kvm.c:84 10 0x000000000043882a in cmd_kvm (argc=4, argv=0x7fffffffe260, prefix=0x0) at builtin-kvm.c:131 11 0x00000000004152cd in run_builtin (p=0x7a54e8, argc=9, argv=0x7fffffffe260) at perf.c:273 12 0x00000000004154c7 in handle_internal_command (argc=9, argv=0x7fffffffe260) at perf.c:345 13 0x0000000000415613 in run_argv (argcp=0x7fffffffe14c, argv=0x7fffffffe140) at perf.c:389 14 0x0000000000415899 in main (argc=9, argv=0x7fffffffe260) at perf.c:487 Fix by allowing the machine to be created in perf_session_deliver_event. Tested with --guestmount option and default guest args, with and without -B arg on record for both and for short (10 seconds) and long (10 minutes) windows. 
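The shape of the fix, condensed (function names as in the one-line diff below; the wrapper is invented and everything else is elided):

/* Sketch: resolve the machine an event belongs to. */
static struct machine *deliver_event_machine(struct perf_session *session,
					     pid_t pid)
{
	/*
	 * perf_session__find_machine() returns NULL when no earlier
	 * event (e.g. a build-id) created this guest's machine; the
	 * findnew variant creates it on first sight instead, so later
	 * MMAP processing never sees a NULL machine.
	 */
	return perf_session__findnew_machine(session, pid);
}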
Reported-by: Pradeep Kumar Surisetty Signed-off-by: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Pradeep Kumar Surisetty Link: http://lkml.kernel.org/r/1341180697-64515-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index c3e399bcf18d..56142d0fb8d7 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -926,7 +926,7 @@ static struct machine * else pid = event->ip.pid; - return perf_session__find_machine(session, pid); + return perf_session__findnew_machine(session, pid); } return perf_session__find_host_machine(session); From 7ed97ad41ffa94040dfd593948962a7e9e7b0db9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 2 Jul 2012 09:12:57 -0600 Subject: [PATCH 205/612] perf kvm: Fix segfault with report and mixed guestmount use Using the guestmount option on record: $ perf kvm --guest --host --guestmount=/tmp/guest-mount record -ag But not the subsequent report: $ perf kvm report causes a SEGFAULT in the usual place: (gdb) bt 0 0x0000000000470356 in machine__mmap_name (self=0x0, bf=0x7fffffffbdb0 " z\370\367\377\177", size= 4096) at util/map.c:712 1 0x00000000004453e8 in perf_event__process_kernel_mmap (tool=0x7fffffffde10, event=0x7ffff7f87e38, machine=0x0) at util/event.c:550 2 0x00000000004458c9 in perf_event__process_mmap (tool=0x7fffffffde10, event=0x7ffff7f87e38, sample= 0x7fffffffd2a0, machine=0x0) at util/event.c:656 3 0x00000000004733e0 in perf_session_deliver_event (session=0x91aca0, event=0x7ffff7f87e38, sample= 0x7fffffffd2a0, tool=0x7fffffffde10, file_offset=7736) at util/session.c:979 ... The MMAP events in this case already contain the full path to the module. No need to require it for the report path too.
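Condensed from the map.c rework in the diff below (an abbreviated sketch; the declarations come from perf's internal headers), the new lookup defaults root_dir to the empty string and only insists on a readable guest mount when one was actually configured:

static struct machine *findnew_machine_sketch(struct rb_root *self, pid_t pid)
{
	char path[PATH_MAX];
	const char *root_dir = "";	/* default: trust paths in events */
	struct machine *machine = machines__find(self, pid);

	if (machine && machine->pid == pid)
		return machine;		/* already known */

	if (pid != HOST_KERNEL_ID && pid != DEFAULT_GUEST_KERNEL_ID &&
	    symbol_conf.guestmount) {
		sprintf(path, "%s/%d", symbol_conf.guestmount, pid);
		if (access(path, R_OK)) {
			pr_err("Can't access file %s\n", path);
			return NULL;	/* configured mount must be usable */
		}
		root_dir = path;
	}
	return machines__add(self, pid, root_dir);
}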
Signed-off-by: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1341241977-71535-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 35ae56864e4f..a1f4e3669142 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -669,25 +669,26 @@ struct machine *machines__find(struct rb_root *self, pid_t pid) struct machine *machines__findnew(struct rb_root *self, pid_t pid) { char path[PATH_MAX]; - const char *root_dir; + const char *root_dir = ""; struct machine *machine = machines__find(self, pid); - if (!machine || machine->pid != pid) { - if (pid == HOST_KERNEL_ID || pid == DEFAULT_GUEST_KERNEL_ID) - root_dir = ""; - else { - if (!symbol_conf.guestmount) - goto out; - sprintf(path, "%s/%d", symbol_conf.guestmount, pid); - if (access(path, R_OK)) { - pr_err("Can't access file %s\n", path); - goto out; - } - root_dir = path; + if (machine && (machine->pid == pid)) + goto out; + + if ((pid != HOST_KERNEL_ID) && + (pid != DEFAULT_GUEST_KERNEL_ID) && + (symbol_conf.guestmount)) { + sprintf(path, "%s/%d", symbol_conf.guestmount, pid); + if (access(path, R_OK)) { + pr_err("Can't access file %s\n", path); + machine = NULL; + goto out; } - machine = machines__add(self, pid, root_dir); + root_dir = path; } + machine = machines__add(self, pid, root_dir); + out: return machine; } From 17d7a1123f0f6d532830152564cc812cc73db2f3 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 2 Jul 2012 22:46:17 +0900 Subject: [PATCH 206/612] perf bench: Fix confused variable namings and descriptions in mem subsystem As Namhyung Kim pointed, there are confused namings and descriptions of words "cycle" and "clock" in mem-memset.c and mem-memcpy.c. With the option "-c" (or "--clock", now renamed as "--cycle"), mem subsystem measures cost of memset() and memcpy() with cpu-cycles event. But current mem subsystem source code contains lots of confused variable namings and descriptions with "clock" (e.g. the variable use_clock). This is a very bad style because there is another software event named "cpu-clock". This patch replaces wrong usage of "clock" to "cycle". v2: modified Documentation/perf-bench.txt for the descriptions of --cycle option Signed-off-by: Hitoshi Mitake Cc: Ingo Molnar Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1341236777-18457-1-git-send-email-h.mitake@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-bench.txt | 4 +- tools/perf/bench/mem-memcpy.c | 80 ++++++++++++------------- tools/perf/bench/mem-memset.c | 80 ++++++++++++------------- 3 files changed, 82 insertions(+), 82 deletions(-) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index f3c716a4cad3..7065cd6fbdfc 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -144,7 +144,7 @@ On x86-64, x86-64-unrolled, x86-64-movsq and x86-64-movsb are supported. Repeat memcpy invocation this number of times. -c:: ---clock:: +--cycle:: Use perf's cpu-cycles event instead of gettimeofday syscall. -o:: @@ -176,7 +176,7 @@ On x86-64, x86-64-unrolled, x86-64-stosq and x86-64-stosb are supported. Repeat memset invocation this number of times. -c:: ---clock:: +--cycle:: Use perf's cpu-cycles event instead of gettimeofday syscall. 
-o:: diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index d990365cafa0..02dad5d3359b 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -24,8 +24,8 @@ static const char *length_str = "1MB"; static const char *routine = "default"; static int iterations = 1; -static bool use_clock; -static int clock_fd; +static bool use_cycle; +static int cycle_fd; static bool only_prefault; static bool no_prefault; @@ -37,7 +37,7 @@ static const struct option options[] = { "Specify routine to copy"), OPT_INTEGER('i', "iterations", &iterations, "repeat memcpy() invocation this number of times"), - OPT_BOOLEAN('c', "clock", &use_clock, + OPT_BOOLEAN('c', "cycle", &use_cycle, "Use cycles event instead of gettimeofday() for measuring"), OPT_BOOLEAN('o', "only-prefault", &only_prefault, "Show only the result with page faults before memcpy()"), @@ -76,27 +76,27 @@ static const char * const bench_mem_memcpy_usage[] = { NULL }; -static struct perf_event_attr clock_attr = { +static struct perf_event_attr cycle_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }; -static void init_clock(void) +static void init_cycle(void) { - clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0); + cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); - if (clock_fd < 0 && errno == ENOSYS) + if (cycle_fd < 0 && errno == ENOSYS) die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); else - BUG_ON(clock_fd < 0); + BUG_ON(cycle_fd < 0); } -static u64 get_clock(void) +static u64 get_cycle(void) { int ret; u64 clk; - ret = read(clock_fd, &clk, sizeof(u64)); + ret = read(cycle_fd, &clk, sizeof(u64)); BUG_ON(ret != sizeof(u64)); return clk; @@ -119,9 +119,9 @@ static void alloc_mem(void **dst, void **src, size_t length) die("memory allocation failed - maybe length is too large?\n"); } -static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault) +static u64 do_memcpy_cycle(memcpy_t fn, size_t len, bool prefault) { - u64 clock_start = 0ULL, clock_end = 0ULL; + u64 cycle_start = 0ULL, cycle_end = 0ULL; void *src = NULL, *dst = NULL; int i; @@ -130,14 +130,14 @@ static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault) if (prefault) fn(dst, src, len); - clock_start = get_clock(); + cycle_start = get_cycle(); for (i = 0; i < iterations; ++i) fn(dst, src, len); - clock_end = get_clock(); + cycle_end = get_cycle(); free(src); free(dst); - return clock_end - clock_start; + return cycle_end - cycle_start; } static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault) @@ -182,17 +182,17 @@ int bench_mem_memcpy(int argc, const char **argv, int i; size_t len; double result_bps[2]; - u64 result_clock[2]; + u64 result_cycle[2]; argc = parse_options(argc, argv, options, bench_mem_memcpy_usage, 0); - if (use_clock) - init_clock(); + if (use_cycle) + init_cycle(); len = (size_t)perf_atoll((char *)length_str); - result_clock[0] = result_clock[1] = 0ULL; + result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; if ((s64)len <= 0) { @@ -223,11 +223,11 @@ int bench_mem_memcpy(int argc, const char **argv, if (!only_prefault && !no_prefault) { /* show both of results */ - if (use_clock) { - result_clock[0] = - do_memcpy_clock(routines[i].fn, len, false); - result_clock[1] = - do_memcpy_clock(routines[i].fn, len, true); + if (use_cycle) { + result_cycle[0] = + do_memcpy_cycle(routines[i].fn, len, false); + result_cycle[1] = + do_memcpy_cycle(routines[i].fn, len, true); } else { result_bps[0] = 
do_memcpy_gettimeofday(routines[i].fn, @@ -237,9 +237,9 @@ int bench_mem_memcpy(int argc, const char **argv, len, true); } } else { - if (use_clock) { - result_clock[pf] = - do_memcpy_clock(routines[i].fn, + if (use_cycle) { + result_cycle[pf] = + do_memcpy_cycle(routines[i].fn, len, only_prefault); } else { result_bps[pf] = @@ -251,12 +251,12 @@ int bench_mem_memcpy(int argc, const char **argv, switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (!only_prefault && !no_prefault) { - if (use_clock) { - printf(" %14lf Clock/Byte\n", - (double)result_clock[0] + if (use_cycle) { + printf(" %14lf Cycle/Byte\n", + (double)result_cycle[0] / (double)len); - printf(" %14lf Clock/Byte (with prefault)\n", - (double)result_clock[1] + printf(" %14lf Cycle/Byte (with prefault)\n", + (double)result_cycle[1] / (double)len); } else { print_bps(result_bps[0]); @@ -265,9 +265,9 @@ int bench_mem_memcpy(int argc, const char **argv, printf(" (with prefault)\n"); } } else { - if (use_clock) { - printf(" %14lf Clock/Byte", - (double)result_clock[pf] + if (use_cycle) { + printf(" %14lf Cycle/Byte", + (double)result_cycle[pf] / (double)len); } else print_bps(result_bps[pf]); @@ -277,17 +277,17 @@ int bench_mem_memcpy(int argc, const char **argv, break; case BENCH_FORMAT_SIMPLE: if (!only_prefault && !no_prefault) { - if (use_clock) { + if (use_cycle) { printf("%lf %lf\n", - (double)result_clock[0] / (double)len, - (double)result_clock[1] / (double)len); + (double)result_cycle[0] / (double)len, + (double)result_cycle[1] / (double)len); } else { printf("%lf %lf\n", result_bps[0], result_bps[1]); } } else { - if (use_clock) { - printf("%lf\n", (double)result_clock[pf] + if (use_cycle) { + printf("%lf\n", (double)result_cycle[pf] / (double)len); } else printf("%lf\n", result_bps[pf]); diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c index bf0d5f552017..350cc9557265 100644 --- a/tools/perf/bench/mem-memset.c +++ b/tools/perf/bench/mem-memset.c @@ -24,8 +24,8 @@ static const char *length_str = "1MB"; static const char *routine = "default"; static int iterations = 1; -static bool use_clock; -static int clock_fd; +static bool use_cycle; +static int cycle_fd; static bool only_prefault; static bool no_prefault; @@ -37,7 +37,7 @@ static const struct option options[] = { "Specify routine to set"), OPT_INTEGER('i', "iterations", &iterations, "repeat memset() invocation this number of times"), - OPT_BOOLEAN('c', "clock", &use_clock, + OPT_BOOLEAN('c', "cycle", &use_cycle, "Use cycles event instead of gettimeofday() for measuring"), OPT_BOOLEAN('o', "only-prefault", &only_prefault, "Show only the result with page faults before memset()"), @@ -76,27 +76,27 @@ static const char * const bench_mem_memset_usage[] = { NULL }; -static struct perf_event_attr clock_attr = { +static struct perf_event_attr cycle_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }; -static void init_clock(void) +static void init_cycle(void) { - clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0); + cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); - if (clock_fd < 0 && errno == ENOSYS) + if (cycle_fd < 0 && errno == ENOSYS) die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); else - BUG_ON(clock_fd < 0); + BUG_ON(cycle_fd < 0); } -static u64 get_clock(void) +static u64 get_cycle(void) { int ret; u64 clk; - ret = read(clock_fd, &clk, sizeof(u64)); + ret = read(cycle_fd, &clk, sizeof(u64)); BUG_ON(ret != sizeof(u64)); return clk; @@ -115,9 +115,9 @@ static void 
alloc_mem(void **dst, size_t length) die("memory allocation failed - maybe length is too large?\n"); } -static u64 do_memset_clock(memset_t fn, size_t len, bool prefault) +static u64 do_memset_cycle(memset_t fn, size_t len, bool prefault) { - u64 clock_start = 0ULL, clock_end = 0ULL; + u64 cycle_start = 0ULL, cycle_end = 0ULL; void *dst = NULL; int i; @@ -126,13 +126,13 @@ static u64 do_memset_clock(memset_t fn, size_t len, bool prefault) if (prefault) fn(dst, -1, len); - clock_start = get_clock(); + cycle_start = get_cycle(); for (i = 0; i < iterations; ++i) fn(dst, i, len); - clock_end = get_clock(); + cycle_end = get_cycle(); free(dst); - return clock_end - clock_start; + return cycle_end - cycle_start; } static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault) @@ -176,17 +176,17 @@ int bench_mem_memset(int argc, const char **argv, int i; size_t len; double result_bps[2]; - u64 result_clock[2]; + u64 result_cycle[2]; argc = parse_options(argc, argv, options, bench_mem_memset_usage, 0); - if (use_clock) - init_clock(); + if (use_cycle) + init_cycle(); len = (size_t)perf_atoll((char *)length_str); - result_clock[0] = result_clock[1] = 0ULL; + result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; if ((s64)len <= 0) { @@ -217,11 +217,11 @@ int bench_mem_memset(int argc, const char **argv, if (!only_prefault && !no_prefault) { /* show both of results */ - if (use_clock) { - result_clock[0] = - do_memset_clock(routines[i].fn, len, false); - result_clock[1] = - do_memset_clock(routines[i].fn, len, true); + if (use_cycle) { + result_cycle[0] = + do_memset_cycle(routines[i].fn, len, false); + result_cycle[1] = + do_memset_cycle(routines[i].fn, len, true); } else { result_bps[0] = do_memset_gettimeofday(routines[i].fn, @@ -231,9 +231,9 @@ int bench_mem_memset(int argc, const char **argv, len, true); } } else { - if (use_clock) { - result_clock[pf] = - do_memset_clock(routines[i].fn, + if (use_cycle) { + result_cycle[pf] = + do_memset_cycle(routines[i].fn, len, only_prefault); } else { result_bps[pf] = @@ -245,12 +245,12 @@ int bench_mem_memset(int argc, const char **argv, switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (!only_prefault && !no_prefault) { - if (use_clock) { - printf(" %14lf Clock/Byte\n", - (double)result_clock[0] + if (use_cycle) { + printf(" %14lf Cycle/Byte\n", + (double)result_cycle[0] / (double)len); - printf(" %14lf Clock/Byte (with prefault)\n ", - (double)result_clock[1] + printf(" %14lf Cycle/Byte (with prefault)\n ", + (double)result_cycle[1] / (double)len); } else { print_bps(result_bps[0]); @@ -259,9 +259,9 @@ int bench_mem_memset(int argc, const char **argv, printf(" (with prefault)\n"); } } else { - if (use_clock) { - printf(" %14lf Clock/Byte", - (double)result_clock[pf] + if (use_cycle) { + printf(" %14lf Cycle/Byte", + (double)result_cycle[pf] / (double)len); } else print_bps(result_bps[pf]); @@ -271,17 +271,17 @@ int bench_mem_memset(int argc, const char **argv, break; case BENCH_FORMAT_SIMPLE: if (!only_prefault && !no_prefault) { - if (use_clock) { + if (use_cycle) { printf("%lf %lf\n", - (double)result_clock[0] / (double)len, - (double)result_clock[1] / (double)len); + (double)result_cycle[0] / (double)len, + (double)result_cycle[1] / (double)len); } else { printf("%lf %lf\n", result_bps[0], result_bps[1]); } } else { - if (use_clock) { - printf("%lf\n", (double)result_clock[pf] + if (use_cycle) { + printf("%lf\n", (double)result_cycle[pf] / (double)len); } else printf("%lf\n", result_bps[pf]); From 
cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 07:08:42 -0700 Subject: [PATCH 207/612] Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation" This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9. (Move PREEMPT_RCU preemption to switch_to() invocation). Testing by Sasha Levin showed that this can result in deadlock due to invoking the scheduler when one of the runqueue locks is held. Because this commit was simply a performance optimization, revert it. Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney Tested-by: Sasha Levin --- arch/um/drivers/mconsole_kern.c | 1 - include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 6 ++++++ include/linux/sched.h | 10 ---------- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 +++++++++++--- kernel/sched/core.c | 1 - 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 88e466b159dc..43b39d61b538 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,7 +705,6 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; - rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 26d1a47591f1..9cac722b169c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,7 +184,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); -extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 854dc4c5c271..4e56a9c69a35 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,10 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU +static inline void rcu_preempt_note_context_switch(void) +{ +} + static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; @@ -95,6 +99,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) #else /* #ifdef CONFIG_TINY_RCU */ +void rcu_preempt_note_context_switch(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) @@ -108,6 +113,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..06a4c5f4f55c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,22 +1871,12 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } -static inline void rcu_switch_from(struct task_struct *prev) -{ - if (prev->rcu_read_lock_nesting != 0) - rcu_preempt_note_context_switch(); -} - #else static inline void rcu_copy_process(struct task_struct *p) { } -static inline void rcu_switch_from(struct task_struct *prev) -{ -} - #endif #ifdef CONFIG_SMP diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); 
+ rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1001,6 +1001,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..eaead2df6aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); From f885b7f2b2de70be266d2cecc476f773a1e2ca5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Apr 2012 15:52:53 -0700 Subject: [PATCH 208/612] rcu: Control RCU_FANOUT_LEAF from boot-time parameter Although making RCU_FANOUT_LEAF a kernel configuration parameter rather than a fixed constant makes it easier for people to decrease cache-miss overhead for large systems, it is of little help for people who must run a single pre-built kernel binary. This commit therefore allows the value of RCU_FANOUT_LEAF to be increased (but not decreased!) via a boot-time parameter named rcutree.rcu_fanout_leaf. Reported-by: Mike Galbraith Signed-off-by: Paul E. 
McKenney --- Documentation/kernel-parameters.txt | 5 ++ kernel/rcutree.c | 97 ++++++++++++++++++++++++----- kernel/rcutree.h | 23 ++++--- kernel/rcutree_plugin.h | 2 + kernel/rcutree_trace.c | 2 +- 5 files changed, 104 insertions(+), 25 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a92c5ebf373e..12783fa833c3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2367,6 +2367,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set maximum number of finished RCU callbacks to process in one batch. + rcutree.fanout_leaf= [KNL,BOOT] + Increase the number of CPUs assigned to each + leaf rcu_node structure. Useful for very large + systems. + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..a4c592b66e10 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,17 +60,10 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. */ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -91,6 +84,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ + /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU @@ -2574,9 +2580,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2586,7 +2592,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2613,13 +2619,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. 
*/ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2649,7 +2657,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -2658,11 +2666,72 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = NR_CPUS; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. 
*/ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; +} + void __init rcu_init(void) { int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..780a0195d35a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -42,28 +42,28 @@ #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 -# define NUM_RCU_LVLS 1 +# define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_2 -# define NUM_RCU_LVLS 2 +# define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_3 -# define NUM_RCU_LVLS 3 +# define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_4 -# define NUM_RCU_LVLS 4 +# define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) @@ -76,6 +76,9 @@ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) +extern int rcu_num_lvls; +extern int rcu_num_nodes; + /* * Dynticks per-CPU state. */ @@ -206,7 +209,7 @@ struct rcu_node { */ #define rcu_for_each_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* * Do a breadth-first scan of the non-leaf rcu_node structures for the @@ -215,7 +218,7 @@ struct rcu_node { */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -224,8 +227,8 @@ struct rcu_node { * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ @@ -357,9 +360,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. 
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..fb92b6dd9980 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -70,6 +70,8 @@ static void __init rcu_bootup_announce_oddness(void) #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); #endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); } #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..a3556a2d842c 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; From cc5df65b0370fc6aa2bfe3bb19e0451d5cafb99f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:16:00 -0700 Subject: [PATCH 209/612] rcu: Four-level hierarchy is no longer experimental Time to make the four-level-hierarchy setting less scary, so this commit removes "Experimental" from the boot-time message. Leave the message in order to get a heads-up on any possible need to expand to a five-level hierarchy. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index fb92b6dd9980..70e0fd256cc6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,7 +68,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); + printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); From cca6f3931920a7547d02e68adc2ca635bea5600c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2012 21:00:28 -0700 Subject: [PATCH 210/612] rcu: Size rcu_node tree from nr_cpu_ids rather than NR_CPUS The rcu_node tree array is sized based on compile-time constants, including NR_CPUS. Although this approach has worked well in the past, the recent trend by many distros to define NR_CPUS=4096 results in excessive grace-period-initialization latencies. This commit therefore substitutes the run-time computed nr_cpu_ids for the compile-time NR_CPUS when building the tree. This can result in much of the compile-time-allocated rcu_node array being unused. If this is a major problem, you are in a specialized situation anyway, so you can manually adjust the NR_CPUS, RCU_FANOUT, and RCU_FANOUT_LEAF kernel config parameters. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree_plugin.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4c592b66e10..0fdbc5e07302 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2675,7 +2675,7 @@ static void __init rcu_init_geometry(void) { int i; int j; - int n = NR_CPUS; + int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 70e0fd256cc6..ef2b5231afa4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -72,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); } #ifdef CONFIG_TREE_PREEMPT_RCU From 6c90cc7bf077f28144013e949ee0c122012d194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:41:41 -0700 Subject: [PATCH 211/612] rcu: Prevent excessive line length in RCU_STATE_INITIALIZER() Upcoming rcu_barrier() concurrency commits will result in line lengths greater than 80 characters in the RCU_STATE_INITIALIZER(), so this commit shortens the name of the macro's argument to prevent this. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0fdbc5e07302..dd7fd96c90c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,18 +62,18 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ +#define RCU_STATE_INITIALIZER(sname) { \ + .level = { &sname##_state.node[0] }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ - .name = #structname, \ + .name = #sname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); From 037b64ed0bf2405a1a01542164d3418564b44fff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:26:01 -0700 Subject: [PATCH 212/612] rcu: Place pointer to call_rcu() in rcu_data structure This is a preparatory commit for increasing rcu_barrier()'s concurrency. It adds a pointer in the rcu_data structure to the corresponding call_rcu() function. This allows a pointer to the rcu_data structure to imply the function pointer, which allows _rcu_barrier() state to be placed in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 27 ++++++++++++--------------- kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 5 +++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd7fd96c90c5..00c518fa34bb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,8 +62,9 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname) { \ +#define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -76,10 +77,11 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -2282,21 +2284,17 @@ static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + rsp->call(head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; @@ -2348,8 +2346,7 @@ static void _rcu_barrier(struct rcu_state *rsp, while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { preempt_enable(); @@ -2370,7 +2367,7 @@ static void _rcu_barrier(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rsp->onofflock, flags); atomic_inc(&rcu_barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rsp->call(&rh, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2393,7 +2390,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2402,7 +2399,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 780a0195d35a..049896a835d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -364,6 +364,8 @@ struct rcu_state { u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. 
*/ + void (*func)(struct rcu_head *head)); /* The following fields are guarded by the root rcu_node's lock. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ef2b5231afa4..9cb3a68819fa 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,8 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); +struct rcu_state rcu_preempt_state = + RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -944,7 +945,7 @@ static int rcu_preempt_cpu_has_callbacks(int cpu) */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state, call_rcu); + _rcu_barrier(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_barrier); From 06668efa9180f4824fe846a8ff96338c18646bc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:57:46 -0700 Subject: [PATCH 213/612] rcu: Move _rcu_barrier()'s rcu_head structures to rcu_data structures In order for multiple flavors of RCU to each concurrently run one rcu_barrier(), each flavor needs its own per-CPU set of rcu_head structures. This commit therefore moves _rcu_barrier()'s set of per-CPU rcu_head structures from per-CPU variables to the existing per-CPU and per-RCU-flavor rcu_data structures. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 ++---- kernel/rcutree.h | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 00c518fa34bb..1e552598b55d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2282,12 +2281,11 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); atomic_inc(&rcu_barrier_cpu_count); - rsp->call(head, rcu_barrier_callback); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 049896a835d9..586d93c978f2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -314,6 +314,9 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; + /* 6) _rcu_barrier() callback. */ + struct rcu_head barrier_head; + int cpu; struct rcu_state *rsp; }; From 24ebbca8ecdd5129d7f829a7cb5146aaeb531f77 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 00:34:56 -0700 Subject: [PATCH 214/612] rcu: Move rcu_barrier_cpu_count to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_cpu_count global variable to a new ->barrier_cpu_count field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++----------- kernel/rcutree.h | 1 + 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1e552598b55d..5929b021666d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2270,9 +2269,12 @@ static int rcu_cpu_has_callbacks(int cpu) * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). */ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); } @@ -2284,7 +2286,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2297,9 +2299,9 @@ static void _rcu_barrier(struct rcu_state *rsp) int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); @@ -2324,7 +2326,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * us -- but before CPU 1's orphaned callbacks are invoked!!! */ init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2363,15 +2365,16 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rsp->call(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -2380,7 +2383,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 586d93c978f2..c57ef0b7f097 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. 
*/ unsigned long jiffies_force_qs; /* Time at which to invoke */ From 7db74df88b52844f4e966901e2972bba725e6766 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 03:03:37 -0700 Subject: [PATCH 215/612] rcu: Move rcu_barrier_completion to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_completion global variable to a new ->barrier_completion field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 9 ++++----- kernel/rcutree.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5929b021666d..ca7d1678ac79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -158,7 +158,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -2275,7 +2274,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); } /* @@ -2325,7 +2324,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! */ - init_completion(&rcu_barrier_completion); + init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; @@ -2375,10 +2374,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * CPU, and thus each counted, remove the initial count. */ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c57ef0b7f097..d1ca4424122b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -401,6 +401,7 @@ struct rcu_state { /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ From 7be7f0be907224445acc62b3884c892f38b7ff40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 05:18:53 -0700 Subject: [PATCH 216/612] rcu: Move rcu_barrier_mutex to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_mutex global variable to a new ->barrier_mutex field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 9 +++------ kernel/rcutree.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ca7d1678ac79..ff992ac24e73 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -71,6 +71,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -155,10 +156,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_MUTEX(rcu_barrier_mutex); - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -2303,7 +2300,7 @@ static void _rcu_barrier(struct rcu_state *rsp) init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); smp_mb(); /* Prevent any prior operations from leaking in. */ @@ -2380,7 +2377,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); destroy_rcu_head_on_stack(&rd.barrier_head); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d1ca4424122b..7641aec3e59c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ From cfed0a85dad921c683e9c0d25b072bcc5745ede0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:22:05 -0700 Subject: [PATCH 217/612] rcu: Remove needless initialization For global variables, C defaults all fields to zero. The initialization of the rcu_state structure's ->n_force_qs and ->n_force_qs_ngp fields is therefore redundant, so this commit removes these initializations. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ff992ac24e73..44a8fda9be86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -73,8 +73,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ .name = #sname, \ } From cf3a9c4842b1e097dbe0854933c471d43dd24f69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 14:56:46 -0700 Subject: [PATCH 218/612] rcu: Increase rcu_barrier() concurrency The traditional rcu_barrier() implementation has serialized all requests, regardless of RCU flavor, and also does not coalesce concurrent requests. 
In the past, this has been good and sufficient. However, systems are getting larger and use of rcu_barrier() has been increasing. This commit therefore introduces a counter-based scheme that allows _rcu_barrier() calls for the same flavor of RCU to take advantage of each others' work. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 36 +++++++++++++++++++++++++++++++++++- kernel/rcutree.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 44a8fda9be86..6bb5d562253f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2294,13 +2294,41 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } + + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2371,6 +2399,12 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + smp_mb(); /* Keep increment before caller's subsequent code. */ + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7641aec3e59c..be10286ad380 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -403,6 +403,8 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ From a83eff0a82a7f3f14fea477fd41e6c082e7fc96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 May 2012 18:47:05 -0700 Subject: [PATCH 219/612] rcu: Add tracing for _rcu_barrier() This commit adds event tracing for _rcu_barrier() execution. This is defined only if RCU_TRACE=y. 
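As a rough illustration of the piggybacking check that the "Check" and "EarlyExit" events instrument, the sketch below is stand-alone user-space C, not kernel code; ULONG_CMP_GE() is re-declared locally as a stand-in for the kernel macro, and piggyback_done() is an invented name for the comparison performed in _rcu_barrier():

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

/*
 * _rcu_barrier() drives ->n_barrier_done from even to odd at the start
 * of a barrier and back to even at the end.  A caller that snapshotted
 * "snap" may therefore skip its own barrier once the counter has moved
 * past the snapshot rounded up to even, plus two (one complete barrier).
 */
static int piggyback_done(unsigned long snap, unsigned long snap_done)
{
	return ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1UL) + 2);
}

int main(void)
{
	printf("%d\n", piggyback_done(4, 6));	/* 1: a full barrier ran for us */
	printf("%d\n", piggyback_done(4, 5));	/* 0: that barrier is still in flight */
	return 0;
}

The odd/even convention is what lets a late arrival distinguish a barrier that began after its snapshot (and thus covers its callbacks) from one that was already in flight.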
Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 45 ++++++++++++++++++++++++++++++++++++++ kernel/rcutree.c | 29 +++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d274734b2aa4..5bde94d8585b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -541,6 +541,50 @@ TRACE_EVENT(rcu_torture_read, __entry->rcutorturename, __entry->rhp) ); +/* + * Tracepoint for _rcu_barrier() execution. The string "s" describes + * the _rcu_barrier phase: + * "Begin": rcu_barrier_callback() started. + * "Check": rcu_barrier_callback() checking for piggybacking. + * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. + * "Inc1": rcu_barrier_callback() piggyback check counter incremented. + * "Offline": rcu_barrier_callback() found offline CPU + * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. + * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. + * "CB": An rcu_barrier_callback() invoked a callback, not the last. + * "LastCB": An rcu_barrier_callback() invoked the last callback. + * "Inc2": rcu_barrier_callback() piggyback check counter incremented. + * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument + * is the count of remaining callbacks, and "done" is the piggybacking count. + */ +TRACE_EVENT(rcu_barrier, + + TP_PROTO(char *rcuname, char *s, int cpu, int cnt, unsigned long done), + + TP_ARGS(rcuname, s, cpu, cnt, done), + + TP_STRUCT__entry( + __field(char *, rcuname) + __field(char *, s) + __field(int, cpu) + __field(int, cnt) + __field(unsigned long, done) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->s = s; + __entry->cpu = cpu; + __entry->cnt = cnt; + __entry->done = done; + ), + + TP_printk("%s %s cpu %d remaining %d # %lu", + __entry->rcuname, __entry->s, __entry->cpu, __entry->cnt, + __entry->done) +); + #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) @@ -564,6 +608,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define trace_rcu_barrier(name, s, cpu, cnt, done) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6bb5d562253f..dda43d826504 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2259,6 +2259,17 @@ static int rcu_cpu_has_callbacks(int cpu) rcu_preempt_cpu_has_callbacks(cpu); } +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); +} + /* * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). 
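That wakeup is safe only because _rcu_barrier() primes the count to one before posting any callbacks. A stand-alone C11 sketch of the pattern (a hypothetical user-space model, with barrier_callback() standing in for rcu_barrier_callback()) shows why the completion cannot fire early:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int barrier_cpu_count;

static void barrier_callback(void)	/* models rcu_barrier_callback() */
{
	if (atomic_fetch_sub(&barrier_cpu_count, 1) == 1)
		printf("last callback: wake the waiter\n");
}

int main(void)
{
	atomic_store(&barrier_cpu_count, 1);		/* initial self-reference */
	for (int cpu = 0; cpu < 4; cpu++)		/* post one callback per CPU */
		atomic_fetch_add(&barrier_cpu_count, 1);
	if (atomic_fetch_sub(&barrier_cpu_count, 1) == 1)	/* drop self-reference */
		printf("callbacks already finished\n");
	for (int cpu = 0; cpu < 4; cpu++)
		barrier_callback();			/* the fourth one reports "last" */
	return 0;
}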
@@ -2268,8 +2279,12 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); struct rcu_state *rsp = rdp->rsp; - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2280,6 +2295,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2298,6 +2314,7 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); @@ -2315,7 +2332,9 @@ static void _rcu_barrier(struct rcu_state *rsp) * value up to the next even number and adds two before comparing. */ snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -2328,6 +2347,7 @@ static void _rcu_barrier(struct rcu_state *rsp) */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* @@ -2364,13 +2384,19 @@ static void _rcu_barrier(struct rcu_state *rsp) preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2403,6 +2429,7 @@ static void _rcu_barrier(struct rcu_state *rsp) smp_mb(); /* Keep increment after above mechanism. */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ From d7e187c8e9f30543f9cadfed094896ff414acb8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2012 11:49:15 -0700 Subject: [PATCH 220/612] rcu: Add rcu_barrier() statistics to debugfs tracing This commit adds an rcubarrier file to RCU's debugfs statistical tracing directory, providing diagnostic information on rcu_barrier(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_trace.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a3556a2d842c..057408be6c3b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,40 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) +{ + seq_printf(m, "%c bcc: %d nbd: %lu\n", + rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); +} + +static int show_rcubarrier(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt: "); + print_rcubarrier(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched: "); + print_rcubarrier(m, &rcu_sched_state); + seq_puts(m, "rcu_bh: "); + print_rcubarrier(m, &rcu_bh_state); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, NULL); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -453,6 +487,11 @@ static int __init rcutree_trace_init(void) if (!rcudir) goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, rcudir, + NULL, &rcubarrier_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); if (!retval) From 1bca8cf1a2c3c6683b12ad28a3e826ca7a834978 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 09:40:38 -0700 Subject: [PATCH 221/612] rcu: Remove unneeded __rcu_process_callbacks() argument With the advent of __this_cpu_ptr(), it is no longer necessary to pass both the rcu_state and rcu_data structures into __rcu_process_callbacks(). This commit therefore computes the rcu_data pointer from the rcu_state pointer within __rcu_process_callbacks() so that callers can pass in only the pointer to the rcu_state structure. This paves the way for linking the rcu_state structures together and iterating over them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- kernel/rcutree_plugin.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dda43d826504..5376a156be8a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1788,9 +1788,10 @@ unlock_fqs_ret: * whom the rdp belongs. 
*/ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1827,9 +1828,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_process_callbacks(struct softirq_action *unused) { trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + __rcu_process_callbacks(&rcu_sched_state); + __rcu_process_callbacks(&rcu_bh_state); rcu_preempt_process_callbacks(); trace_rcu_utilization("End RCU core"); } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9cb3a68819fa..5a80cdd9a0a3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -687,8 +687,7 @@ static void rcu_preempt_check_callbacks(int cpu) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_state, - &__get_cpu_var(rcu_preempt_data)); + __rcu_process_callbacks(&rcu_preempt_state); } #ifdef CONFIG_RCU_BOOST From 6ce75a2326e6f8b3bdfb60e1de7934b89858e87b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 11:01:13 -0700 Subject: [PATCH 222/612] rcu: Introduce for_each_rcu_flavor() and use it The arrival of TREE_PREEMPT_RCU some years back included some ugly code involving either #ifdef or #ifdef'ed wrapper functions to iterate over all non-SRCU flavors of RCU. This commit therefore introduces a for_each_rcu_flavor() iterator over the rcu_state structures for each flavor of RCU to clean up a bit of the ugliness. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 53 ++++++++++-------- kernel/rcutree.h | 12 ++--- kernel/rcutree_plugin.h | 116 ---------------------------------------- 3 files changed, 37 insertions(+), 144 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5376a156be8a..b61c3ffc80e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,6 +84,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. 
*/ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; @@ -860,9 +861,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -1827,10 +1829,11 @@ __rcu_process_callbacks(struct rcu_state *rsp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state); - __rcu_process_callbacks(&rcu_bh_state); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -2241,9 +2244,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2253,10 +2259,13 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; } /* @@ -2551,9 +2560,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2565,6 +2576,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2589,18 +2601,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
*/ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2717,6 +2727,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index be10286ad380..b92c4550a6e6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -422,8 +422,13 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + struct list_head flavors; /* List of RCU flavors. */ }; +extern struct list_head rcu_struct_flavors; +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + /* Return values for rcu_preempt_offline_tasks(). */ #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ @@ -466,25 +471,18 @@ static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_cpu_has_callbacks(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5a80cdd9a0a3..d18b4d383afe 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -544,16 +544,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -/* - * Suppress preemptible RCU's CPU stall warnings by pushing the - * time of the next stall-warning message comfortably far into the - * future. - */ -static void rcu_preempt_stall_reset(void) -{ - rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; -} - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -654,14 +644,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Do CPU-offline processing for preemptible RCU. 
- */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ - rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); -} - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -682,14 +664,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_state); -} - #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void) @@ -921,24 +895,6 @@ mb_ret: } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -/* - * Check to see if there is any immediate preemptible-RCU-related work - * to be done. - */ -static int rcu_preempt_pending(int cpu) -{ - return __rcu_pending(&rcu_preempt_state, - &per_cpu(rcu_preempt_data, cpu)); -} - -/* - * Does preemptible RCU have callbacks on this CPU? - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return !!per_cpu(rcu_preempt_data, cpu).nxtlist; -} - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. */ @@ -948,23 +904,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Initialize preemptible RCU's per-CPU data. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); -} - -/* - * Move preemptible RCU's callbacks from dying CPU to other online CPU - * and record a quiescent state. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ - rcu_cleanup_dying_cpu(&rcu_preempt_state); -} - /* * Initialize preemptible RCU's state structures. */ @@ -1049,14 +988,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } -/* - * Because preemptible RCU does not exist, there is no need to suppress - * its CPU stall warnings. - */ -static void rcu_preempt_stall_reset(void) -{ -} - /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for @@ -1084,14 +1015,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never needs CPU-offline - * processing. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1100,14 +1023,6 @@ static void rcu_preempt_check_callbacks(int cpu) { } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - /* * Queue an RCU callback for lazy invocation after a grace period. * This will likely be later named something like "call_rcu_lazy()", @@ -1148,22 +1063,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never has any work to do. - */ -static int rcu_preempt_pending(int cpu) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, it never has callbacks - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return 0; -} - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). @@ -1174,21 +1073,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Because preemptible RCU does not exist, there is no per-CPU - * data to initialize. 
- */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ -} - -/* - * Because there is no preemptible RCU, there is no cleanup to do. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ -} - /* * Because preemptible RCU does not exist, it need not be initialized. */ From c0cc962da3e7770feb3665f087ea3e23d8c15479 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 12:25:39 -0700 Subject: [PATCH 223/612] rcu: Use for_each_rcu_flavor() in TREE_RCU tracing This commit applies the new for_each_rcu_flavor() macro to the kernel/rcutree_trace.c file. Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 114 +++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 72 deletions(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 057408be6c3b..a16ddbd6fdc4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,24 +46,15 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) -{ - seq_printf(m, "%c bcc: %d nbd: %lu\n", - rsp->rcu_barrier_in_progress ? 'B' : '.', - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); -} - static int show_rcubarrier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt: "); - print_rcubarrier(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched: "); - print_rcubarrier(m, &rcu_sched_state); - seq_puts(m, "rcu_bh: "); - print_rcubarrier(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", + rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); return 0; } @@ -129,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - static int show_rcudata(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + int cpu; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -200,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { + int cpu; + struct rcu_state *rsp; + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); @@ -207,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - 
seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + for_each_rcu_flavor(rsp) { + seq_printf(m, "\"%s:\"\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -304,9 +287,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->completed, gpnum, rsp->fqs_state, + rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, @@ -329,14 +312,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + print_one_rcu_state(m, rsp); return 0; } @@ -377,11 +356,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) static int show_rcugp(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + show_one_rcugp(m, rsp); return 0; } @@ -416,28 +394,20 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_need_nothing); } -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; struct rcu_data *rdp; + struct rcu_state *rsp; - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); return 0; } From ff015030c939f0bec68fa9b8898da3aaa7fe55ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 13:16:58 -0700 Subject: [PATCH 224/612] rcu: RCU_SAVE_DYNTICK code no longer ever dead Before RCU had unified idle, the RCU_SAVE_DYNTICK leg of the switch statement in force_quiescent_state() was dead code for CONFIG_NO_HZ=n kernel builds. With unified idle, the code is never dead. This commit therefore removes the "if" statement designed to make gcc aware of when the code was and was not dead. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b61c3ffc80e9..967b4bed2cf3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1747,8 +1747,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ From 74d874e7bdc5b09cedcc7da0de44db25888eea10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2012 13:43:30 -0700 Subject: [PATCH 225/612] rcu: Update documentation to cover call_srcu() and srcu_barrier(). The advent of call_srcu() and srcu_barrier() obsoleted some of the documentation, so this commit brings that up to date. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 39 ++++++++++++++++---------------- Documentation/RCU/rcubarrier.txt | 15 ++++-------- Documentation/RCU/torture.txt | 9 ++++++++ Documentation/RCU/whatisRCU.txt | 6 ++--- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 5c8d74968090..fc103d7a0474 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -162,9 +162,9 @@ over a rather long period of time, but improvements are always welcome! when publicizing a pointer to a structure that can be traversed by an RCU read-side critical section. -5. If call_rcu(), or a related primitive such as call_rcu_bh() or - call_rcu_sched(), is used, the callback function must be - written to be called from softirq context. In particular, +5. If call_rcu(), or a related primitive such as call_rcu_bh(), + call_rcu_sched(), or call_srcu() is used, the callback function + must be written to be called from softirq context. In particular, it cannot block. 6. Since synchronize_rcu() can block, it cannot be called from @@ -202,11 +202,12 @@ over a rather long period of time, but improvements are always welcome! updater uses call_rcu_sched() or synchronize_sched(), then the corresponding readers must disable preemption, possibly by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). - If the updater uses synchronize_srcu(), the the corresponding - readers must use srcu_read_lock() and srcu_read_unlock(), - and with the same srcu_struct. The rules for the expedited - primitives are the same as for their non-expedited counterparts. - Mixing things up will result in confusion and broken kernels. + If the updater uses synchronize_srcu() or call_srcu(), + the the corresponding readers must use srcu_read_lock() and + srcu_read_unlock(), and with the same srcu_struct. The rules for + the expedited primitives are the same as for their non-expedited + counterparts. Mixing things up will result in confusion and + broken kernels. One exception to this rule: rcu_read_lock() and rcu_read_unlock() may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() @@ -333,14 +334,14 @@ over a rather long period of time, but improvements are always welcome! victim CPU from ever going offline.) 14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(), - synchronize_srcu(), and synchronize_srcu_expedited()) may only - be invoked from process context. 
Unlike other forms of RCU, it - -is- permissible to block in an SRCU read-side critical section - (demarked by srcu_read_lock() and srcu_read_unlock()), hence the - "SRCU": "sleepable RCU". Please note that if you don't need - to sleep in read-side critical sections, you should be using - RCU rather than SRCU, because RCU is almost always faster and - easier to use than is SRCU. + synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu()) + may only be invoked from process context. Unlike other forms of + RCU, it -is- permissible to block in an SRCU read-side critical + section (demarked by srcu_read_lock() and srcu_read_unlock()), + hence the "SRCU": "sleepable RCU". Please note that if you + don't need to sleep in read-side critical sections, you should be + using RCU rather than SRCU, because RCU is almost always faster + and easier to use than is SRCU. If you need to enter your read-side critical section in a hardirq or exception handler, and then exit that same read-side @@ -353,8 +354,8 @@ over a rather long period of time, but improvements are always welcome! cleanup_srcu_struct(). These are passed a "struct srcu_struct" that defines the scope of a given SRCU domain. Once initialized, the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() - synchronize_srcu(), and synchronize_srcu_expedited(). A given - synchronize_srcu() waits only for SRCU read-side critical + synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). + A given synchronize_srcu() waits only for SRCU read-side critical sections governed by srcu_read_lock() and srcu_read_unlock() calls that have been passed the same srcu_struct. This property is what makes sleeping read-side critical sections tolerable -- @@ -374,7 +375,7 @@ over a rather long period of time, but improvements are always welcome! requiring SRCU's read-side deadlock immunity or low read-side realtime latency. - Note that, rcu_assign_pointer() relates to SRCU just as they do + Note that, rcu_assign_pointer() relates to SRCU just as it does to other forms of RCU. 15. The whole point of call_rcu(), synchronize_rcu(), and friends diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index e439a0edee22..38428c125135 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt @@ -79,8 +79,6 @@ complete. Pseudo-code using rcu_barrier() is as follows: 2. Execute rcu_barrier(). 3. Allow the module to be unloaded. -Quick Quiz #1: Why is there no srcu_barrier()? - The rcutorture module makes use of rcu_barrier in its exit function as follows: @@ -162,7 +160,7 @@ for any pre-existing callbacks to complete. Then lines 55-62 print status and do operation-specific cleanup, and then return, permitting the module-unload operation to be completed. -Quick Quiz #2: Is there any other situation where rcu_barrier() might +Quick Quiz #1: Is there any other situation where rcu_barrier() might be required? Your module might have additional complications. For example, if your @@ -242,7 +240,7 @@ reaches zero, as follows: 4 complete(&rcu_barrier_completion); 5 } -Quick Quiz #3: What happens if CPU 0's rcu_barrier_func() executes +Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes immediately (thus incrementing rcu_barrier_cpu_count to the value one), but the other CPU's rcu_barrier_func() invocations are delayed for a full grace period? Couldn't this result in @@ -259,12 +257,7 @@ so that your module may be safely unloaded. 
Answers to Quick Quizzes -Quick Quiz #1: Why is there no srcu_barrier()? - -Answer: Since there is no call_srcu(), there can be no outstanding SRCU - callbacks. Therefore, there is no need to wait for them. - -Quick Quiz #2: Is there any other situation where rcu_barrier() might +Quick Quiz #1: Is there any other situation where rcu_barrier() might be required? Answer: Interestingly enough, rcu_barrier() was not originally @@ -278,7 +271,7 @@ Answer: Interestingly enough, rcu_barrier() was not originally implementing rcutorture, and found that rcu_barrier() solves this problem as well. -Quick Quiz #3: What happens if CPU 0's rcu_barrier_func() executes +Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes immediately (thus incrementing rcu_barrier_cpu_count to the value one), but the other CPU's rcu_barrier_func() invocations are delayed for a full grace period? Couldn't this result in diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 4ddf3913fd8c..7dce8a17eac2 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -174,11 +174,20 @@ torture_type The type of RCU to test, with string values as follows: and synchronize_rcu_bh_expedited(). "srcu": srcu_read_lock(), srcu_read_unlock() and + call_srcu(). + + "srcu_sync": srcu_read_lock(), srcu_read_unlock() and synchronize_srcu(). "srcu_expedited": srcu_read_lock(), srcu_read_unlock() and synchronize_srcu_expedited(). + "srcu_raw": srcu_read_lock_raw(), srcu_read_unlock_raw(), + and call_srcu(). + + "srcu_raw_sync": srcu_read_lock_raw(), srcu_read_unlock_raw(), + and synchronize_srcu(). + "sched": preempt_disable(), preempt_enable(), and call_rcu_sched(). diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 6bbe8dcdc3da..69ee188515e7 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -833,9 +833,9 @@ sched: Critical sections Grace period Barrier SRCU: Critical sections Grace period Barrier - srcu_read_lock synchronize_srcu N/A - srcu_read_unlock synchronize_srcu_expedited - srcu_read_lock_raw + srcu_read_lock synchronize_srcu srcu_barrier + srcu_read_unlock call_srcu + srcu_read_lock_raw synchronize_srcu_expedited srcu_read_unlock_raw srcu_dereference From 751a68b2e46fe11a42450cb55a16e8065eddec7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2012 13:44:44 -0700 Subject: [PATCH 226/612] rcu: Rationalize ordering of torture_ops list Move the raw SRCU interfaces out of the middle of the normal SRCU interfaces. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..9850479f319d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1908,8 +1908,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, - &srcu_raw_sync_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, + &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); From e3f8d3788ed7cf55946030dc9b76e73edb111602 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 8 May 2012 10:21:50 -0700 Subject: [PATCH 227/612] rcu: Test srcu_barrier() from rcutorture test suite SRCU now has a call_srcu() and an srcu_barrier(), but rcutorture does not test them. This commit adds the machinery to allow rcutorture's existing tests for call_rcu() and rcu_barrier() to apply to the SRCU equivalents. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9850479f319d..7b6935e0cee3 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -635,6 +635,17 @@ static void srcu_torture_synchronize(void) synchronize_srcu(&srcu_ctl); } +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -661,8 +672,8 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; From c6ebcbb60c8c68a88160fe54302e851700d1362c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:21:41 -0700 Subject: [PATCH 228/612] rcu: Fix bug in rcu_barrier() torture test The child threads in the rcu_torture_barrier_cbs() are improperly synchronized, which can cause the rcu_barrier() tests to hang. The failure mode is as follows: 1. CPU 0 running in rcu_torture_barrier() sets barrier_cbs_count to n_barrier_cbs. 2. CPU 1 running in rcu_torture_barrier_cbs() wakes up, posts its RCU callback, and atomically decrements barrier_cbs_count. Because barrier_cbs_count is not zero, it does not do the wake_up(). 3. CPU 2 running in rcu_torture_barrier_cbs() wakes up, but finds that barrier_cbs_count is not equal to n_barrier_cbs, and so returns to sleep. 4. The value of barrier_cbs_count therefore never reaches zero, which causes the test to hang. This commit therefore uses a phase variable to coordinate the test, preventing this scenario from occurring. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7b6935e0cee3..f7fe73e59c9f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -206,6 +206,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. 
*/ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); @@ -1642,6 +1643,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; + bool lastphase = 0; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1649,9 +1651,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - atomic_read(&barrier_cbs_count) == n_barrier_cbs || + barrier_phase != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); @@ -1676,7 +1680,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - /* wake_up() path contains the required barriers. */ + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, From 143aa672f4fc643420c8325ad09c379ed33a27cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 May 2012 18:17:48 -0700 Subject: [PATCH 229/612] rcu: Fix diagnostic-printk typo in rcutorture The rcu_torture_barrier() function has a copy-and-paste typo in the string passed to rcutorture_shutdown_absorb(), which this commit fixes. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f7fe73e59c9f..045a3dc233ea 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1700,7 +1700,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; From 72472a02a9c4507ef54d03d71bb253c26015f52c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 17:50:51 -0700 Subject: [PATCH 230/612] rcu: Make rcutorture fakewriters invoke rcu_barrier() The current rcutorture rcu_barrier() testing never intentionally runs more than one instance of rcu_barrier() at a given time. This fails to test the shiny new concurrency features of rcu_barrier(). This commit therefore modifies the rcutorture fakewriter kthread to randomly invoke rcu_barrier() rather than the usual synchronize_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E.
McKenney --- kernel/rcutorture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 045a3dc233ea..c279ee920947 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1025,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) + cur_ops->cb_barrier(); + else + cur_ops->sync(); rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); From 285fe29481d865ae381ad3924c80894e6968c2d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 08:45:12 -0700 Subject: [PATCH 231/612] rcu: Fix detection of abruptly-ending stall The code that attempts to identify stalls that end just as we detect them is broken by both flavors of initialization failure. This commit therefore properly initializes and computes the count of the number of reasons why the RCU grace period is stalled. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..dc8c5284fe06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -733,7 +733,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; - int ndetected; + int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -774,7 +774,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) */ rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected = rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); From 3f5d3ea64f1783f0d4ea0d35890ae3297f045a8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:39:56 -0700 Subject: [PATCH 232/612] rcu: Consolidate duplicate callback-list initialization There are a couple of open-coded initializations of the rcu_data structure's RCU callback list. This commit therefore consolidates them into a new init_callback_list() function. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dc8c5284fe06..81e0394e46af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) return ret; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1328,8 +1340,6 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; - /* * Orphan the callbacks. First adjust the counts. 
This is safe * because ->onofflock excludes _rcu_barrier()'s adoption of @@ -1369,9 +1379,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, } /* Finally, initialize the rcu_data structure's list to empty. */ - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); } /* @@ -2407,16 +2415,13 @@ static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); From 1d1fb395f6dbc07b36285bbedcf01a73b57f7cb5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:44:42 -0700 Subject: [PATCH 233/612] rcu: Add ACCESS_ONCE() to ->qlen accesses The _rcu_barrier() function accesses other CPUs' rcu_data structure's ->qlen field without benefit of locking. This commit therefore adds the required ACCESS_ONCE() wrappers around accesses and updates that need it. ACCESS_ONCE() is not needed when a CPU accesses its own ->qlen, or in code that cannot run while _rcu_barrier() is sampling ->qlen fields. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81e0394e46af..89addada3e3a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1350,7 +1350,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; } /* @@ -1600,7 +1600,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; + ACCESS_ONCE(rdp->qlen) -= count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -1889,7 +1889,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - rdp->qlen++; + ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; else @@ -2423,7 +2423,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); init_callback_list(rdp); rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); From 172708d002e0a2aca032b04fe6f2b8525c29244a Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 16 May 2012 15:23:45 -0700 Subject: [PATCH 234/612] rcu: Add a gcc-style structure initializer for RCU pointers RCU_INIT_POINTER() returns a value that is never used, and which should be abolished due to terminal ugliness: q = RCU_INIT_POINTER(global_p, p); However, there are two uses that cannot be handled by a do-while formulation because they do gcc-style initialization: RCU_INIT_POINTER(.real_cred, &init_cred), RCU_INIT_POINTER(.cred, &init_cred), This usage is clever, but not necessarily the nicest approach. This commit therefore creates an RCU_POINTER_INITIALIZER() macro that is specifically designed for gcc-style initialization. Signed-off-by: Paul E. McKenney Acked-by: David Howells --- include/linux/rcupdate.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9cac722b169c..ffe24c09e53d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -905,6 +905,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_INIT_POINTER(p, v) \ p = (typeof(*v) __force __rcu *)(v) +/** + * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer + * + * GCC-style initialization for an RCU-protected pointer in a structure field. + */ +#define RCU_POINTER_INITIALIZER(p, v) \ + .p = (typeof(*v) __force __rcu *)(v) + static __always_inline bool __is_kfree_rcu_offset(unsigned long offset) { return offset < 4096; From d36cc701b28983ea2c8d80afe68aae5452aea3bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2012 15:33:15 -0700 Subject: [PATCH 235/612] rcu: Use new RCU_POINTER_INITIALIZER for gcc-style initializations This commit applies the INIT_RCU_POINTER() macro to all uses of RCU_POINTER_INITIALIZER() that were all too cleverly creating gcc-style initializations. Signed-off-by: Paul E. McKenney Acked-by: David Howells --- include/linux/init_task.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9e65eff6af3b..8a7476186990 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -168,8 +168,8 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - RCU_INIT_POINTER(.real_cred, &init_cred), \ - RCU_INIT_POINTER(.cred, &init_cred), \ + RCU_POINTER_INITIALIZER(real_cred, &init_cred), \ + RCU_POINTER_INITIALIZER(cred, &init_cred), \ .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ From d1b88eb9e3bccaa43fb5d1bde1cbe210b3434731 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2012 15:42:30 -0700 Subject: [PATCH 236/612] rcu: Remove return value from RCU_INIT_POINTER() The return value from RCU_INIT_POINTER() is not used, and using it would be quite ugly, for example: q = RCU_INIT_POINTER(global_p, p); To prevent this sort of ugliness from appearing, this commit wraps RCU_INIT_POINTER() in a do-while loop. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: David Howells --- include/linux/rcupdate.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ffe24c09e53d..abf44d89c6ce 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -903,7 +903,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * the reader-accessible portions of the linked structure. 
*/ #define RCU_INIT_POINTER(p, v) \ - p = (typeof(*v) __force __rcu *)(v) + do { \ + p = (typeof(*v) __force __rcu *)(v); \ + } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer From e5c1f444d28b1a9eaf9c3927041db0414f684ef4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2012 16:31:38 -0700 Subject: [PATCH 237/612] key: Remove extraneous parentheses from rcu_assign_keypointer() This commit removes the extraneous parentheses from rcu_assign_keypointer() so that rcu_assign_pointer() can be wrapped in do-while. It also wraps rcu_assign_keypointer() in a do-while and parenthesizes its final argument, as suggested by David Howells. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/key.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/key.h b/include/linux/key.h index 4cd22ed627ef..cef3b315ba7c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -303,7 +303,9 @@ static inline bool key_is_instantiated(const struct key *key) rwsem_is_locked(&((struct key *)(KEY))->sem))) #define rcu_assign_keypointer(KEY, PAYLOAD) \ - (rcu_assign_pointer((KEY)->payload.rcudata, PAYLOAD)) +do { \ + rcu_assign_pointer((KEY)->payload.rcudata, (PAYLOAD)); \ +} while (0) #ifdef CONFIG_SYSCTL extern ctl_table key_sysctls[]; From e9023c4061054cbf59c5288068118a4d0b152f01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2012 15:51:08 -0700 Subject: [PATCH 238/612] rcu: Remove return value from rcu_assign_pointer() The return value from rcu_assign_pointer() is not used, and using it would be quite ugly, for example: q = rcu_assign_pointer(global_p, p); To prevent this sort of ugliness from spreading, this commit wraps rcu_assign_pointer() in a do-while loop. Reported-by: Mathieu Desnoyers Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index abf44d89c6ce..fb8e6db511d7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -513,10 +513,10 @@ static inline void rcu_preempt_sleep_check(void) (_________p1); \ }) #define __rcu_assign_pointer(p, v, space) \ - ({ \ + do { \ smp_wmb(); \ (p) = (typeof(*v) __force space *)(v); \ - }) + } while (0) /** @@ -851,7 +851,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see - * any prior initialization. Returns the value assigned. + * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from From 2a3fa843b555d202e682bf08c65ee1a4a93c79cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 May 2012 11:58:36 -0700 Subject: [PATCH 239/612] rcu: Consolidate tree/tiny __rcu_read_{,un}lock() implementations The CONFIG_TREE_PREEMPT_RCU and CONFIG_TINY_PREEMPT_RCU versions of __rcu_read_lock() and __rcu_read_unlock() are identical, so this commit consolidates them into kernel/rcupdate.c. Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/rcupdate.c | 44 +++++++++++++++++++++++++++++++++++++ kernel/rcutiny_plugin.h | 47 +--------------------------------------- kernel/rcutree_plugin.h | 47 +--------------------------------------- 4 files changed, 47 insertions(+), 92 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index fb8e6db511d7..698555ebf49b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -147,6 +147,7 @@ extern void synchronize_sched(void); extern void __rcu_read_lock(void); extern void __rcu_read_unlock(void); +extern void rcu_read_unlock_special(struct task_struct *t); void synchronize_rcu(void); /* diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,6 +53,50 @@ #ifdef CONFIG_PREEMPT_RCU +/* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..a269b0da0eb6 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -526,24 +525,12 @@ void rcu_preempt_note_context_switch(void) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
*/ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -626,38 +613,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the rcu_preempt_ctrlblk structure, which is diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..4b6b17cdf66b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,6 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -232,18 +231,6 @@ static void rcu_preempt_note_context_switch(int cpu) local_irq_restore(flags); } -/* - * Tree-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Check for preempted RCU readers blocking the current grace period * for the specified rcu_node structure. If the caller needs a reliable @@ -310,7 +297,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -418,38 +405,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } } -/* - * Tree-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - barrier(); /* critical section before exit code. 
*/ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* From 28f5c693d03530eb15c5354f758b789189b98c37 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2012 14:25:58 -0700 Subject: [PATCH 240/612] rcu: Remove function versions of __kfree_rcu and __is_kfree_rcu_offset Commit d8169d4c (Make __kfree_rcu() less dependent on compiler choices) added cpp macro versions of __kfree_rcu() and __is_kfree_rcu_offset(), but failed to remove the old inline-function versions. This commit does this cleanup. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 698555ebf49b..31568c734525 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -916,24 +916,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = (typeof(*v) __force __rcu *)(v) -static __always_inline bool __is_kfree_rcu_offset(unsigned long offset) -{ - return offset < 4096; -} - -static __always_inline -void __kfree_rcu(struct rcu_head *head, unsigned long offset) -{ - typedef void (*rcu_callback)(struct rcu_head *); - - BUILD_BUG_ON(!__builtin_constant_p(offset)); - - /* See the kfree_rcu() header comment. */ - BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); - - kfree_call_rcu(head, (rcu_callback)offset); -} - /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kfree_rcu()? From 62fde6edf12b60fddb13a3f0a779c8be0bb7447e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2012 22:10:24 -0700 Subject: [PATCH 241/612] rcu: Make __call_rcu() handle invocation from idle Although __call_rcu() is handled correctly when called from a momentary non-idle period, if it is called on a CPU that RCU believes to be idle on RCU_FAST_NO_HZ kernels, the callback might be indefinitely postponed. This commit therefore ensures that RCU is aware of the new callback and has a chance to force the CPU out of dyntick-idle mode when a new callback is posted. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 13 ++++--------- kernel/rcutree.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 31568c734525..26f6417f0264 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -256,6 +256,10 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) +extern int rcu_is_cpu_idle(void); +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) */ + #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ @@ -267,15 +271,6 @@ static inline bool rcu_lockdep_current_cpu_online(void) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#ifdef CONFIG_PROVE_RCU -extern int rcu_is_cpu_idle(void); -#else /* !CONFIG_PROVE_RCU */ -static inline int rcu_is_cpu_idle(void) -{ - return 0; -} -#endif /* else !CONFIG_PROVE_RCU */ - static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89addada3e3a..a4a9c916ad36 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -585,8 +585,6 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * @@ -604,7 +602,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online? Disable preemption to avoid false positives @@ -645,9 +643,7 @@ bool rcu_lockdep_current_cpu_online(void) } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /** * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle @@ -1904,6 +1900,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle()) + invoke_rcu_core(); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); From a16b7a693430406dc229ab0c6b154f669a2031c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 May 2012 08:56:01 -0700 Subject: [PATCH 242/612] rcu: Prevent __call_rcu() from invoking RCU core on offline CPUs The __call_rcu() function will invoke the RCU core, for example, if it detects that the current CPU has too many callbacks. However, this can happen on an offline CPU that is on its way to the idle loop, in which case it is an error to invoke the RCU core, and the excess callbacks will be adopted in any case. This commit therefore adds checks to __call_rcu() for running on an offline CPU, refraining from invoking the RCU core in this case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4a9c916ad36..ceaa95923a87 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1904,11 +1904,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (rcu_is_cpu_idle()) + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) invoke_rcu_core(); - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { local_irq_restore(flags); return; } From 29154c57e35a191c83b19c61b1935c9f21957662 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 May 2012 03:21:48 -0700 Subject: [PATCH 243/612] rcu: Split RCU core processing out of __call_rcu() The __call_rcu() function is a bit overweight, so this commit splits it into actual enqueuing of and accounting for the callback (__call_rcu()) and associated RCU-core processing (__call_rcu_core()). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 96 ++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ceaa95923a87..70c4da7d2a97 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1861,6 +1861,56 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) +{ + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) + invoke_rcu_core(); + + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) + return; + + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. 
*/ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); +} + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp, bool lazy) @@ -1900,50 +1950,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - /* - * If called from an extended quiescent state, invoke the RCU - * core in order to force a re-evaluation of RCU's idleness. - */ - if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) - invoke_rcu_core(); - - /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ - if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { - local_irq_restore(flags); - return; - } - - /* - * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() - * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback - * is the only one waiting for a grace period to complete. - */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - - /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); - - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ - } else { - /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; - } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } From d7118175cce7e76b3292b60832813ef1f28b6523 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Jun 2012 07:12:43 -0700 Subject: [PATCH 244/612] rcu: Fix rcu_is_cpu_idle() #ifdef in TINY_RCU The rcu_is_cpu_idle() function is used if CONFIG_DEBUG_LOCK_ALLOC, but TINY_RCU defines it only when CONFIG_PROVE_RCU. This causes build failures when CONFIG_DEBUG_LOCK_ALLOC=y but CONFIG_PROVE_RCU=n. This commit therefore adjusts the #ifdefs for rcu_is_cpu_idle() so that it is defined when CONFIG_DEBUG_LOCK_ALLOC=y. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -172,7 +172,7 @@ void rcu_irq_enter(void) local_irq_restore(flags); } -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. 
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. Nested From 1c17e4d4437d8045a596d9f06c1558dc09e2b372 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jun 2012 10:43:16 +0200 Subject: [PATCH 245/612] rcu: Prevent uninitialized string in RCU CPU stall info An uninitialized string may be displayed at the end of the rcu_preempt detected stall info such as 0: (1 GPs behind) idle=075/140000000000000/0 =8?^D=8?^D ^^^^^^^^^^ if CONFIG_RCU_FAST_NO_HZ is not defined. This trivial patch clears the string in this case. Signed-off-by: Carsten Emde Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b6b17cdf66b..395cdd1e0634 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2224,6 +2224,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { + *cp = '\0'; } #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ From 95f0c1de3e6ed4383cc4b5f52ce4ecfb21026b49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2012 11:58:27 -0700 Subject: [PATCH 246/612] rcu: Disable preemption in rcu_blocking_is_gp() It is time to optimize CONFIG_TREE_PREEMPT_RCU's synchronize_rcu() for uniprocessor optimization, which means that rcu_blocking_is_gp() can no longer rely on RCU read-side critical sections having disabled preemption. This commit therefore disables preemption across rcu_blocking_is_gp()'s scan of the cpu_online_mask. (Updated from previous version to fix embarrassing bug spotted by Wu Fengguang.) Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 70c4da7d2a97..e000a623e635 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1981,28 +1981,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * occasionally incorrectly indicate that there are multiple CPUs online * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. - * - * Of course, sampling num_online_cpus() with preemption enabled can - * give erroneous results if there are concurrent CPU-hotplug operations. - * For example, given a demonic sequence of preemptions in num_online_cpus() - * and CPU-hotplug operations, there could be two or more CPUs online at - * all times, but num_online_cpus() might well return one (or even zero). - * - * However, all such demonic sequences require at least one CPU-offline - * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer - * is only a problem if there is an RCU read-side critical section executing - * throughout. But RCU-sched and RCU-bh read-side critical sections - * disable either preemption or bh, which prevents a CPU from going offline. - * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return - * that there is only one CPU when in fact there was more than one throughout - * is when there were no RCU readers in the system. If there are no - * RCU readers, the grace period by definition can be of zero length, - * regardless of the number of online CPUs. */ static inline int rcu_blocking_is_gp(void) { + int ret; + might_sleep(); /* Check for RCU read-side critical section. 
*/ - return num_online_cpus() <= 1; + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; } /** From cf01537ecf192d9ff15c32a355db5d5af22bea4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 11:26:42 -0700 Subject: [PATCH 247/612] rcu: Add check for CPUs going offline with callbacks queued If a CPU goes offline with callbacks queued, those callbacks might be indefinitely postponed, which can result in a system hang. This commit therefore inserts warnings for this condition. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e000a623e635..95c7b61e77e4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1509,6 +1509,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ From bf1304e9cd755be7814ac8365834b7a1d0f06b58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:55:39 -0700 Subject: [PATCH 248/612] rcu: Dump only the current CPU's buffers for idle-entry/exit warnings Problems in RCU idle entry and exit are almost always confined to the offending CPU. This commit therefore switches ftrace_dump() from DUMP_ALL to DUMP_ORIG. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..6eb48f13eeeb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -358,7 +358,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -468,7 +468,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ From 28f8555364ef5b54b21251c5c8022109a70626e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 15:37:48 -0700 Subject: [PATCH 249/612] rcu: The rcu_needs_cpu() function is not a quiescent state The TINY_PREEMPT_RCU() function rcu_preempt_needs_cpu(), which is called from rcu_needs_cpu(), assumes that it is in a quiescent state with respect to the CPU. This is no longer the case. This commit therefore updates rcu_preempt_needs_cpu() to make it aware that it is not running in a quiescent state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutiny_plugin.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..849ede51c12d 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -846,8 +846,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ int rcu_preempt_needs_cpu(void) { - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } From e84c48ae3024ac2f14ed1c3671e5ea37c60fb838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Jun 2012 20:45:10 -0700 Subject: [PATCH 250/612] rcu: Round FAST_NO_HZ lazy timeout to nearest second Currently, if several CPUs in the same package have all lazy RCU callbacks, their wakeups will be uncorrelated. If all the CPUs are in the same power domain (as is often the case), this will result in unnecessary power-ups of the package. This commit therefore uses round_jiffies() to round the timeouts to a second boundary, increasing the odds that they can be coalesced with each other or with other timeouts. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..c28d25522a81 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1968,7 +1968,7 @@ static void rcu_idle_count_callbacks_posted(void) */ #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ /* @@ -2047,10 +2047,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return 1; } /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - *delta_jiffies = RCU_IDLE_GP_DELAY; - else - *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, + RCU_IDLE_GP_DELAY) - jiffies; + } else { + *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + } return 0; } @@ -2187,10 +2190,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_GP_DELAY; + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); } else { rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_LAZY_GP_DELAY; + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); trace_rcu_prep_idle("Dyntick with lazy callbacks"); } tp = &rdtp->idle_gp_timer; From 4fa3b6cb1bc8c14b81b4c8ffdfd3f2500a7e9367 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Jun 2012 15:53:53 -0700 Subject: [PATCH 251/612] rcu: Fix qlen_lazy breakage Commit d8169d4c (Make __kfree_rcu() less dependent on compiler choices) created a macro out of an inline function in order to avoid build breakage for certain combinations of gcc flags. Unfortunately, it also converted a kfree_call_rcu() to a call_rcu(), which made the rcu_data structure's ->qlen_lazy field lose counts. This commit therefore changes the call_rcu() back to kfree_call_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9cac722b169c..46d45e0f9134 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -935,7 +935,7 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) #define __kfree_rcu(head, offset) \ do { \ BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \ + kfree_call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \ } while (0) /** From 9d2ad24306f2fafc3612e5a216aab31f9e56e879 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Jun 2012 10:15:02 -0700 Subject: [PATCH 252/612] rcu: Make RCU_FAST_NO_HZ respect nohz= boot parameter If the nohz= boot parameter disables nohz, then RCU_FAST_NO_HZ needs to also disable itself. This commit therefore checks for tick_nohz_enabled being zero, disabling rcu_prepare_for_idle() if so. This commit assumes that tick_nohz_enabled can change at runtime: If this is not the case, then a simpler approach suffices. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 15 +++++++++++++++ kernel/time/tick-sched.c | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..e978845c6800 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -97,6 +97,7 @@ struct rcu_dynticks { /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c28d25522a81..3508000e077d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1971,6 +1971,8 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +extern int tick_nohz_enabled; + /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2112,6 +2114,7 @@ static void rcu_cleanup_after_idle(int cpu) del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); + rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); } /* @@ -2137,6 +2140,18 @@ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu)) + invoke_rcu_core(); /* force nohz to see update. */ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; /* * If this is an idle re-entry, for example, due to use of diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..66ff07f6184c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) /* * NO HZ enabled ? 
*/ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; /* * Enable / Disable tickless mode From 8bea2bd37df08aaa599aa361a9f8b836ba98e554 Mon Sep 17 00:00:00 2001 From: Stanislaw Ledwon Date: Mon, 18 Jun 2012 15:20:00 +0200 Subject: [PATCH 253/612] usb: Add support for root hub port status CAS The host controller port status register supports a CAS (Cold Attach Status) bit. This bit can be set when a USB 3.0 device is connected while the system is in an Sx state. When the system wakes to S0, the port status is reported with the CAS bit set and the port can't be used by any device. When the CAS bit is set, the port should be reset by warm reset. This was not supported by the xhci driver. The issue was found when a pendrive was connected to a suspended platform. The link state of "Compliance Mode" was reported together with the CAS bit. This link state was also not supported by xhci and core/hub.c. The CAS bit is defined only for the xhci root hub port and is not supported on regular hubs. The link status is used to force a warm reset on the port. Make the USB core issue a warm reset when the port is in either the 'inactive' or 'compliance mode' link state. Change the xHCI driver to report 'compliance mode' when the CAS is set. This forces a warm reset on the root hub port. This patch should be backported to stable kernels as old as 3.2, that contain the commit 10d674a82e553cb8a1f41027bb3c3e309b3f6804 "USB: When hot reset for USB3 fails, try warm reset." Signed-off-by: Stanislaw Ledwon Signed-off-by: Sarah Sharp Acked-by: Andiry Xu Cc: stable@vger.kernel.org --- drivers/usb/core/hub.c | 18 ++++++++------- drivers/usb/host/xhci-hub.c | 44 ++++++++++++++++++++++++++++++----- drivers/usb/host/xhci.h | 6 ++++- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 25a7422ee657..8fb484984c86 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2324,12 +2324,16 @@ static unsigned hub_is_wusb(struct usb_hub *hub) static int hub_port_reset(struct usb_hub *hub, int port1, struct usb_device *udev, unsigned int delay, bool warm); -/* Is a USB 3.0 port in the Inactive state? */ +/* Is a USB 3.0 port in the Inactive or Compliance Mode state? + * Port warm reset is required to recover + */ +static bool hub_port_warm_reset_required(struct usb_hub *hub, u16 portstatus) { return hub_is_superspeed(hub->hdev) && (((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_SS_INACTIVE) || ((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_COMP_MOD)) ; } static int hub_port_wait_reset(struct usb_hub *hub, int port1, @@ -2365,7 +2369,7 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1, * * See https://bugzilla.kernel.org/show_bug.cgi?id=41752 */ - if (hub_port_inactive(hub, portstatus)) { + if (hub_port_warm_reset_required(hub, portstatus)) { int ret; if ((portchange & USB_PORT_STAT_C_CONNECTION)) @@ -4408,9 +4412,7 @@ static void hub_events(void) /* Warm reset a USB3 protocol port if it's in * SS.Inactive state.
*/ - if (hub_is_superspeed(hub->hdev) && - (portstatus & USB_PORT_STAT_LINK_STATE) - == USB_SS_PORT_LS_SS_INACTIVE) { + if (hub_port_warm_reset_required(hub, portstatus)) { dev_dbg(hub_dev, "warm reset port %d\n", i); hub_port_reset(hub, i, NULL, HUB_BH_RESET_TIME, true); diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 2732ef660c5c..7b01094d7993 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -462,6 +462,42 @@ void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array, } } +/* Updates Link Status for super Speed port */ +static void xhci_hub_report_link_state(u32 *status, u32 status_reg) +{ + u32 pls = status_reg & PORT_PLS_MASK; + + /* resume state is a xHCI internal state. + * Do not report it to usb core. + */ + if (pls == XDEV_RESUME) + return; + + /* When the CAS bit is set then warm reset + * should be performed on port + */ + if (status_reg & PORT_CAS) { + /* The CAS bit can be set while the port is + * in any link state. + * Only roothubs have CAS bit, so we + * pretend to be in compliance mode + * unless we're already in compliance + * or the inactive state. + */ + if (pls != USB_SS_PORT_LS_COMP_MOD && + pls != USB_SS_PORT_LS_SS_INACTIVE) { + pls = USB_SS_PORT_LS_COMP_MOD; + } + /* Return also connection bit - + * hub state machine resets port + * when this bit is set. + */ + pls |= USB_PORT_STAT_CONNECTION; + } + /* update status field */ + *status |= pls; +} + int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength) { @@ -606,13 +642,9 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, else status |= USB_PORT_STAT_POWER; } - /* Port Link State */ + /* Update Port Link State for super speed ports */ if (hcd->speed == HCD_USB3) { - /* resume state is a xHCI internal state. - * Do not report it to usb core. - */ - if ((temp & PORT_PLS_MASK) != XDEV_RESUME) - status |= (temp & PORT_PLS_MASK); + xhci_hub_report_link_state(&status, temp); } if (bus_state->port_c_suspend & (1 << wIndex)) status |= 1 << USB_PORT_FEAT_C_SUSPEND; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index de3d6e3e57be..55c0785810c9 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -341,7 +341,11 @@ struct xhci_op_regs { #define PORT_PLC (1 << 22) /* port configure error change - port failed to configure its link partner */ #define PORT_CEC (1 << 23) -/* bit 24 reserved */ +/* Cold Attach Status - xHC can set this bit to report device attached during + * Sx state. Warm port reset should be performed to clear this bit and move port + * to connected state. + */ +#define PORT_CAS (1 << 24) /* wake on connect (enable) */ #define PORT_WKCONN_E (1 << 25) /* wake on disconnect (enable) */ From 0d9f78a92ef5e97d9fe51d9215ebe22f6f0d289d Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Thu, 21 Jun 2012 16:28:30 -0700 Subject: [PATCH 254/612] xhci: Fix hang on back-to-back Set TR Deq Ptr commands. The Microsoft LifeChat 3000 USB headset was causing a very reproducible hang whenever it was plugged in. At first, I thought the host controller was producing bad transfer events, because the log was filled with errors like: xhci_hcd 0000:00:14.0: ERROR Transfer event TRB DMA ptr not part of current TD However, it turned out to be an xHCI driver bug in the ring expansion patches.
The bug is triggered when there are two ring segments, and a TD that ends just before a link TRB, like so:

 ______________                  _____________
|              |   --->         | setup TRB B |
|______________|  |             |_____________|
|              |  |             | data TRB B  |
|______________|  |             |_____________|
| setup TRB A  | <-- deq  |     | data TRB B  |
|______________|  |             |_____________|
| data TRB A   |  |             |             | <-- enq, deq''
|______________|  |             |_____________|
| status TRB A |  |             |             |
|______________|  |             |_____________|
| link TRB     |---             | link TRB    |
|______________| <--- deq'      |_____________|

TD A (the first control transfer) stalls on the data phase. That halts the ring. The xHCI driver moves the hardware dequeue pointer to the first TRB after the stalled transfer, which happens to be the link TRB. Once the Set TR dequeue pointer command completes, the function update_ring_for_set_deq_completion runs. That function is supposed to update the xHCI driver's dequeue pointer to match the internal hardware dequeue pointer. On the first call this would work fine, and the software dequeue pointer would move to deq'. However, if the transfer immediately after that stalled (TD B in this case), another Set TR Dequeue command would be issued. That would move the hardware dequeue pointer to deq''. Once that command completed, update_ring_for_set_deq_completion would run again. The original code would unconditionally increment the software dequeue pointer, which moved the pointer off the ring segment into la-la-land. The while loop would happily increment the dequeue pointer (possibly wrapping it) until it matched the hardware pointer value. The while loop would also access all the memory in between the first ring segment and the second ring segment to determine if it was a link TRB. This could cause general protection faults, although it was unlikely because the ring segments came from a DMA pool, and would often have consecutive memory addresses. If nothing in that space looked like a link TRB, the deq_seg pointer for the ring would remain on the first segment. Thus, the deq_seg and the software dequeue pointer would get out of sync. When the next transfer event came in after the stalled transfer, the xHCI driver code would attempt to convert the software dequeue pointer into a DMA address in order to compare the DMA address for the completed transfer. Since the deq_seg and the dequeue pointer were out of sync, xhci_trb_virt_to_dma would return NULL. The transfer event would get ignored, the transfer would eventually time out, and we would mistakenly convert the finished transfer to no-op TRBs. Some kernel driver (maybe xHCI?) would then get stuck in an infinite loop in interrupt context, and the whole machine would hang. This patch should be backported to kernels as old as 3.4, that contain the commit b008df60c6369ba0290fa7daa177375407a12e07 "xHCI: count free TRBs on transfer ring" Signed-off-by: Sarah Sharp Cc: Andiry Xu Cc: stable@vger.kernel.org --- drivers/usb/host/xhci-ring.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 23b4aefd1036..8275645889da 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -885,6 +885,17 @@ static void update_ring_for_set_deq_completion(struct xhci_hcd *xhci, num_trbs_free_temp = ep_ring->num_trbs_free; dequeue_temp = ep_ring->dequeue; + /* If we get two back-to-back stalls, and the first stalled transfer + * ends just before a link TRB, the dequeue pointer will be left on + * the link TRB by the code in the while loop.
+	 * the dequeue pointer one segment further, or we'll jump off
+	 * the segment into la-la-land.
+	 */
+	if (last_trb(xhci, ep_ring, ep_ring->deq_seg, ep_ring->dequeue)) {
+		ep_ring->deq_seg = ep_ring->deq_seg->next;
+		ep_ring->dequeue = ep_ring->deq_seg->trbs;
+	}
+
 	while (ep_ring->dequeue != dev->eps[ep_index].queued_deq_ptr) {
 		/* We have more usable TRBs */
 		ep_ring->num_trbs_free++;

From 7b86cef34afceeecf57d4d07f3cfab06f8b60b13 Mon Sep 17 00:00:00 2001
From: Jon Hunter
Date: Tue, 3 Jul 2012 11:05:50 -0500
Subject: [PATCH 255/612] gpio/omap: fix invalid context restore of gpio bank-0

Currently the gpio _runtime_resume/suspend functions are calling the get_context_loss_count() platform function if the function is populated for a gpio bank. This function is used to determine if the gpio bank logic state needs to be restored due to a power transition. This function will be populated for all banks, but it should only be called for banks that have the "loses_context" variable set. It is pointless to call this if loses_context is false, as we know the context will never be lost and will not need restoring.

For all OMAP2+ devices gpio bank-0 is in an always-on power domain and so will never lose context. We found that get_context_loss_count() was being called for bank-0 during the probe and returning 1 instead of 0, indicating that the context had been lost. This was causing the context restore function to be called at probe time for this bank and, because the context had never been saved, was restoring an invalid state. This ultimately resulted in a crash [1]. This issue is a regression that was exposed by commit 1b1287032 (gpio/omap: fix missing check in *_runtime_suspend()).

There are multiple bugs here that need to be addressed ...

1. Why does the always-on power domain return a context loss count of 1? This needs to be fixed in the power domain code [2]. However, the gpio driver should not assume the loss count is 0 to begin with.

2. The OMAP gpio driver should never call get_context_loss_count() for a gpio bank in an always-on domain. This is pointless and adds unnecessary overhead.

3. The OMAP gpio driver assumes that the initial power domain context loss count will be 0 at the time the gpio driver is probed. However, it is possible that this is not the case and an invalid context restore could be performed during the probe. To avoid this, only populate the get_context_loss_count() function pointer after the initial call to pm_runtime_get() has occurred. This will ensure that the first pm_runtime_put() initialises the loss count correctly.

This patch addresses issues 2 and 3 above.
[1] http://marc.info/?l=linux-omap&m=134065775323775&w=2 [2] http://marc.info/?l=linux-omap&m=134100413303810&w=2 Cc: Kevin Hilman Cc: Grant Likely Cc: Linus Walleij Cc: Tarun Kanti DebBarma Cc: Franky Lin Cc: Santosh Shilimkar Cc: NeilBrown Reported-by: Franky Lin Reviewed-by: Santosh Shilimkar Tested-by: Franky Lin Acked-by: Kevin Hilman Tested-by: NeilBrown Signed-off-by: Jon Hunter Signed-off-by: Kevin Hilman --- drivers/gpio/gpio-omap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index ff213e70fa5a..4fbc208c32cf 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -1091,7 +1091,6 @@ static int __devinit omap_gpio_probe(struct platform_device *pdev) bank->is_mpuio = pdata->is_mpuio; bank->non_wakeup_gpios = pdata->non_wakeup_gpios; bank->loses_context = pdata->loses_context; - bank->get_context_loss_count = pdata->get_context_loss_count; bank->regs = pdata->regs; #ifdef CONFIG_OF_GPIO bank->chip.of_node = of_node_get(node); @@ -1145,6 +1144,9 @@ static int __devinit omap_gpio_probe(struct platform_device *pdev) omap_gpio_chip_init(bank); omap_gpio_show_rev(bank); + if (bank->loses_context) + bank->get_context_loss_count = pdata->get_context_loss_count; + pm_runtime_put(bank->dev); list_add_tail(&bank->node, &omap_gpio_list); From ea8e867d9b505b1c852f9932ba75504bc4eebeaa Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Tue, 8 May 2012 18:11:55 +0530 Subject: [PATCH 256/612] MIPS: Netlogic: Fix PCIX irq on XLR chips The correct irq is PIC_PCIX_IRQ Signed-off-by: Jayachandran C Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3750/ Signed-off-by: Ralf Baechle --- arch/mips/pci/pci-xlr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 1644805a6730..50ff4dc28b4f 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -327,7 +327,7 @@ static int __init pcibios_init(void) } } else { /* XLR PCI controller ACK */ - irq_set_handler_data(PIC_PCIE_XLSB0_LINK3_IRQ, xlr_pci_ack); + irq_set_handler_data(PIC_PCIX_IRQ, xlr_pci_ack); } return 0; From 249e2a38fbd28ffaadba112290742ada16946900 Mon Sep 17 00:00:00 2001 From: Ganesan Ramalingam Date: Tue, 8 May 2012 18:11:56 +0530 Subject: [PATCH 257/612] MIPS: Netlogic: MSI enable fix for XLS MSI interrupts do not work on XLS after commit a776c49 ( "PCI: msi: Disable msi interrupts when we initialize a pci device" ) because the change disables MSI interrupts on the XLS PCIe bridges during the PCI enumeration. Fix this by enabling MSI interrupts on the bridge in the arch_setup_msi_irq() function. A new function xls_get_pcie_link() has been introduced to get the PCI device corresponding to the top level PCIe bridge on which MSI has to be enabled. Also, update get_irq_vector() to use the new xls_get_pcie_link() function and PCI_SLOT() macro for determining the IRQ of PCI devices. 
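For illustration, the bus walk done by xls_get_pcie_link() can be modeled in plain user-space C. This is a minimal sketch with simplified stand-in types (not the kernel's struct pci_bus/struct pci_dev), assuming only that each bus records its parent bus and the bridge device that leads to it:

#include <stdio.h>
#include <stddef.h>

struct bus;

struct dev {
	struct bus *bus;	/* the bus this device sits on */
	const char *name;
};

struct bus {
	struct bus *parent;	/* NULL for the root bus */
	struct dev *self;	/* bridge device leading to this bus */
	int number;
};

/* Walk up until the parent is bus 0; the bridge into that bus is the link. */
static struct dev *top_level_link(const struct dev *dev)
{
	struct bus *bus = dev->bus, *p;

	for (p = bus->parent; p && p->number != 0; p = p->parent)
		bus = p;

	return p ? bus->self : NULL;
}

int main(void)
{
	struct bus root = { NULL, NULL, 0 };
	struct dev link = { &root, "link0" };
	struct bus child = { &root, &link, 1 };
	struct dev nic = { &child, "nic" };
	struct dev *l = top_level_link(&nic);

	printf("%s is behind %s\n", nic.name, l ? l->name : "(none)");
	return 0;
}

A device sitting directly on bus 0 (a link bridge itself) makes the loop exit with p == NULL, so the function returns NULL for it, mirroring the patch.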
Signed-off-by: Ganesan Ramalingam Signed-off-by: Jayachandran C Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3753/ Signed-off-by: Ralf Baechle --- arch/mips/pci/pci-xlr.c | 59 ++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 50ff4dc28b4f..172af1cd5867 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -156,35 +157,55 @@ struct pci_controller nlm_pci_controller = { .io_offset = 0x00000000UL, }; +/* + * The top level PCIe links on the XLS PCIe controller appear as + * bridges. Given a device, this function finds which link it is + * on. + */ +static struct pci_dev *xls_get_pcie_link(const struct pci_dev *dev) +{ + struct pci_bus *bus, *p; + + /* Find the bridge on bus 0 */ + bus = dev->bus; + for (p = bus->parent; p && p->number != 0; p = p->parent) + bus = p; + + return p ? bus->self : NULL; +} + static int get_irq_vector(const struct pci_dev *dev) { + struct pci_dev *lnk; + if (!nlm_chip_is_xls()) - return PIC_PCIX_IRQ; /* for XLR just one IRQ*/ + return PIC_PCIX_IRQ; /* for XLR just one IRQ */ /* * For XLS PCIe, there is an IRQ per Link, find out which * link the device is on to assign interrupts - */ - if (dev->bus->self == NULL) + */ + lnk = xls_get_pcie_link(dev); + if (lnk == NULL) return 0; - switch (dev->bus->self->devfn) { - case 0x0: + switch (PCI_SLOT(lnk->devfn)) { + case 0: return PIC_PCIE_LINK0_IRQ; - case 0x8: + case 1: return PIC_PCIE_LINK1_IRQ; - case 0x10: + case 2: if (nlm_chip_is_xls_b()) return PIC_PCIE_XLSB0_LINK2_IRQ; else return PIC_PCIE_LINK2_IRQ; - case 0x18: + case 3: if (nlm_chip_is_xls_b()) return PIC_PCIE_XLSB0_LINK3_IRQ; else return PIC_PCIE_LINK3_IRQ; } - WARN(1, "Unexpected devfn %d\n", dev->bus->self->devfn); + WARN(1, "Unexpected devfn %d\n", lnk->devfn); return 0; } @@ -202,7 +223,27 @@ void arch_teardown_msi_irq(unsigned int irq) int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_msg msg; + struct pci_dev *lnk; int irq, ret; + u16 val; + + /* MSI not supported on XLR */ + if (!nlm_chip_is_xls()) + return 1; + + /* + * Enable MSI on the XLS PCIe controller bridge which was disabled + * at enumeration, the bridge MSI capability is at 0x50 + */ + lnk = xls_get_pcie_link(dev); + if (lnk == NULL) + return 1; + + pci_read_config_word(lnk, 0x50 + PCI_MSI_FLAGS, &val); + if ((val & PCI_MSI_FLAGS_ENABLE) == 0) { + val |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(lnk, 0x50 + PCI_MSI_FLAGS, val); + } irq = get_irq_vector(dev); if (irq <= 0) From b876c1a0bcfe68a5eaf03f325cb2a09822bf11f2 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Tue, 3 Jul 2012 19:04:02 +0200 Subject: [PATCH 258/612] MIPS: Netlogic: Fix TLB size of boot CPU. Starting other threads in the core will change the number of TLB entries of a CPU. Re-calculate current_cpu_data.tlbsize on the boot cpu after enabling and waking up other threads. The secondary CPUs do not need this logic because the threads are enabled on the secondary cores at wakeup and before cpu_probe. 
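As an aside, the TLB-size line being moved is a plain bitfield decode of the Config6 register: bits 31..16 hold "number of TLB entries - 1". A minimal sketch of just that arithmetic (the register value below is made up):

#include <stdio.h>
#include <stdint.h>

/* Config6[31:16] = number of TLB entries - 1 */
static unsigned int tlb_entries(uint32_t config6)
{
	return ((config6 >> 16) & 0xffff) + 1;
}

int main(void)
{
	uint32_t config6 = 0x007f0024;	/* hypothetical register reading */

	printf("%u TLB entries\n", tlb_entries(config6));	/* prints 128 */
	return 0;
}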
Signed-off-by: Jayachandran C Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3751/ Signed-off-by: Ralf Baechle --- arch/mips/netlogic/xlp/setup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c index acb677a1227c..b3df7c2aad1e 100644 --- a/arch/mips/netlogic/xlp/setup.c +++ b/arch/mips/netlogic/xlp/setup.c @@ -82,8 +82,10 @@ void __init prom_free_prom_memory(void) void xlp_mmu_init(void) { + /* enable extended TLB and Large Fixed TLB */ write_c0_config6(read_c0_config6() | 0x24); - current_cpu_data.tlbsize = ((read_c0_config6() >> 16) & 0xffff) + 1; + + /* set page mask of Fixed TLB in config7 */ write_c0_config7(PM_DEFAULT_MASK >> (13 + (ffz(PM_DEFAULT_MASK >> 13) / 2))); } @@ -100,6 +102,10 @@ void __init prom_init(void) nlm_common_ebase = read_c0_ebase() & (~((1 << 12) - 1)); #ifdef CONFIG_SMP nlm_wakeup_secondary_cpus(0xffffffff); + + /* update TLB size after waking up threads */ + current_cpu_data.tlbsize = ((read_c0_config6() >> 16) & 0xffff) + 1; + register_smp_ops(&nlm_smp_ops); #endif } From d92d95b6bf2722ffa0fefa7651c51bf336743dd7 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 2 Jul 2012 19:21:06 -0700 Subject: [PATCH 259/612] regulator: Fix recursive mutex lockdep warning A recursive lockdep warning occurs if you call regulator_set_optimum_mode() on a regulator with a supply because there is no nesting annotation for the rdev->mutex. To avoid this warning, get the supply's load before locking the regulator's mutex to avoid grabbing the same class of lock twice. ============================================= [ INFO: possible recursive locking detected ] 3.4.0 #3257 Tainted: G W --------------------------------------------- swapper/0/1 is trying to acquire lock: (&rdev->mutex){+.+.+.}, at: [] regulator_get_voltage+0x18/0x38 but task is already holding lock: (&rdev->mutex){+.+.+.}, at: [] regulator_set_optimum_mode+0x24/0x224 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&rdev->mutex); lock(&rdev->mutex); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by swapper/0/1: #0: (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x40/0x8c #1: (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x50/0x8c #2: (&rdev->mutex){+.+.+.}, at: [] regulator_set_optimum_mode+0x24/0x224 stack backtrace: [] (unwind_backtrace+0x0/0x12c) from [] (validate_chain+0x760/0x1080) [] (validate_chain+0x760/0x1080) from [] (__lock_acquire+0x950/0xa10) [] (__lock_acquire+0x950/0xa10) from [] (lock_acquire+0x18c/0x1e8) [] (lock_acquire+0x18c/0x1e8) from [] (mutex_lock_nested+0x68/0x3c4) [] (mutex_lock_nested+0x68/0x3c4) from [] (regulator_get_voltage+0x18/0x38) [] (regulator_get_voltage+0x18/0x38) from [] (regulator_set_optimum_mode+0xa4/0x224) ... 
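The shape of the fix is a general lock-ordering pattern: when two objects share a lock class, read what you need from the other object before taking your own lock, instead of nesting the two acquisitions. A minimal user-space sketch with pthreads (hypothetical regulator-like structs, not the kernel API):

#include <pthread.h>
#include <stdio.h>

struct reg {
	pthread_mutex_t lock;
	int microvolts;
	struct reg *supply;	/* optional upstream regulator */
};

static int get_voltage(struct reg *r)
{
	int uv;

	pthread_mutex_lock(&r->lock);
	uv = r->microvolts;
	pthread_mutex_unlock(&r->lock);
	return uv;
}

static void set_load(struct reg *r)
{
	int input_uv = 0;

	/* query the supply BEFORE taking our own lock: both locks are in
	 * the same "class", so nesting them trips lockdep */
	if (r->supply)
		input_uv = get_voltage(r->supply);

	pthread_mutex_lock(&r->lock);
	printf("picking a mode for input of %d uV\n", input_uv);
	pthread_mutex_unlock(&r->lock);
}

int main(void)
{
	struct reg parent = { PTHREAD_MUTEX_INITIALIZER, 3300000, NULL };
	struct reg child = { PTHREAD_MUTEX_INITIALIZER, 1800000, &parent };

	set_load(&child);
	return 0;
}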
Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- drivers/regulator/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 09a737c868b5..8b4b3829d9e7 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2519,9 +2519,12 @@ int regulator_set_optimum_mode(struct regulator *regulator, int uA_load) { struct regulator_dev *rdev = regulator->rdev; struct regulator *consumer; - int ret, output_uV, input_uV, total_uA_load = 0; + int ret, output_uV, input_uV = 0, total_uA_load = 0; unsigned int mode; + if (rdev->supply) + input_uV = regulator_get_voltage(rdev->supply); + mutex_lock(&rdev->mutex); /* @@ -2554,10 +2557,7 @@ int regulator_set_optimum_mode(struct regulator *regulator, int uA_load) goto out; } - /* get input voltage */ - input_uV = 0; - if (rdev->supply) - input_uV = regulator_get_voltage(rdev->supply); + /* No supply? Use constraint voltage */ if (input_uV <= 0) input_uV = rdev->constraints->input_uV; if (input_uV <= 0) { From e84c282b40251f314c429f39b044785e323f2648 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 May 2012 14:45:21 +0900 Subject: [PATCH 260/612] tools lib traceevent: Let filtering numbers by string use function names As a pointer can be converted into a function name, let the filters work with the function name as well as with the pointer number. If the comparison expects a string, then convert numbers into functions, but only when the number is the same size as a long. Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/n/tip-oxsa1qkr2eq7u8d7r0aapedu@git.kernel.org Signed-off-by: Namhyung Kim --- tools/lib/traceevent/parse-filter.c | 45 ++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index dfcfe2c131de..80d872a81f26 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1710,18 +1710,43 @@ static int test_num(struct event_format *event, static const char *get_field_str(struct filter_arg *arg, struct pevent_record *record) { - const char *val = record->data + arg->str.field->offset; + struct event_format *event; + struct pevent *pevent; + unsigned long long addr; + const char *val = NULL; + char hex[64]; - /* - * We need to copy the data since we can't be sure the field - * is null terminated. - */ - if (*(val + arg->str.field->size - 1)) { - /* copy it */ - memcpy(arg->str.buffer, val, arg->str.field->size); - /* the buffer is already NULL terminated */ - val = arg->str.buffer; + /* If the field is not a string convert it */ + if (arg->str.field->flags & FIELD_IS_STRING) { + val = record->data + arg->str.field->offset; + + /* + * We need to copy the data since we can't be sure the field + * is null terminated. 
+ */ + if (*(val + arg->str.field->size - 1)) { + /* copy it */ + memcpy(arg->str.buffer, val, arg->str.field->size); + /* the buffer is already NULL terminated */ + val = arg->str.buffer; + } + + } else { + event = arg->str.field->event; + pevent = event->pevent; + addr = get_value(event, arg->str.field, record); + + if (arg->str.field->flags & (FIELD_IS_POINTER | FIELD_IS_LONG)) + /* convert to a kernel symbol */ + val = pevent_find_function(pevent, addr); + + if (val == NULL) { + /* just use the hex of the string name */ + snprintf(hex, 64, "0x%llx", addr); + val = hex; + } } + return val; } From c2e6dc2b268cca44d522b2ee86147f0d30d7e3e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 15 Nov 2011 18:47:48 -0500 Subject: [PATCH 261/612] tools lib traceevent: Add support for "%.*s" in bprintk events The arg notation of '*' in bprintks is not handled by the parser. Implement it so that they show up properly in the output and do not kill the tracer from reporting events. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/n/tip-t0ctq7t1xz3ud6wv4v886jou@git.kernel.org Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 46 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 83f0a8add177..0644c2a23ad6 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3370,7 +3370,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, break; } case PRINT_BSTRING: - trace_seq_printf(s, format, arg->string.string); + print_str_to_seq(s, format, len_arg, arg->string.string); break; case PRINT_OP: /* @@ -3471,6 +3471,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc unsigned long long ip, val; char *ptr; void *bptr; + int vsize; field = pevent->bprint_buf_field; ip_field = pevent->bprint_ip_field; @@ -3519,6 +3520,8 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc goto process_again; case '0' ... '9': goto process_again; + case '.': + goto process_again; case 'p': ls = 1; /* fall through */ @@ -3526,23 +3529,29 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc case 'u': case 'x': case 'i': + switch (ls) { + case 0: + vsize = 4; + break; + case 1: + vsize = pevent->long_size; + break; + case 2: + vsize = 8; + default: + vsize = ls; /* ? */ + break; + } + /* fall through */ + case '*': + if (*ptr == '*') + vsize = 4; + /* the pointers are always 4 bytes aligned */ bptr = (void *)(((unsigned long)bptr + 3) & ~3); - switch (ls) { - case 0: - ls = 4; - break; - case 1: - ls = pevent->long_size; - break; - case 2: - ls = 8; - default: - break; - } - val = pevent_read_number(pevent, bptr, ls); - bptr += ls; + val = pevent_read_number(pevent, bptr, vsize); + bptr += vsize; arg = alloc_arg(); arg->next = NULL; arg->type = PRINT_ATOM; @@ -3550,6 +3559,13 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc sprintf(arg->atom.atom, "%lld", val); *next = arg; next = &arg->next; + /* + * The '*' case means that an arg is used as the length. + * We need to continue to figure out for what. 
+ */ + if (*ptr == '*') + goto process_again; + break; case 's': arg = alloc_arg(); From 0866a97eb7562409ba792f5e904841dd8e23c8b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 May 2012 14:52:40 +0900 Subject: [PATCH 262/612] tools lib traceevent: Add support to show migrate disable counter The RT kernel added a migrate disable counter in all events. Add support to show this in the latency format. Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/n/tip-l6ulxyda952g7kua4pfsh73k@git.kernel.org Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 57 ++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 0644c2a23ad6..872dd35db051 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -2884,7 +2884,7 @@ static int get_common_info(struct pevent *pevent, event = pevent->events[0]; field = pevent_find_common_field(event, type); if (!field) - die("field '%s' not found", type); + return -1; *offset = field->offset; *size = field->size; @@ -2935,15 +2935,16 @@ static int parse_common_flags(struct pevent *pevent, void *data) static int parse_common_lock_depth(struct pevent *pevent, void *data) { - int ret; + return __parse_common(pevent, data, + &pevent->ld_size, &pevent->ld_offset, + "common_lock_depth"); +} - ret = __parse_common(pevent, data, - &pevent->ld_size, &pevent->ld_offset, - "common_lock_depth"); - if (ret < 0) - return -1; - - return ret; +static int parse_common_migrate_disable(struct pevent *pevent, void *data) +{ + return __parse_common(pevent, data, + &pevent->ld_size, &pevent->ld_offset, + "common_migrate_disable"); } static int events_id_cmp(const void *a, const void *b); @@ -3988,10 +3989,13 @@ void pevent_data_lat_fmt(struct pevent *pevent, struct trace_seq *s, struct pevent_record *record) { static int check_lock_depth = 1; + static int check_migrate_disable = 1; static int lock_depth_exists; + static int migrate_disable_exists; unsigned int lat_flags; unsigned int pc; int lock_depth; + int migrate_disable; int hardirq; int softirq; void *data = record->data; @@ -3999,18 +4003,26 @@ void pevent_data_lat_fmt(struct pevent *pevent, lat_flags = parse_common_flags(pevent, data); pc = parse_common_pc(pevent, data); /* lock_depth may not always exist */ - if (check_lock_depth) { - struct format_field *field; - struct event_format *event; - - check_lock_depth = 0; - event = pevent->events[0]; - field = pevent_find_common_field(event, "common_lock_depth"); - if (field) - lock_depth_exists = 1; - } if (lock_depth_exists) lock_depth = parse_common_lock_depth(pevent, data); + else if (check_lock_depth) { + lock_depth = parse_common_lock_depth(pevent, data); + if (lock_depth < 0) + check_lock_depth = 0; + else + lock_depth_exists = 1; + } + + /* migrate_disable may not always exist */ + if (migrate_disable_exists) + migrate_disable = parse_common_migrate_disable(pevent, data); + else if (check_migrate_disable) { + migrate_disable = parse_common_migrate_disable(pevent, data); + if (migrate_disable < 0) + check_migrate_disable = 0; + else + migrate_disable_exists = 1; + } hardirq = lat_flags & TRACE_FLAG_HARDIRQ; softirq = lat_flags & TRACE_FLAG_SOFTIRQ; @@ -4029,6 +4041,13 @@ void pevent_data_lat_fmt(struct pevent *pevent, else trace_seq_putc(s, '.'); + if (migrate_disable_exists) { + if (migrate_disable < 0) + trace_seq_putc(s, '.'); + else + trace_seq_printf(s, "%d", migrate_disable); + } + if 
(lock_depth_exists) { if (lock_depth < 0) trace_seq_putc(s, '.');

From aaf05c725bed1c7a8c940d9215662c78bea05dfd Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 9 Jan 2012 15:58:09 -0500
Subject: [PATCH 263/612] tools lib traceevent: Fix %pM print format arg handling

When %pM is used, the arg value must be a 6 byte character array that will be printed as a 6 byte MAC address. But the code breaks out before the main code updates the current processing arg to point to the next arg. If there are other print arguments after a %pM, they will be off by one: the next specifier will still be handed the %pM arg.

Reported-by: Johannes Berg Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/n/tip-q3g0n1espikynsdkpbi6ue6t@git.kernel.org Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 872dd35db051..da06c33dcf41 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3858,6 +3858,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
 		} else if (*(ptr+1) == 'M' || *(ptr+1) == 'm') {
 			print_mac_arg(s, *(ptr+1), data, size, event, arg);
 			ptr++;
+			arg = arg->next;
 			break;
 		}

From c5b35b731965d16fa8c966e288489857097e0b25 Mon Sep 17 00:00:00 2001
From: Wolfgang Mauerer
Date: Thu, 22 Mar 2012 11:18:21 +0100
Subject: [PATCH 264/612] tools lib traceevent: Fix trace_printk for long integers

On 32 bit systems, a conversion of the trace_printk format string "%lu" -> "%llu" is intended (similar for %lx etc.) when a trace was taken on a machine with 64 bit long integers. However, the current code computes the bogus transformation "%lu" -> "%u". Fix this. Besides that, the transformation is only required on systems that don't use 64 bits for long integers natively.

Signed-off-by: Wolfgang Mauerer Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/1332411501-8059-3-git-send-email-wolfgang.mauerer@siemens.com Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index da06c33dcf41..ddee5a8cf135 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3895,14 +3895,15 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
 				break;
 			}
 		}
-		if (pevent->long_size == 8 && ls) {
+		if (pevent->long_size == 8 && ls &&
+		    sizeof(long) != 8) {
 			char *p;

 			ls = 2;
 			/* make %l into %ll */
 			p = strchr(format, 'l');
 			if (p)
-				memmove(p, p+1, strlen(p)+1);
+				memmove(p+1, p, strlen(p)+1);
 			else if (strcmp(format, "%p") == 0)
 				strcpy(format, "0x%llx");
 		}

From 0fc45ef5202a34b5862ca246740e6ab50bc3e3e1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 9 Apr 2012 11:54:29 +0900
Subject: [PATCH 265/612] tools lib traceevent: Fix printk_cmp()

The printk_cmp function should use printk_map instead of func_map. Also rename the variables for consistency.
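For reference, the comparator being fixed is the standard qsort()/bsearch() idiom, where the comparator must interpret its arguments as the element type of the array actually being searched. A minimal self-contained sketch with a printk_map-like struct (the sample addresses are made up):

#include <stdio.h>
#include <stdlib.h>

struct printk_map {
	unsigned long long addr;
	const char *printk;
};

static int printk_cmp(const void *a, const void *b)
{
	const struct printk_map *pa = a;
	const struct printk_map *pb = b;

	if (pa->addr < pb->addr)
		return -1;
	if (pa->addr > pb->addr)
		return 1;
	return 0;
}

int main(void)
{
	struct printk_map map[] = {
		{ 0xc0300000ULL, "fmt C" },
		{ 0xc0100000ULL, "fmt A" },
		{ 0xc0200000ULL, "fmt B" },
	};
	struct printk_map key = { 0xc0200000ULL, NULL };
	struct printk_map *found;

	qsort(map, 3, sizeof(map[0]), printk_cmp);
	found = bsearch(&key, map, 3, sizeof(map[0]), printk_cmp);
	printf("%s\n", found ? found->printk : "(not found)");
	return 0;
}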
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1333940074-19052-3-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index ddee5a8cf135..7815b8d2eabd 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -511,12 +511,12 @@ struct printk_list { static int printk_cmp(const void *a, const void *b) { - const struct func_map *fa = a; - const struct func_map *fb = b; + const struct printk_map *pa = a; + const struct printk_map *pb = b; - if (fa->addr < fb->addr) + if (pa->addr < pb->addr) return -1; - if (fa->addr > fb->addr) + if (pa->addr > pb->addr) return 1; return 0; From deba3fb26fd1ed3235b00dccced9784a7f76ec3c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Apr 2012 11:54:30 +0900 Subject: [PATCH 266/612] tools lib traceevent: Introduce extend_token() The __read_token() function has some duplicated code to handle internal buffer overflow. Factor them out to new extend_token(). According to the man pages of realloc/free(3), they can handle NULL pointer input so that it can be ended up to compact the code. Also handle error path correctly. Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1333940074-19052-4-git-send-email-namhyung.kim@lge.com [rostedt@goodmis.org: added some extra whitespace] Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 54 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 7815b8d2eabd..768fab5fbcb7 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -781,6 +781,25 @@ int pevent_peek_char(void) return __peek_char(); } +static int extend_token(char **tok, char *buf, int size) +{ + char *newtok = realloc(*tok, size); + + if (!newtok) { + free(*tok); + *tok = NULL; + return -1; + } + + if (!*tok) + strcpy(newtok, buf); + else + strcat(newtok, buf); + *tok = newtok; + + return 0; +} + static enum event_type force_token(const char *str, char **tok); static enum event_type __read_token(char **tok) @@ -865,17 +884,10 @@ static enum event_type __read_token(char **tok) do { if (i == (BUFSIZ - 1)) { buf[i] = 0; - if (*tok) { - *tok = realloc(*tok, tok_size + BUFSIZ); - if (!*tok) - return EVENT_NONE; - strcat(*tok, buf); - } else - *tok = strdup(buf); - - if (!*tok) - return EVENT_NONE; tok_size += BUFSIZ; + + if (extend_token(tok, buf, tok_size) < 0) + return EVENT_NONE; i = 0; } last_ch = ch; @@ -914,17 +926,10 @@ static enum event_type __read_token(char **tok) while (get_type(__peek_char()) == type) { if (i == (BUFSIZ - 1)) { buf[i] = 0; - if (*tok) { - *tok = realloc(*tok, tok_size + BUFSIZ); - if (!*tok) - return EVENT_NONE; - strcat(*tok, buf); - } else - *tok = strdup(buf); - - if (!*tok) - return EVENT_NONE; tok_size += BUFSIZ; + + if (extend_token(tok, buf, tok_size) < 0) + return EVENT_NONE; i = 0; } ch = __read_char(); @@ -933,14 +938,7 @@ static enum event_type __read_token(char **tok) out: buf[i] = 0; - if (*tok) { - *tok = realloc(*tok, tok_size + i); - if 
(!*tok) - return EVENT_NONE; - strcat(*tok, buf); - } else - *tok = strdup(buf); - if (!*tok) + if (extend_token(tok, buf, tok_size + i + 1) < 0) return EVENT_NONE; if (type == EVENT_ITEM) { From ca63858e9eb0ce495031c4ab5291874835cb43cf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Apr 2012 11:54:31 +0900 Subject: [PATCH 267/612] tools lib traceevent: Handle strdup failure cases There were some places didn't check return value of the strdup and had unneeded/duplicated checks. Fix it. Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1333940074-19052-5-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 768fab5fbcb7..cd334d52d333 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -467,8 +467,10 @@ int pevent_register_function(struct pevent *pevent, char *func, item->mod = NULL; item->addr = addr; - pevent->funclist = item; + if (!item->func || (mod && !item->mod)) + die("malloc func"); + pevent->funclist = item; pevent->func_count++; return 0; @@ -583,10 +585,13 @@ int pevent_register_print_string(struct pevent *pevent, char *fmt, item = malloc_or_die(sizeof(*item)); item->next = pevent->printklist; - pevent->printklist = item; item->printk = strdup(fmt); item->addr = addr; + if (!item->printk) + die("malloc fmt"); + + pevent->printklist = item; pevent->printk_count++; return 0; @@ -2150,6 +2155,8 @@ process_fields(struct event_format *event, struct print_flag_sym **list, char ** if (value == NULL) goto out_free; field->value = strdup(value); + if (field->value == NULL) + goto out_free; free_arg(arg); arg = alloc_arg(); @@ -2163,6 +2170,8 @@ process_fields(struct event_format *event, struct print_flag_sym **list, char ** if (value == NULL) goto out_free; field->str = strdup(value); + if (field->str == NULL) + goto out_free; free_arg(arg); arg = NULL; @@ -3433,6 +3442,9 @@ process_defined_func(struct trace_seq *s, void *data, int size, string = malloc_or_die(sizeof(*string)); string->next = strings; string->str = strdup(str.buffer); + if (!string->str) + die("malloc str"); + strings = string; trace_seq_destroy(&str); break; @@ -3571,6 +3583,8 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc arg->next = NULL; arg->type = PRINT_BSTRING; arg->string.string = strdup(bptr); + if (!arg->string.string) + break; bptr += strlen(bptr) + 1; *next = arg; next = &arg->next; @@ -4666,6 +4680,8 @@ int pevent_parse_event(struct pevent *pevent, die("failed to read event id"); event->system = strdup(sys); + if (!event->system) + die("failed to allocate system"); /* Add pevent to event so that it can be referenced */ event->pevent = pevent; @@ -4707,6 +4723,10 @@ int pevent_parse_event(struct pevent *pevent, list = &arg->next; arg->type = PRINT_FIELD; arg->field.name = strdup(field->name); + if (!arg->field.name) { + do_warning("failed to allocate field name"); + goto event_failed; + } arg->field.field = field; } return 0; @@ -5050,6 +5070,11 @@ int pevent_register_event_handler(struct pevent *pevent, if (sys_name) handle->sys_name = strdup(sys_name); + if ((event_name && !handle->event_name) || + (sys_name && !handle->sys_name)) { + die("Failed to allocate 
event/sys name"); + } + handle->func = func; handle->next = pevent->handlers; pevent->handlers = handle; From d286447f23cdb0337a5429e10b39761f6b1d5c18 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Apr 2012 11:54:33 +0900 Subject: [PATCH 268/612] tools lib traceevent: Handle realloc() failure path The realloc can fail so that we should handle it properly. Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1333940074-19052-7-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 76 +++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index cd334d52d333..05eb6b40b16a 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1264,9 +1264,15 @@ static int event_read_fields(struct event_format *event, struct format_field **f field->flags |= FIELD_IS_POINTER; if (field->type) { - field->type = realloc(field->type, - strlen(field->type) + - strlen(last_token) + 2); + char *new_type; + new_type = realloc(field->type, + strlen(field->type) + + strlen(last_token) + 2); + if (!new_type) { + free(last_token); + goto fail; + } + field->type = new_type; strcat(field->type, " "); strcat(field->type, last_token); free(last_token); @@ -1291,6 +1297,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f if (strcmp(token, "[") == 0) { enum event_type last_type = type; char *brackets = token; + char *new_brackets; int len; field->flags |= FIELD_IS_ARRAY; @@ -1310,9 +1317,14 @@ static int event_read_fields(struct event_format *event, struct format_field **f len = 1; last_type = type; - brackets = realloc(brackets, - strlen(brackets) + - strlen(token) + len); + new_brackets = realloc(brackets, + strlen(brackets) + + strlen(token) + len); + if (!new_brackets) { + free(brackets); + goto fail; + } + brackets = new_brackets; if (len == 2) strcat(brackets, " "); strcat(brackets, token); @@ -1328,7 +1340,12 @@ static int event_read_fields(struct event_format *event, struct format_field **f free_token(token); - brackets = realloc(brackets, strlen(brackets) + 2); + new_brackets = realloc(brackets, strlen(brackets) + 2); + if (!new_brackets) { + free(brackets); + goto fail; + } + brackets = new_brackets; strcat(brackets, "]"); /* add brackets to type */ @@ -1339,10 +1356,16 @@ static int event_read_fields(struct event_format *event, struct format_field **f * the format: type [] item; */ if (type == EVENT_ITEM) { - field->type = realloc(field->type, - strlen(field->type) + - strlen(field->name) + - strlen(brackets) + 2); + char *new_type; + new_type = realloc(field->type, + strlen(field->type) + + strlen(field->name) + + strlen(brackets) + 2); + if (!new_type) { + free(brackets); + goto fail; + } + field->type = new_type; strcat(field->type, " "); strcat(field->type, field->name); free_token(field->name); @@ -1350,9 +1373,15 @@ static int event_read_fields(struct event_format *event, struct format_field **f field->name = token; type = read_token(&token); } else { - field->type = realloc(field->type, - strlen(field->type) + - strlen(brackets) + 1); + char *new_type; + new_type = realloc(field->type, + strlen(field->type) + + strlen(brackets) + 1); + if (!new_type) { + free(brackets); + goto fail; + } + field->type = new_type; strcat(field->type, 
brackets); } free(brackets);

@@ -1735,10 +1764,16 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok)
 	/* could just be a type pointer */
 	if ((strcmp(arg->op.op, "*") == 0) &&
 	    type == EVENT_DELIM && (strcmp(token, ")") == 0)) {
+		char *new_atom;
+
 		if (left->type != PRINT_ATOM)
 			die("bad pointer type");
-		left->atom.atom = realloc(left->atom.atom,
+		new_atom = realloc(left->atom.atom,
 				   strlen(left->atom.atom) + 3);
+		if (!new_atom)
+			goto out_free;
+
+		left->atom.atom = new_atom;
 		strcat(left->atom.atom, " *");
 		free(arg->op.op);
 		*arg = *left;
@@ -2597,7 +2632,16 @@ process_arg_token(struct event_format *event, struct print_arg *arg,
 		}
 		/* atoms can be more than one token long */
 		while (type == EVENT_ITEM) {
-			atom = realloc(atom, strlen(atom) + strlen(token) + 2);
+			char *new_atom;
+			new_atom = realloc(atom,
+					   strlen(atom) + strlen(token) + 2);
+			if (!new_atom) {
+				free(atom);
+				*tok = NULL;
+				free_token(token);
+				return EVENT_ERROR;
+			}
+			atom = new_atom;
 			strcat(atom, " ");
 			strcat(atom, token);
 			free_token(token);

From 3831a42deba59bf26618b0dd6fa1320a0a94ebd9 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 23 Apr 2012 13:58:33 +0900
Subject: [PATCH 269/612] tools lib traceevent: Pass string type argument to args

It seems PEVENT_FUNC_ARG_STRING missed passing the allocated string to the args array. Fix it.

Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1335157118-14658-7-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 05eb6b40b16a..337ca02fb525 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3489,6 +3489,7 @@ process_defined_func(struct trace_seq *s, void *data, int size,
 		if (!string->str)
 			die("malloc str");

+		args[i] = (unsigned long long)string->str;
 		strings = string;
 		trace_seq_destroy(&str);
 		break;

From 4b5632bc3122ec9b7e875294c8abe92f1573196a Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 23 Apr 2012 13:58:34 +0900
Subject: [PATCH 270/612] tools lib traceevent: Do not call add_event() again if allocation failed

When memory allocation for the field name fails, do not goto event_failed since the event has already been added.
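The principle behind the fix can be sketched generically: once an object has been published to a shared registry, a later allocation failure must flag the object as failed rather than unwind as if it had never been added. A minimal sketch with hypothetical names (not the traceevent API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FL_FAILED 0x1

struct event {
	char *name;
	unsigned int flags;
};

static struct event *registry[8];
static int nr_events;

static int parse_event(const char *name)
{
	struct event *ev = calloc(1, sizeof(*ev));

	if (!ev)
		return -1;

	registry[nr_events++] = ev;	/* published: now visible to others */

	ev->name = strdup(name);
	if (!ev->name) {
		/* too late to un-publish safely; mark it instead */
		ev->flags |= FL_FAILED;
		return -1;
	}
	return 0;
}

int main(void)
{
	parse_event("sched_switch");
	printf("%d event(s), first one %s\n", nr_events,
	       registry[0]->flags & FL_FAILED ? "failed" : "ok");
	return 0;
}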
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1335157118-14658-8-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 337ca02fb525..99b0cd4b79d4 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -4770,7 +4770,8 @@ int pevent_parse_event(struct pevent *pevent,
 		arg->field.name = strdup(field->name);
 		if (!arg->field.name) {
 			do_warning("failed to allocate field name");
-			goto event_failed;
+			event->flags |= EVENT_FL_FAILED;
+			return -1;
 		}
 		arg->field.field = field;
 	}

From 16e6b8fdfd181ac79f04ff826c42c7d8a2806a17 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 23 Apr 2012 13:58:35 +0900
Subject: [PATCH 271/612] tools lib traceevent: Fix some comments

Update and add missing argument descriptions and fix some typos in function comments.

Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1335157118-14658-9-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 99b0cd4b79d4..863016040dd6 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -4037,8 +4037,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
  * pevent_data_lat_fmt - parse the data for the latency format
  * @pevent: a handle to the pevent
  * @s: the trace_seq to write to
- * @data: the raw data to read from
- * @size: currently unused.
+ * @record: the record to read from
  *
  * This parses out the Latency format (interrupts disabled,
  * need rescheduling, in hard/soft interrupt, preempt count
@@ -4173,10 +4172,7 @@ const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid)
  * pevent_data_comm_from_pid - parse the data into the print format
  * @s: the trace_seq to write to
  * @event: the handle to the event
- * @cpu: the cpu the event was recorded on
- * @data: the raw data
- * @size: the size of the raw data
- * @nsecs: the timestamp of the event
+ * @record: the record to read from
  *
  * This parses the raw @data using the given @event information and
  * writes the print format into the trace_seq.
@@ -4944,7 +4940,7 @@ int pevent_get_any_field_val(struct trace_seq *s, struct event_format *event,
  * @record: The record with the field name.
  * @err: print default error if failed.
  *
- * Returns: 0 on success, -1 field not fould, or 1 if buffer is full.
+ * Returns: 0 on success, -1 field not found, or 1 if buffer is full.
*/ int pevent_print_num_field(struct trace_seq *s, const char *fmt, struct event_format *event, const char *name, @@ -4986,11 +4982,12 @@ static void free_func_handle(struct pevent_function_handler *func) * pevent_register_print_function - register a helper function * @pevent: the handle to the pevent * @func: the function to process the helper function + * @ret_type: the return type of the helper function * @name: the name of the helper function * @parameters: A list of enum pevent_func_arg_type * * Some events may have helper functions in the print format arguments. - * This allows a plugin to dynmically create a way to process one + * This allows a plugin to dynamically create a way to process one * of these functions. * * The @parameters is a variable list of pevent_func_arg_type enums that @@ -5061,12 +5058,13 @@ int pevent_register_print_function(struct pevent *pevent, } /** - * pevent_register_event_handle - register a way to parse an event + * pevent_register_event_handler - register a way to parse an event * @pevent: the handle to the pevent * @id: the id of the event to register * @sys_name: the system name the event belongs to * @event_name: the name of the event * @func: the function to call to parse the event information + * @context: the data to be passed to @func * * This function allows a developer to override the parsing of * a given event. If for some reason the default print format From e54b34aed1c4082ed03f4d1f7a19276059b1e30a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 23 Apr 2012 13:58:36 +0900 Subject: [PATCH 272/612] tools lib traceevent: Check result of malloc() during reading token The malloc can fail so the return value should be checked. For now, just use malloc_or_die(). Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1335157118-14658-10-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/parse-filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 80d872a81f26..d54c2b4dbd9f 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -96,7 +96,7 @@ static enum event_type read_token(char **tok) (strcmp(token, "=") == 0 || strcmp(token, "!") == 0) && pevent_peek_char() == '~') { /* append it */ - *tok = malloc(3); + *tok = malloc_or_die(3); sprintf(*tok, "%c%c", *token, '~'); free_token(token); /* Now remove the '~' from the buffer */ From 0fed48341529716c38493be66591bda458921b75 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 23 Apr 2012 13:58:38 +0900 Subject: [PATCH 273/612] tools lib traceevent: Check return value of arg_to_str() The arg_to_str() can fail so we should handle that case properly. 
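The fix follows a common C cleanup idiom: propagate NULL from either operand, and rely on free(NULL) being defined as a no-op so a single exit label covers every path. A minimal stand-alone sketch (the join() helper is hypothetical, not the library code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *join(const char *l, const char *op, const char *r)
{
	char *lstr = l ? strdup(l) : NULL;
	char *rstr = r ? strdup(r) : NULL;
	char *str = NULL;
	size_t len;

	if (!lstr || !rstr)
		goto out;	/* propagate failure as NULL */

	len = strlen(lstr) + strlen(op) + strlen(rstr) + 3;
	str = malloc(len);
	if (str)
		snprintf(str, len, "%s %s %s", lstr, op, rstr);
out:
	free(lstr);	/* free(NULL) is a no-op */
	free(rstr);
	return str;
}

int main(void)
{
	char *s = join("pid", "==", "42");

	printf("%s\n", s ? s : "(failed)");
	free(s);
	return 0;
}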
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David Ahern Link: http://lkml.kernel.org/r/1335157118-14658-12-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/parse-filter.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
index d54c2b4dbd9f..f020ac61f9c1 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -2026,11 +2026,13 @@ static char *exp_to_str(struct event_filter *filter, struct filter_arg *arg)
 	char *lstr;
 	char *rstr;
 	char *op;
-	char *str;
+	char *str = NULL;
 	int len;

 	lstr = arg_to_str(filter, arg->exp.left);
 	rstr = arg_to_str(filter, arg->exp.right);
+	if (!lstr || !rstr)
+		goto out;

 	switch (arg->exp.type) {
 	case FILTER_EXP_ADD:
@@ -2070,6 +2072,7 @@ static char *exp_to_str(struct event_filter *filter, struct filter_arg *arg)
 	len = strlen(op) + strlen(lstr) + strlen(rstr) + 4;
 	str = malloc_or_die(len);
 	snprintf(str, len, "%s %s %s", lstr, op, rstr);
+out:
 	free(lstr);
 	free(rstr);
@@ -2086,6 +2089,8 @@ static char *num_to_str(struct event_filter *filter, struct filter_arg *arg)
 	lstr = arg_to_str(filter, arg->num.left);
 	rstr = arg_to_str(filter, arg->num.right);
+	if (!lstr || !rstr)
+		goto out;

 	switch (arg->num.type) {
 	case FILTER_CMP_EQ:
@@ -2122,6 +2127,7 @@ static char *num_to_str(struct event_filter *filter, struct filter_arg *arg)
 		break;
 	}

+out:
 	free(lstr);
 	free(rstr);
 	return str;
@@ -2272,7 +2278,12 @@ int pevent_filter_compare(struct event_filter *filter1, struct event_filter *fil
 	/* The best way to compare complex filters is with strings */
 	str1 = arg_to_str(filter1, filter_type1->filter);
 	str2 = arg_to_str(filter2, filter_type2->filter);
-	result = strcmp(str1, str2) != 0;
+	if (str1 && str2)
+		result = strcmp(str1, str2) != 0;
+	else
+		/* bail out if allocation fails */
+		result = 1;
+
 	free(str1);
 	free(str2);
 	if (result)

From c9bbabe32a231b5caa8c42fa6783405346983c59 Mon Sep 17 00:00:00 2001
From: Peter Huewe
Date: Tue, 24 Apr 2012 23:19:40 +0200
Subject: [PATCH 274/612] tools lib traceevent: Add missing break in make_bprint_args

In the current code we assign vsize=8 and then fall through to the default and assign vsize = ls. -> probably the break is missing here, otherwise we can remove the case.

Signed-off-by: Peter Huewe Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/n/tip-3fxjy46h2tr9pl0spv7tems6@git.kernel.org Signed-off-by: Namhyung Kim
---
 tools/lib/traceevent/event-parse.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 863016040dd6..f880db457842 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3594,6 +3594,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc
 			break;
 		case 2:
 			vsize = 8;
+			break;
 		default:
 			vsize = ls;	/* ? */
 			break;

From f6ced60fb6c0ee115982157457c47e48802d6e1d Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 24 Apr 2012 10:29:44 +0900
Subject: [PATCH 275/612] tools lib traceevent: Cleanup realloc use

The if branch is completely unnecessary since 'realloc' can handle NULL pointers for the first parameter. This patch is just an adaptation of Ulrich Drepper's recent patch on perf tools.
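The cleanup rests on a guarantee from the C standard: realloc(NULL, size) behaves exactly like malloc(size), so a grow-on-demand buffer needs no first-allocation special case. A minimal sketch; the temporary pointer keeps the old block reachable when growth fails:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int *vals = NULL;	/* realloc(NULL, n) == malloc(n) */
	size_t n = 0;
	int i;

	for (i = 0; i < 10; i++) {
		int *tmp = realloc(vals, (n + 1) * sizeof(*vals));

		if (!tmp) {
			free(vals);	/* the old block is still valid */
			return 1;
		}
		vals = tmp;
		vals[n++] = i * i;
	}
	printf("last value: %d\n", vals[n - 1]);
	free(vals);
	return 0;
}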
Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1335230984-7613-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Steven Rostedt Signed-off-by: Namhyung Kim --- tools/lib/traceevent/event-parse.c | 8 ++------ tools/lib/traceevent/parse-filter.c | 24 ++++++++---------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index f880db457842..5f34aa371b56 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -633,12 +633,8 @@ static void add_event(struct pevent *pevent, struct event_format *event) { int i; - if (!pevent->events) - pevent->events = malloc_or_die(sizeof(event)); - else - pevent->events = - realloc(pevent->events, sizeof(event) * - (pevent->nr_events + 1)); + pevent->events = realloc(pevent->events, sizeof(event) * + (pevent->nr_events + 1)); if (!pevent->events) die("Can not allocate events"); diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index f020ac61f9c1..ad17855528f9 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -148,17 +148,11 @@ add_filter_type(struct event_filter *filter, int id) if (filter_type) return filter_type; - if (!filter->filters) - filter->event_filters = - malloc_or_die(sizeof(*filter->event_filters)); - else { - filter->event_filters = - realloc(filter->event_filters, - sizeof(*filter->event_filters) * - (filter->filters + 1)); - if (!filter->event_filters) - die("Could not allocate filter"); - } + filter->event_filters = realloc(filter->event_filters, + sizeof(*filter->event_filters) * + (filter->filters + 1)); + if (!filter->event_filters) + die("Could not allocate filter"); for (i = 0; i < filter->filters; i++) { if (filter->event_filters[i].event_id > id) @@ -1480,7 +1474,7 @@ void pevent_filter_clear_trivial(struct event_filter *filter, { struct filter_type *filter_type; int count = 0; - int *ids; + int *ids = NULL; int i; if (!filter->filters) @@ -1504,10 +1498,8 @@ void pevent_filter_clear_trivial(struct event_filter *filter, default: break; } - if (count) - ids = realloc(ids, sizeof(*ids) * (count + 1)); - else - ids = malloc(sizeof(*ids)); + + ids = realloc(ids, sizeof(*ids) * (count + 1)); if (!ids) die("Can't allocate ids"); ids[count++] = filter_type->event_id; From d50394266b340d930a7458fa669d36e99670f200 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Sun, 1 Jul 2012 11:31:35 +0300 Subject: [PATCH 276/612] remoteproc/omap: fix randconfig unmet direct dependencies OMAP_REMOTEPROC selects REMOTEPROC and RPMSG, both of which depend on EXPERIMENTAL, so let's have OMAP_REMOTEPROC depend on EXPERIMENTAL too, in order to avoid the below randconfig warnings. 
warning: (OMAP_REMOTEPROC) selects REMOTEPROC which has unmet direct dependencies (EXPERIMENTAL)
warning: (OMAP_REMOTEPROC) selects RPMSG which has unmet direct dependencies (EXPERIMENTAL)

Cc: stable Reported-by: Tony Lindgren Signed-off-by: Ohad Ben-Cohen
---
 drivers/remoteproc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 24d880e78ec6..db4e39b9164a 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -7,6 +7,7 @@ config REMOTEPROC

 config OMAP_REMOTEPROC
 	tristate "OMAP remoteproc support"
+	depends on EXPERIMENTAL
 	depends on ARCH_OMAP4
 	depends on OMAP_IOMMU
 	select REMOTEPROC

From e121aefa7d9f10eee5cf26ed47129237a05d940b Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen
Date: Sun, 1 Jul 2012 11:53:36 +0300
Subject: [PATCH 277/612] remoteproc: fix missing CONFIG_FW_LOADER configurations

Remoteproc requires user space firmware loading support, so let's select FW_LOADER explicitly to avoid painful misconfigurations (which only show up at runtime).

Cc: stable Reported-by: Mark Grosen Signed-off-by: Ohad Ben-Cohen
---
 drivers/remoteproc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index db4e39b9164a..f8d818abf98c 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -4,6 +4,7 @@ menu "Remoteproc drivers (EXPERIMENTAL)"
 config REMOTEPROC
 	tristate
 	depends on EXPERIMENTAL
+	select FW_LOADER

 config OMAP_REMOTEPROC
 	tristate "OMAP remoteproc support"

From 5a081caa0414b9bbb82c17ffab9d6fe66edbb72f Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen
Date: Wed, 6 Jun 2012 10:09:25 +0300
Subject: [PATCH 278/612] rpmsg: avoid premature deallocation of endpoints

When an inbound message arrives, the rpmsg core looks up its associated endpoint and invokes the registered callback. If a message arrives while its endpoint is being removed (because the rpmsg driver was removed, or a recovery of a remote processor has kicked in) we must ensure atomicity, i.e.:

- Either the ept is removed before it is found, or
- The ept is found but will not be freed until the callback returns

This is achieved by maintaining a per-ept reference count, which, when it drops to zero, will trigger deallocation of the ept. With this in hand, it is now forbidden to directly deallocate epts once they have been added to the endpoints idr.

Cc: stable Reported-by: Fernando Guzman Lugo Signed-off-by: Ohad Ben-Cohen
---
 drivers/rpmsg/virtio_rpmsg_bus.c | 36 ++++++++++++++++++++++++++++++--
 include/linux/rpmsg.h | 3 +++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 75506ec2840e..9623327ba509 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -188,6 +188,26 @@ static int rpmsg_uevent(struct device *dev, struct kobj_uevent_env *env)
 					rpdev->id.name);
 }

+/**
+ * __ept_release() - deallocate an rpmsg endpoint
+ * @kref: the ept's reference count
+ *
+ * This function deallocates an ept, and is invoked when its @kref refcount
+ * drops to zero.
+ *
+ * Never invoke this function directly!
+ */ +static void __ept_release(struct kref *kref) +{ + struct rpmsg_endpoint *ept = container_of(kref, struct rpmsg_endpoint, + refcount); + /* + * At this point no one holds a reference to ept anymore, + * so we can directly free it + */ + kfree(ept); +} + /* for more info, see below documentation of rpmsg_create_ept() */ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp, struct rpmsg_channel *rpdev, rpmsg_rx_cb_t cb, @@ -206,6 +226,8 @@ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp, return NULL; } + kref_init(&ept->refcount); + ept->rpdev = rpdev; ept->cb = cb; ept->priv = priv; @@ -238,7 +260,7 @@ rem_idr: idr_remove(&vrp->endpoints, request); free_ept: mutex_unlock(&vrp->endpoints_lock); - kfree(ept); + kref_put(&ept->refcount, __ept_release); return NULL; } @@ -306,7 +328,7 @@ __rpmsg_destroy_ept(struct virtproc_info *vrp, struct rpmsg_endpoint *ept) idr_remove(&vrp->endpoints, ept->addr); mutex_unlock(&vrp->endpoints_lock); - kfree(ept); + kref_put(&ept->refcount, __ept_release); } /** @@ -790,7 +812,13 @@ static void rpmsg_recv_done(struct virtqueue *rvq) /* use the dst addr to fetch the callback of the appropriate user */ mutex_lock(&vrp->endpoints_lock); + ept = idr_find(&vrp->endpoints, msg->dst); + + /* let's make sure no one deallocates ept while we use it */ + if (ept) + kref_get(&ept->refcount); + mutex_unlock(&vrp->endpoints_lock); if (ept && ept->cb) @@ -798,6 +826,10 @@ static void rpmsg_recv_done(struct virtqueue *rvq) else dev_warn(dev, "msg received with no recepient\n"); + /* farewell, ept, we don't need you anymore */ + if (ept) + kref_put(&ept->refcount, __ept_release); + /* publish the real size of the buffer */ sg_init_one(&sg, msg, RPMSG_BUF_SIZE); diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index a8e50e44203c..195f373590b8 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -38,6 +38,7 @@ #include #include #include +#include /* The feature bitmap for virtio rpmsg */ #define VIRTIO_RPMSG_F_NS 0 /* RP supports name service notifications */ @@ -120,6 +121,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32); /** * struct rpmsg_endpoint - binds a local rpmsg address to its user * @rpdev: rpmsg channel device + * @refcount: when this drops to zero, the ept is deallocated * @cb: rx callback handler * @addr: local rpmsg address * @priv: private data for the driver's use @@ -140,6 +142,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32); */ struct rpmsg_endpoint { struct rpmsg_channel *rpdev; + struct kref refcount; rpmsg_rx_cb_t cb; u32 addr; void *priv; From 15fd943af50dbc5f7f4de33835795c72595f7bf4 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 7 Jun 2012 15:39:35 +0300 Subject: [PATCH 279/612] rpmsg: make sure inflight messages don't invoke just-removed callbacks When inbound messages arrive, rpmsg core looks up their associated endpoint (by destination address) and then invokes their callback. We've made sure that endpoints will never be de-allocated after they were found by rpmsg core, but we also need to protect against the (rare) scenario where the rpmsg driver was just removed, and its callback function isn't available anymore. This is achieved by introducing a callback mutex, which must be taken before the callback is invoked, and, obviously, before it is removed. 
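The cb_lock idea can be shown in miniature: the callback pointer is only read or cleared while holding a mutex, so teardown synchronizes with any in-flight invocation. A minimal user-space sketch with pthreads (hypothetical endpoint type, not the rpmsg API):

#include <pthread.h>
#include <stdio.h>

struct endpoint {
	pthread_mutex_t cb_lock;
	void (*cb)(const char *msg);
};

static void deliver(struct endpoint *ept, const char *msg)
{
	pthread_mutex_lock(&ept->cb_lock);
	if (ept->cb)	/* may have been torn down meanwhile */
		ept->cb(msg);
	pthread_mutex_unlock(&ept->cb_lock);
}

static void teardown(struct endpoint *ept)
{
	/* blocks until any running callback has returned */
	pthread_mutex_lock(&ept->cb_lock);
	ept->cb = NULL;
	pthread_mutex_unlock(&ept->cb_lock);
}

static void print_cb(const char *msg)
{
	printf("got: %s\n", msg);
}

int main(void)
{
	struct endpoint ept = { PTHREAD_MUTEX_INITIALIZER, print_cb };

	deliver(&ept, "hello");
	teardown(&ept);
	deliver(&ept, "dropped");	/* silently ignored after teardown */
	return 0;
}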
Cc: stable Reported-by: Fernando Guzman Lugo Signed-off-by: Ohad Ben-Cohen
---
 drivers/rpmsg/virtio_rpmsg_bus.c | 25 +++++++++++++++++++------
 include/linux/rpmsg.h | 3 +++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 9623327ba509..39d3aa41adda 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -227,6 +227,7 @@ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp,
 	}

 	kref_init(&ept->refcount);
+	mutex_init(&ept->cb_lock);

 	ept->rpdev = rpdev;
 	ept->cb = cb;
@@ -324,10 +325,16 @@ EXPORT_SYMBOL(rpmsg_create_ept);
 static void
 __rpmsg_destroy_ept(struct virtproc_info *vrp, struct rpmsg_endpoint *ept)
 {
+	/* make sure new inbound messages can't find this ept anymore */
 	mutex_lock(&vrp->endpoints_lock);
 	idr_remove(&vrp->endpoints, ept->addr);
 	mutex_unlock(&vrp->endpoints_lock);

+	/* make sure in-flight inbound messages won't invoke cb anymore */
+	mutex_lock(&ept->cb_lock);
+	ept->cb = NULL;
+	mutex_unlock(&ept->cb_lock);
+
 	kref_put(&ept->refcount, __ept_release);
 }
@@ -821,14 +828,20 @@ static void rpmsg_recv_done(struct virtqueue *rvq)

 	mutex_unlock(&vrp->endpoints_lock);

-	if (ept && ept->cb)
-		ept->cb(ept->rpdev, msg->data, msg->len, ept->priv, msg->src);
-	else
-		dev_warn(dev, "msg received with no recepient\n");
+	if (ept) {
+		/* make sure ept->cb doesn't go away while we use it */
+		mutex_lock(&ept->cb_lock);

-	/* farewell, ept, we don't need you anymore */
-	if (ept)
+		if (ept->cb)
+			ept->cb(ept->rpdev, msg->data, msg->len, ept->priv,
+				msg->src);
+
+		mutex_unlock(&ept->cb_lock);
+
+		/* farewell, ept, we don't need you anymore */
 		kref_put(&ept->refcount, __ept_release);
+	} else
+		dev_warn(dev, "msg received with no recepient\n");

 	/* publish the real size of the buffer */
 	sg_init_one(&sg, msg, RPMSG_BUF_SIZE);

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 195f373590b8..82a673905edb 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include <linux/mutex.h>

 /* The feature bitmap for virtio rpmsg */
 #define VIRTIO_RPMSG_F_NS	0 /* RP supports name service notifications */
@@ -123,6 +124,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32);
  * @rpdev: rpmsg channel device
  * @refcount: when this drops to zero, the ept is deallocated
  * @cb: rx callback handler
+ * @cb_lock: must be taken before accessing/changing @cb
  * @addr: local rpmsg address
  * @priv: private data for the driver's use
  *
@@ -144,6 +146,7 @@ struct rpmsg_endpoint {
 	struct rpmsg_channel *rpdev;
 	struct kref refcount;
 	rpmsg_rx_cb_t cb;
+	struct mutex cb_lock;
 	u32 addr;
 	void *priv;
 };

From 0c47935c5b5cd4916cf1c1ed4a2894807f7bcc3e Mon Sep 17 00:00:00 2001
From: Daniel Nicoletti
Date: Wed, 4 Jul 2012 10:20:31 -0300
Subject: [PATCH 280/612] HID: add battery quirk for Apple Wireless ANSI

Add USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI to the quirk list, since it reports a wrong feature type and a wrong percentage range.
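For illustration, such quirk lists are usually sentinel-terminated ID tables scanned when a device is probed. A minimal sketch; the flag names and product IDs are made up, only the 0x05ac Apple vendor ID is real:

#include <stdio.h>

#define QUIRK_PERCENT 0x1
#define QUIRK_FEATURE 0x2

struct battery_quirk {
	unsigned short vendor;
	unsigned short product;
	unsigned int flags;
};

static const struct battery_quirk quirks[] = {
	{ 0x05ac, 0x0255, QUIRK_PERCENT | QUIRK_FEATURE },
	{ 0x05ac, 0x0239, QUIRK_PERCENT | QUIRK_FEATURE },
	{ 0, 0, 0 }	/* sentinel terminates the table */
};

static unsigned int find_quirks(unsigned short vendor, unsigned short product)
{
	const struct battery_quirk *q;

	for (q = quirks; q->vendor; q++)
		if (q->vendor == vendor && q->product == product)
			return q->flags;
	return 0;
}

int main(void)
{
	printf("quirk flags: %#x\n", find_quirks(0x05ac, 0x0255));
	return 0;
}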
Signed-off-by: Daniel Nicoletti Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 132b0019365e..5301006f6c15 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -301,6 +301,9 @@ static const struct hid_device_id hid_battery_quirks[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_ANSI), HID_BATTERY_QUIRK_PERCENT | HID_BATTERY_QUIRK_FEATURE }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, + USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI), + HID_BATTERY_QUIRK_PERCENT | HID_BATTERY_QUIRK_FEATURE }, {} }; From ebf124ffab59e9e1c6179347fe95fe5baf32dcd7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 4 Jul 2012 00:00:47 +0200 Subject: [PATCH 281/612] perf test: Use ARRAY_SIZE in parse events tests Use ARRAY_SIZE instead of defining the sizes separately for each test array. Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1341352848-11833-10-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-test.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index dd0c306a0698..1b997d2b89ce 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -634,8 +634,6 @@ static struct test__event_st test__events[] = { }, }; -#define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st)) - static struct test__event_st test__events_pmu[] = { [0] = { .name = "cpu/config=10,config1,config2=3,period=1000/u", @@ -647,9 +645,6 @@ static struct test__event_st test__events_pmu[] = { }, }; -#define TEST__EVENTS_PMU_CNT (sizeof(test__events_pmu) / \ sizeof(struct test__event_st)) - struct test__term { const char *str; __u32 type; @@ -765,21 +760,17 @@ int parse_events__test(void) { int ret; - do { - ret = test_events(test__events, TEST__EVENTS_CNT); - if (ret) - break; +#define TEST_EVENTS(tests) \ do { \ ret = test_events(tests, ARRAY_SIZE(tests)); \ if (ret) \ return ret; \ } while (0) - if (test_pmu()) { - ret = test_events(test__events_pmu, - TEST__EVENTS_PMU_CNT); - if (ret) - break; - } + TEST_EVENTS(test__events); - ret = test_terms(test__terms, TEST__TERMS_CNT); + if (test_pmu()) + TEST_EVENTS(test__events_pmu); - } while (0); - - return ret; + return test_terms(test__terms, ARRAY_SIZE(test__terms)); } From 2e2070c85aa19f6c5bb3642a1429abf42f101e8d Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sun, 27 May 2012 22:55:41 +0200 Subject: [PATCH 282/612] gpio-sta2x11: don't use pdata if null If there is no platform data available, the driver shouldn't use the pointer or it will oops. Since things will mostly work nonetheless (the BIOS may have set up the pins properly), I'd better not fail the probe even in this case.
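(Illustration, not part of the patch: the guarded loop in gsta_probe() ends up looking like the sketch below, which mirrors the hunk that follows.)

	/* gpio_pdata may legitimately be NULL, e.g. when the BIOS has
	 * already configured the pins; only apply pinconfig if present */
	if (gpio_pdata)
		for (i = 0; i < GSTA_NR_GPIO; i++)
			gsta_set_config(chip, i, gpio_pdata->pinconfig[i]);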
Signed-off-by: Alessandro Rubini Acked-by: Giancarlo Asnaghi Signed-off-by: Linus Walleij --- drivers/gpio/gpio-sta2x11.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-sta2x11.c b/drivers/gpio/gpio-sta2x11.c index 38416be8ba11..6064fb376e11 100644 --- a/drivers/gpio/gpio-sta2x11.c +++ b/drivers/gpio/gpio-sta2x11.c @@ -383,8 +383,9 @@ static int __devinit gsta_probe(struct platform_device *dev) } spin_lock_init(&chip->lock); gsta_gpio_setup(chip); - for (i = 0; i < GSTA_NR_GPIO; i++) - gsta_set_config(chip, i, gpio_pdata->pinconfig[i]); + if (gpio_pdata) + for (i = 0; i < GSTA_NR_GPIO; i++) + gsta_set_config(chip, i, gpio_pdata->pinconfig[i]); /* 384 was used in previous code: be compatible for other drivers */ err = irq_alloc_descs(-1, 384, GSTA_NR_GPIO, NUMA_NO_NODE); From afcc0f8c174ff34305146878c40e7dcc72fcbbdf Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Thu, 31 May 2012 13:12:57 +0200 Subject: [PATCH 283/612] gpio/msm_v1: CONFIG_GPIO_MSM_V1 is only available on three SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The feature GPIO_MSM_V1 is only available on three SoCs. On all other MSM SoCs the INT_GPIO_GROUP{1,2} is undeclared, but Kconfig does allow such configurations. Therefore the produced configuration is valid, but does not compile. The problem is fixed by adding the missing Kconfig constraints. drivers/gpio/gpio-msm-v1.c: In function ‘msm_init_gpio’: drivers/gpio/gpio-msm-v1.c:629:26: error: 'INT_GPIO_GROUP1' undeclared drivers/gpio/gpio-msm-v1.c:630:26: error: 'INT_GPIO_GROUP2' undeclared Signed-off-by: Christian Dietrich Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index c4067d0141f7..542f0c04b695 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -136,7 +136,7 @@ config GPIO_MPC8XXX config GPIO_MSM_V1 tristate "Qualcomm MSM GPIO v1" - depends on GPIOLIB && ARCH_MSM + depends on GPIOLIB && ARCH_MSM && (ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50) help Say yes here to support the GPIO interface on ARM v6 based Qualcomm MSM chips. Most of the pins on the MSM can be From 8cd578b6e28693f357867a77598a88ef3deb6b39 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Jun 2012 11:07:56 +0800 Subject: [PATCH 284/612] gpiolib: wm8994: Pay attention to the value set when enabling as output Not paying attention to the value being set is a bad thing because it means that we'll not set the hardware up to reflect what was requested. Not setting the hardware up to reflect what was requested means that the caller won't get the results they wanted. 
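(Illustration, not part of the patch: the contract being restored is the generic gpiolib one.)

	/* after this call the pin must actually drive high; before the fix
	 * the WM8994_GPN_LVL bit was never written, so the level was lost */
	err = gpio_direction_output(gpio, 1);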
Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Signed-off-by: Linus Walleij --- drivers/gpio/gpio-wm8994.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c index 92ea5350dfe9..aa61ad2fcaaa 100644 --- a/drivers/gpio/gpio-wm8994.c +++ b/drivers/gpio/gpio-wm8994.c @@ -89,8 +89,11 @@ static int wm8994_gpio_direction_out(struct gpio_chip *chip, struct wm8994_gpio *wm8994_gpio = to_wm8994_gpio(chip); struct wm8994 *wm8994 = wm8994_gpio->wm8994; + if (value) + value = WM8994_GPN_LVL; + return wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset, - WM8994_GPN_DIR, 0); + WM8994_GPN_DIR | WM8994_GPN_LVL, value); } static void wm8994_gpio_set(struct gpio_chip *chip, unsigned offset, int value) From 3996bfc787a3b3d7c7ad97b7519247f9f2ac4068 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Thu, 7 Jun 2012 17:56:09 -0600 Subject: [PATCH 285/612] gpio: export devm_gpio_request_one Without this, modules can't use this API, leading to build failures. Signed-off-by: Stephen Warren Signed-off-by: Linus Walleij --- drivers/gpio/devres.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c index 9e9947cb86a3..1077754f8289 100644 --- a/drivers/gpio/devres.c +++ b/drivers/gpio/devres.c @@ -98,6 +98,7 @@ int devm_gpio_request_one(struct device *dev, unsigned gpio, return 0; } +EXPORT_SYMBOL(devm_gpio_request_one); /** * devm_gpio_free - free an interrupt From 42b14cb037fefa33a2ff51c4d3915a49c71de3d5 Mon Sep 17 00:00:00 2001 From: Roland Stigge Date: Mon, 18 Jun 2012 11:28:26 +0200 Subject: [PATCH 286/612] mips: pci-lantiq: Fix check for valid gpio This patch fixes two checks for valid gpio numbers, formerly (wrongly) considering zero as invalid, now using gpio_is_valid(). Signed-off-by: Roland Stigge Signed-off-by: Linus Walleij --- arch/mips/pci/pci-lantiq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index ea453532a33c..075d87acd12a 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -129,7 +129,7 @@ static int __devinit ltq_pci_startup(struct platform_device *pdev) /* setup reset gpio used by pci */ reset_gpio = of_get_named_gpio(node, "gpio-reset", 0); - if (reset_gpio > 0) + if (gpio_is_valid(reset_gpio)) devm_gpio_request(&pdev->dev, reset_gpio, "pci-reset"); /* enable auto-switching between PCI and EBU */ @@ -192,7 +192,7 @@ static int __devinit ltq_pci_startup(struct platform_device *pdev) ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_IEN) | 0x10, LTQ_EBU_PCC_IEN); /* toggle reset pin */ - if (reset_gpio > 0) { + if (gpio_is_valid(reset_gpio)) { __gpio_set_value(reset_gpio, 0); wmb(); mdelay(1); From f567fde24640cf6f2d6416196bfc8b3fefc8e433 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 20 Jun 2012 14:14:05 +0530 Subject: [PATCH 287/612] gpio: fix bits conflict for gpio flags Bits 2 and 3 of the GPIO flags are allocated to the flags OPEN_DRAIN/OPEN_SOURCE. The same bits were reused for the flags EXPORT/EXPORT_CHANGEABLE, creating a conflict. Fix this conflict by assigning bits 4 and 5 to the flags EXPORT/EXPORT_CHANGEABLE.
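(For reference, a sketch of the resulting flag layout; bits 0 and 1 are the pre-existing direction and initial-value flags from gpio.h, reconstructed here for illustration.)

	#define GPIOF_DIR_IN		(1 << 0)	/* bit 0: direction */
	#define GPIOF_INIT_HIGH		(1 << 1)	/* bit 1: initial value */
	#define GPIOF_OPEN_DRAIN	(1 << 2)
	#define GPIOF_OPEN_SOURCE	(1 << 3)
	#define GPIOF_EXPORT		(1 << 4)	/* moved off bit 2 */
	#define GPIOF_EXPORT_CHANGEABLE	(1 << 5)	/* moved off bit 3 */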
Signed-off-by: Laxman Dewangan Signed-off-by: Linus Walleij --- include/linux/gpio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f07fc2d08159..2e31e8b3a190 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -22,8 +22,8 @@ /* Gpio pin is open source */ #define GPIOF_OPEN_SOURCE (1 << 3) -#define GPIOF_EXPORT (1 << 2) -#define GPIOF_EXPORT_CHANGEABLE (1 << 3) +#define GPIOF_EXPORT (1 << 4) +#define GPIOF_EXPORT_CHANGEABLE (1 << 5) #define GPIOF_EXPORT_DIR_FIXED (GPIOF_EXPORT) #define GPIOF_EXPORT_DIR_CHANGEABLE (GPIOF_EXPORT | GPIOF_EXPORT_CHANGEABLE) From 33a4e985bab25d6753b52d1322b4e2fff706dd4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 6 Jun 2012 11:49:23 +0200 Subject: [PATCH 288/612] gpio/mxc: make irqs work for fsl,imx21-gpio devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chained handler was set for the platform device with id == 0. When the gpio devices are instantiated by a device tree, all have id == -1, so the handler was never set, resulting in unusable gpio irqs on i.MX21 and i.MX27 (when using oftree). Acked-by: Shawn Guo Cc: Grant Likely Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mxc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index c337143b18f8..c89c4c1e668d 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -398,10 +398,12 @@ static int __devinit mxc_gpio_probe(struct platform_device *pdev) writel(~0, port->base + GPIO_ISR); if (mxc_gpio_hwtype == IMX21_GPIO) { - /* setup one handler for all GPIO interrupts */ - if (pdev->id == 0) - irq_set_chained_handler(port->irq, - mx2_gpio_irq_handler); + /* + * Setup one handler for all GPIO interrupts. Actually setting + * the handler is needed only once, but doing it for every port + * is more robust and easier. + */ + irq_set_chained_handler(port->irq, mx2_gpio_irq_handler); } else { /* setup one handler for each entry */ irq_set_chained_handler(port->irq, mx3_gpio_irq_handler); From 626f9914a371e22f15135f67db6c2aa64adbdacb Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 4 Jul 2012 20:36:45 +0530 Subject: [PATCH 289/612] gpio: tps65910: initialize of_node of gpio_chip Initialize the gpio chip's of_node to the device's node to work with DT-based systems. Signed-off-by: Laxman Dewangan Signed-off-by: Linus Walleij --- drivers/gpio/gpio-tps65910.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c index c1ad2884f2ed..0749f9630869 100644 --- a/drivers/gpio/gpio-tps65910.c +++ b/drivers/gpio/gpio-tps65910.c @@ -149,6 +149,7 @@ static int __devinit tps65910_gpio_probe(struct platform_device *pdev) tps65910_gpio->gpio_chip.set = tps65910_gpio_set; tps65910_gpio->gpio_chip.get = tps65910_gpio_get; tps65910_gpio->gpio_chip.dev = &pdev->dev; + tps65910_gpio->gpio_chip.of_node = tps65910->dev->of_node; if (pdata && pdata->gpio_base) tps65910_gpio->gpio_chip.base = pdata->gpio_base; else From 8c5f0a84c6b16e04195001bfd78281909519cf09 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 4 Jul 2012 00:00:39 +0200 Subject: [PATCH 290/612] perf tools: Add empty rule for new line in event syntax parsing The flex generator prints out each input character that is ignored by lex rules. Since the alias processing was added, we can have '\n' characters on input.
We need to assign an empty rule to it, so it's not printed out. Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1341352848-11833-2-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.l | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index a06689474210..6cbe092e6c3b 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -152,6 +152,7 @@ r{num_raw_hex} { return raw(yyscanner); } , { return ','; } : { return ':'; } = { return '='; } +\n { } { {modifier_bp} { return str(yyscanner, PE_MODIFIER_BP); } From cf3506dcc4a855d11af20bcb271ac921033ba0de Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 4 Jul 2012 00:00:43 +0200 Subject: [PATCH 291/612] perf tools: Split out PE_VALUE_SYM parsing token to SW and HW tokens Splitting the PE_VALUE_SYM token into PE_VALUE_SYM_HW and PE_VALUE_SYM_SW tokens to separate hardware and software symbols. This will be useful in an upcoming patch, where we want to be able to parse out only hardware events. Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1341352848-11833-6-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.l | 2 +- tools/perf/util/parse-events.y | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 6cbe092e6c3b..384ca74c6b22 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -56,7 +56,7 @@ static int sym(yyscan_t scanner, int type, int config) YYSTYPE *yylval = parse_events_get_lval(scanner); yylval->num = (type << 16) + config; - return PE_VALUE_SYM; + return type == PERF_TYPE_HARDWARE ?
PE_VALUE_SYM_HW : PE_VALUE_SYM_SW; } static int term(yyscan_t scanner, int type) diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 9525c455d27f..2bc5fbff2b5d 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -26,14 +26,15 @@ do { \ %} %token PE_START_EVENTS PE_START_TERMS -%token PE_VALUE PE_VALUE_SYM PE_RAW PE_TERM +%token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_RAW PE_TERM %token PE_NAME %token PE_MODIFIER_EVENT PE_MODIFIER_BP %token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT %token PE_PREFIX_MEM PE_PREFIX_RAW %token PE_ERROR %type PE_VALUE -%type PE_VALUE_SYM +%type PE_VALUE_SYM_HW +%type PE_VALUE_SYM_SW %type PE_RAW %type PE_TERM %type PE_NAME %type PE_NAME_CACHE_TYPE %type PE_NAME_CACHE_OP_RESULT %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP +%type value_sym %type event_config %type event_term %type event_pmu @@ -109,8 +111,13 @@ PE_NAME '/' event_config '/' { $$ = list; } +value_sym: +PE_VALUE_SYM_HW +| +PE_VALUE_SYM_SW + event_legacy_symbol: -PE_VALUE_SYM '/' event_config '/' +value_sym '/' event_config '/' { struct parse_events_data__events *data = _data; struct list_head *list = NULL; @@ -123,7 +130,7 @@ PE_VALUE_SYM '/' event_config '/' $$ = list; } | -PE_VALUE_SYM sep_slash_dc +value_sym sep_slash_dc { struct parse_events_data__events *data = _data; struct list_head *list = NULL; From 1dc12760854a670d0366dcfd8ad179e3574c1712 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 4 Jul 2012 00:00:44 +0200 Subject: [PATCH 292/612] perf tools: Split event symbols arrays to hw and sw parts It'll be convenient in an upcoming patch to access hw event symbol strings via enum perf_hw_id indexes. In order not to duplicate the data, create two separate arrays: event_symbols_hw for enum perf_hw_id events, event_symbols_sw for enum perf_sw_ids events. Change the current event list code accordingly.
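(Illustration, not part of the patch: with the arrays indexed by the perf enums, a symbol lookup inside parse-events.c becomes a direct O(1) access instead of a linear search over a combined table.)

	struct event_symbol *sym = &event_symbols_hw[PERF_COUNT_HW_CPU_CYCLES];
	/* sym->symbol == "cpu-cycles", sym->alias == "cycles" */
	printf(" %s OR %s\n", sym->symbol, sym->alias);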
Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1341352848-11833-7-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 172 ++++++++++++++++++++++----------- 1 file changed, 116 insertions(+), 56 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1dc44dc69133..1aa721d7c10f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -19,8 +19,6 @@ #define MAX_NAME_LEN 100 struct event_symbol { - u8 type; - u64 config; const char *symbol; const char *alias; }; @@ -30,30 +28,86 @@ extern int parse_events_debug; #endif int parse_events_parse(void *data, void *scanner); -#define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x -#define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x +static struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { + [PERF_COUNT_HW_CPU_CYCLES] = { + .symbol = "cpu-cycles", + .alias = "cycles", + }, + [PERF_COUNT_HW_INSTRUCTIONS] = { + .symbol = "instructions", + .alias = "", + }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { + .symbol = "cache-references", + .alias = "", + }, + [PERF_COUNT_HW_CACHE_MISSES] = { + .symbol = "cache-misses", + .alias = "", + }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { + .symbol = "branch-instructions", + .alias = "branches", + }, + [PERF_COUNT_HW_BRANCH_MISSES] = { + .symbol = "branch-misses", + .alias = "", + }, + [PERF_COUNT_HW_BUS_CYCLES] = { + .symbol = "bus-cycles", + .alias = "", + }, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = { + .symbol = "stalled-cycles-frontend", + .alias = "idle-cycles-frontend", + }, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = { + .symbol = "stalled-cycles-backend", + .alias = "idle-cycles-backend", + }, + [PERF_COUNT_HW_REF_CPU_CYCLES] = { + .symbol = "ref-cycles", + .alias = "", + }, +}; -static struct event_symbol event_symbols[] = { - { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, - { CHW(STALLED_CYCLES_FRONTEND), "stalled-cycles-frontend", "idle-cycles-frontend" }, - { CHW(STALLED_CYCLES_BACKEND), "stalled-cycles-backend", "idle-cycles-backend" }, - { CHW(INSTRUCTIONS), "instructions", "" }, - { CHW(CACHE_REFERENCES), "cache-references", "" }, - { CHW(CACHE_MISSES), "cache-misses", "" }, - { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, - { CHW(BRANCH_MISSES), "branch-misses", "" }, - { CHW(BUS_CYCLES), "bus-cycles", "" }, - { CHW(REF_CPU_CYCLES), "ref-cycles", "" }, - - { CSW(CPU_CLOCK), "cpu-clock", "" }, - { CSW(TASK_CLOCK), "task-clock", "" }, - { CSW(PAGE_FAULTS), "page-faults", "faults" }, - { CSW(PAGE_FAULTS_MIN), "minor-faults", "" }, - { CSW(PAGE_FAULTS_MAJ), "major-faults", "" }, - { CSW(CONTEXT_SWITCHES), "context-switches", "cs" }, - { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, - { CSW(ALIGNMENT_FAULTS), "alignment-faults", "" }, - { CSW(EMULATION_FAULTS), "emulation-faults", "" }, +static struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { + [PERF_COUNT_SW_CPU_CLOCK] = { + .symbol = "cpu-clock", + .alias = "", + }, + [PERF_COUNT_SW_TASK_CLOCK] = { + .symbol = "task-clock", + .alias = "", + }, + [PERF_COUNT_SW_PAGE_FAULTS] = { + .symbol = "page-faults", + .alias = "faults", + }, + [PERF_COUNT_SW_CONTEXT_SWITCHES] = { + .symbol = "context-switches", + .alias = "cs", + }, + [PERF_COUNT_SW_CPU_MIGRATIONS] = { + .symbol = "cpu-migrations", + .alias = "migrations", + }, + 
[PERF_COUNT_SW_PAGE_FAULTS_MIN] = { + .symbol = "minor-faults", + .alias = "", + }, + [PERF_COUNT_SW_PAGE_FAULTS_MAJ] = { + .symbol = "major-faults", + .alias = "", + }, + [PERF_COUNT_SW_ALIGNMENT_FAULTS] = { + .symbol = "alignment-faults", + .alias = "", + }, + [PERF_COUNT_SW_EMULATION_FAULTS] = { + .symbol = "emulation-faults", + .alias = "", + }, }; #define __PERF_EVENT_FIELD(config, name) \ @@ -824,16 +878,13 @@ int is_valid_tracepoint(const char *event_string) return 0; } -void print_events_type(u8 type) +static void __print_events_type(u8 type, struct event_symbol *syms, + unsigned max) { - struct event_symbol *syms = event_symbols; - unsigned int i; char name[64]; + unsigned i; - for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { - if (type != syms->type) - continue; - + for (i = 0; i < max ; i++, syms++) { if (strlen(syms->alias)) snprintf(name, sizeof(name), "%s OR %s", syms->symbol, syms->alias); @@ -845,6 +896,14 @@ void print_events_type(u8 type) } } +void print_events_type(u8 type) +{ + if (type == PERF_TYPE_SOFTWARE) + __print_events_type(type, event_symbols_sw, PERF_COUNT_SW_MAX); + else + __print_events_type(type, event_symbols_hw, PERF_COUNT_HW_MAX); +} + int print_hwcache_events(const char *event_glob) { unsigned int type, op, i, printed = 0; @@ -872,26 +931,13 @@ int print_hwcache_events(const char *event_glob) return printed; } -/* - * Print the help text for the event symbols: - */ -void print_events(const char *event_glob) +static void print_symbol_events(const char *event_glob, unsigned type, + struct event_symbol *syms, unsigned max) { - unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0; - struct event_symbol *syms = event_symbols; + unsigned i, printed = 0; char name[MAX_NAME_LEN]; - printf("\n"); - printf("List of pre-defined events (to be used in -e):\n"); - - for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { - type = syms->type; - - if (type != prev_type && printed) { - printf("\n"); - printed = 0; - ntypes_printed++; - } + for (i = 0; i < max; i++, syms++) { if (event_glob != NULL && !(strglobmatch(syms->symbol, event_glob) || @@ -902,17 +948,31 @@ void print_events(const char *event_glob) snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias); else strncpy(name, syms->symbol, MAX_NAME_LEN); - printf(" %-50s [%s]\n", name, - event_type_descriptors[type]); - prev_type = type; - ++printed; + printf(" %-50s [%s]\n", name, event_type_descriptors[type]); + + printed++; } - if (ntypes_printed) { - printed = 0; printf("\n"); - } +} + +/* + * Print the help text for the event symbols: + */ +void print_events(const char *event_glob) +{ + + printf("\n"); + printf("List of pre-defined events (to be used in -e):\n"); + + print_symbol_events(event_glob, PERF_TYPE_HARDWARE, + event_symbols_hw, PERF_COUNT_HW_MAX); + + print_symbol_events(event_glob, PERF_TYPE_SOFTWARE, + event_symbols_sw, PERF_COUNT_SW_MAX); + print_hwcache_events(event_glob); if (event_glob != NULL) From 164c33c6adee609b8b9062cce4c10f764d0dce13 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Mon, 25 Jun 2012 18:18:15 -0700 Subject: [PATCH 293/612] sched: Fix fork() error path to not crash In dup_task_struct(), if arch_dup_task_struct() fails, the clean up code fails to clean up correctly. That's because the clean up code depends on an uninitialized ti->task pointer. We fix this by making sure that the task and thread_info know about each other before we attempt to take the error path.
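(Illustration, not part of the patch: the rule the fix follows is to make the two objects know about each other before the first bail-out, so the common error path can always unwind them — schematically, mirroring the hunk below:)

	err = arch_dup_task_struct(tsk, orig);

	/* link tsk and ti unconditionally: the error path relies on it */
	tsk->stack = ti;
	setup_thread_stack(tsk, orig);	/* among other things, ti->task = tsk */

	if (err)
		goto out;	/* cleanup may now safely use ti->task */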
Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120626011815.11323.5533.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..f00e319d8376 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); + + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. + */ + tsk->stack = ti; + setup_thread_stack(tsk, orig); + if (err) goto out; - tsk->stack = ti; - - setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: [PATCH 294/612] sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. - If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code. Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++ kernel/sched/core.c | 277 ++++++++++++++++++++++++++++----------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 2 - kernel/time/tick-sched.c | 2 + 5 files changed, 214 insertions(+), 76 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..20cb7497c59c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif +#ifdef CONFIG_NO_HZ +void calc_load_enter_idle(void); +void calc_load_exit_idle(void); +#else +static inline void calc_load_enter_idle(void) { } +static inline void calc_load_exit_idle(void) { } +#endif /* CONFIG_NO_HZ */ + #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..bb840405335d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. 
+ * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. 
+ * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; } static long calc_load_fold_idle(void) { + int idx = calc_load_read_idx(); long delta = 0; - /* - * Its got a race, we don't care... - */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2455,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } /* - * It could be the one fold was all it took, we done! + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. 
*/ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static inline long calc_load_fold_idle(void) -{ - return 0; -} +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -static void calc_global_nohz(void) -{ -} -#endif - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
 */ calc_global_nohz(); } @@ -2406,13 +2534,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick From ce5c1fe9a9e059b5c58f0a7e2a3e687d0efac815 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Jun 2012 11:11:38 +0200 Subject: [PATCH 295/612] perf/x86: Fix USER/KERNEL tagging of samples Several perf interrupt handlers (PEBS, IBS, BTS) re-write regs->ip but do not update the segment registers. So use a regs->ip based test instead of a regs->cs/regs->flags based test. Reported-and-tested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-xxrt0a1zronm1sm36obwc2vy@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c011..6ef9d41b87f9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1863,7 +1863,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs) else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { - if (user_mode(regs)) + if (!kernel_ip(regs->ip)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; From 15c7ad51ad58cbd3b46112c1840bc7228bd354bf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:33 +0200 Subject: [PATCH 296/612] perf/x86: Rename Intel specific macros There are macros that are Intel-specific and not x86 generic. Rename them to INTEL_*.
This patch removes X86_PMC_IDX_GENERIC and does: $ sed -i -e 's/X86_PMC_MAX_/INTEL_PMC_MAX_/g' \ arch/x86/include/asm/kvm_host.h \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_p4.c \ arch/x86/kvm/pmu.c $ sed -i -e 's/X86_PMC_IDX_FIXED/INTEL_PMC_IDX_FIXED/g' \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_intel.c \ arch/x86/kernel/cpu/perf_event_intel_ds.c \ arch/x86/kvm/pmu.c $ sed -i -e 's/X86_PMC_MSK_/INTEL_PMC_MSK_/g' \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/include/asm/perf_event.h | 17 +++++----- arch/x86/kernel/cpu/perf_event.c | 38 +++++++++++------------ arch/x86/kernel/cpu/perf_event_intel.c | 14 ++++----- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +-- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- arch/x86/kvm/pmu.c | 22 ++++++------- 7 files changed, 50 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a2..2da88c0cda14 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -313,8 +313,8 @@ struct kvm_pmu { u64 counter_bitmask[2]; u64 global_ctrl_mask; u8 version; - struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; u64 reprogram_pmi; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 588f52ea810e..3b31248caf60 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -5,11 +5,10 @@ * Performance event hw details: */ -#define X86_PMC_MAX_GENERIC 32 -#define X86_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 @@ -121,16 +120,16 @@ struct x86_pmu_capability { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -139,7 +138,7 @@ struct x86_pmu_capability { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 
*/ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) /* * IBS cpuid feature detection diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e677d9923f4f..668050002602 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -63,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -626,8 +626,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (c->idxmsk64 & (~0ULL << X86_PMC_IDX_FIXED)) { - idx = X86_PMC_IDX_FIXED; + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; @@ -635,7 +635,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -813,13 +813,13 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); @@ -921,7 +921,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -1338,21 +1338,21 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, 
perf_event_nmi_handler, 0, "PMI"); @@ -1368,7 +1368,7 @@ static int __init init_hw_perf_events(void) */ for_each_event_constraint(c, x86_pmu.event_constraints) { if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { continue; } @@ -1611,8 +1611,8 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!x86_pmu.attr_rdpmc) return 0; - if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { - idx -= X86_PMC_IDX_FIXED; + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; idx |= 1 << 30; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8408e37f5fa4..5b0b362c7ae6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -747,7 +747,7 @@ static void intel_pmu_disable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -763,9 +763,9 @@ static void intel_pmu_enable_all(int added) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -871,7 +871,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -886,7 +886,7 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; @@ -915,7 +915,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -949,7 +949,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) return; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 026373edef7f..629ae0b7ad90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void) */ struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); void intel_pmu_enable_bts(u64 config) { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header 
header; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 6c82e4037989..92c7e39a079f 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void) unsigned int low, high; /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd83..9b7ec1150ab0 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) { - if (idx < X86_PMC_IDX_FIXED) + if (idx < INTEL_PMC_IDX_FIXED) return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); else - return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - X86_PMC_IDX_FIXED; + int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); } @@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) return; pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); + INTEL_PMC_MAX_GENERIC); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; bitmap_len = (entry->eax >> 24) & 0xff; @@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); + INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; } @@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; } init_irq_work(&pmu->irq_work, trigger_pmi); kvm_pmu_cpuid_update(vcpu); @@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = From a1eac7ac903ea9afbd4f133659710a0588c8eca5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: 
Wed, 20 Jun 2012 20:46:34 +0200 Subject: [PATCH 297/612] perf/x86: Move Intel specific code to intel_pmu_init() There is some Intel specific code in the generic x86 path. Move it to intel_pmu_init(). Since p4 and p6 pmus don't have fixed counters we may skip the check in case such a pmu is detected. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 34 ++------------------------ arch/x86/kernel/cpu/perf_event_intel.c | 33 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 668050002602..7b4f1e871f7c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1307,7 +1307,6 @@ static struct attribute_group x86_pmu_format_group = { static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1338,21 +1337,8 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1361,22 +1347,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } - } - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5b0b362c7ae6..2e9444c80148 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1765,6 +1765,7 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; int version; @@ -1953,5 +1954,37 @@ __init int intel_pmu_init(void) } } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > 
max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + return 0; } From b1dc3c4820428ac6216537416b2fcd140fdc52e5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:35 +0200 Subject: [PATCH 298/612] perf/x86/amd: Unify AMD's generic and family 15h pmus There is no need for keeping separate pmu structs. We can enable the amd_{get,put}_event_constraints() functions also for family 15h events. The advantage is that there is only a single pmu struct for all AMD cpus. This patch introduces functions to set up the pmu to enable core performance counters or counter constraints. Also, cpuid checks are used instead of family checks where possible. Thus, it enables the code independently of cpu families if the feature flag is set. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 3 +- arch/x86/kernel/cpu/perf_event_amd.c | 105 ++++++++++++--------------- arch/x86/oprofile/op_model_amd.c | 4 +- 3 files changed, 50 insertions(+), 62 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3b31248caf60..ffdf5e0991c4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -47,8 +47,7 @@ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) #define AMD64_NUM_COUNTERS 4 -#define AMD64_NUM_COUNTERS_F15H 6 -#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H +#define AMD64_NUM_COUNTERS_CORE 6 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9131d5..4528ae7b6ec4 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = { NULL, }; -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints =
amd_put_event_constraints, - - .format_attrs = amd_format_attr, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - /* AMD Family 15h */ #define AMD_EVENT_TYPE_MASK 0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } -static __initconst const struct x86_pmu amd_pmu_f15h = { - .name = "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .disable = x86_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, - .eventsel = MSR_F15H_PERF_CTL, - .perfctr = MSR_F15H_PERF_CTR, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS_F15H, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints_f15h, - /* nortbridge counters not yet implemented: */ -#if 0 + .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_dead = amd_pmu_cpu_dead, -#endif - .cpu_starting = amd_pmu_cpu_starting, .format_attrs = amd_format_attr, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, }; +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - /* - * If core performance counter extensions exists, it must be - * family 15h, otherwise fail. See x86_pmu_addr_offset(). 
- */ - switch (boot_cpu_data.x86) { - case 0x15: - if (!cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu_f15h; - break; - default: - if (cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu; - break; - } + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 303f08637826..b2b94438ff05 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_CORE) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_CORE; } else { num_counters = AMD64_NUM_COUNTERS; } From f285f92f7e4c9af20149130c8fd5027131b39b0e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:36 +0200 Subject: [PATCH 299/612] perf/x86: Improve debug output in check_hw_exists() It might be of interest which perfctr msr failed. Signed-off-by: Robert Richter [ added hunk to avoid GCC warn ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7b4f1e871f7c..3eb88ebcec5a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -178,7 +178,7 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; + u64 val, val_new = ~0; int i, reg, ret = 0; /* @@ -211,8 +211,9 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = wrmsrl_safe(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -229,6 +230,7 @@ bios_fail: msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } From c93dc84cbe32435be3ffa2fbde355eff94955c32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jun 2012 14:50:50 +0200 Subject: [PATCH 300/612] perf/x86: Add a microcode revision check for SNB-PEBS Recent Intel microcode resolved the SNB-PEBS issues, so conditionally enable PEBS on SNB hardware depending on the microcode revision. Thanks to Stephane for figuring out the various microcode revisions. 
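For illustration, the gate this change introduces reduces to a per-model minimum-microcode-revision lookup. A standalone sketch of that pattern in plain C (function names are illustrative; the model numbers, steppings and revisions are the ones used in the hunks below, where the stepping comes from x86_mask):

	#include <stdbool.h>
	#include <stdint.h>

	/* Lowest microcode revision with working PEBS, per model/stepping;
	 * unknown parts are conservatively treated as broken. */
	static uint32_t snb_min_pebs_rev(unsigned int model, unsigned int stepping)
	{
		switch (model) {
		case 42:			/* SNB */
			return 0x28;
		case 45:			/* SNB-EP */
			if (stepping == 6)
				return 0x618;
			if (stepping == 7)
				return 0x70c;
		}
		return UINT32_MAX;
	}

	static bool snb_pebs_broken(unsigned int model, unsigned int stepping,
				    uint32_t microcode)
	{
		return microcode < snb_min_pebs_rev(model, stepping);
	}

The same check has to be re-run whenever microcode is reloaded, which is why the patch also hooks the microcode driver.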
Suggested-by: Stephane Eranian Acked-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-v3672ziwh9damwqwh1uz3krm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event.c | 21 +++++++---- arch/x86/kernel/cpu/perf_event.h | 4 +- arch/x86/kernel/cpu/perf_event_intel.c | 51 ++++++++++++++++++++++++-- arch/x86/kernel/microcode_core.c | 10 +++-- 5 files changed, 74 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ffdf5e0991c4..c78f14a0df00 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -232,6 +232,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -245,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) } static inline void perf_events_lapic_init(void) { } +static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3eb88ebcec5a..29557aa06dda 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -379,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -1650,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void) x86_pmu.flush_branch_stack(); } +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, - .attr_groups = x86_pmu_attr_groups, + .attr_groups = x86_pmu_attr_groups, - .event_init = x86_pmu_event_init, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, .del = x86_pmu_del, @@ -1664,11 +1671,11 @@ static struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .event_idx = x86_pmu_event_idx, + .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83238f2a12b2..3f5c66904355 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -361,6 +361,8 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); void (*flush_branch_stack)(void); /* @@ -373,7 +375,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; - int bts_active, pebs_active; + int bts_active, pebs_active, pebs_broken; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2e9444c80148..5fdedb4bc3f1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1712,11 +1712,56 
@@ static __init void intel_clovertown_quirk(void) x86_pmu.pebs_constraints = NULL; } +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 947e4c64b1d8..1649cf899ad6 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(µcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(µcode_mutex); return err; } @@ -309,6 +308,7 @@ static ssize_t reload_store(struct device *dev, return size; get_online_cpus(); + mutex_lock(µcode_mutex); for_each_online_cpu(cpu) { tmp_ret = reload_for_cpu(cpu); if (tmp_ret != 0) @@ -318,6 +318,9 @@ static ssize_t reload_store(struct device *dev, if (!ret) ret = tmp_ret; } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); put_online_cpus(); if (!ret) @@ -557,7 +560,8 @@ static int __init microcode_init(void) mutex_lock(µcode_mutex); error = subsys_interface_register(&mc_cpu_interface); - + if (!error) + perf_check_microcode(); mutex_unlock(µcode_mutex); put_online_cpus(); From 3e0091e2b6f8cd59e567f247e345a3a6ad1f6e7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jun 2012 23:38:39 +0200 Subject: [PATCH 301/612] perf/x86: Save a few bytes in 'struct x86_pmu' All these are basically boolean flags, use a bitfield to save a few bytes. 
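A quick standalone illustration of the saving (struct names are made up for the example; the fields are the ones from the hunk below, and the sizes are what a typical x86-64 ABI produces):

	#include <stdio.h>

	struct ds_flags_wide {
		int bts, bts_active, pebs, pebs_active, pebs_broken;
	};

	struct ds_flags_packed {
		int bts		:1,
		    bts_active	:1,
		    pebs	:1,
		    pebs_active	:1,
		    pebs_broken	:1;
	};

	int main(void)
	{
		/* five ints collapse into one: typically prints "20 -> 4" */
		printf("%zu -> %zu\n", sizeof(struct ds_flags_wide),
		       sizeof(struct ds_flags_packed));
		return 0;
	}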
Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vsevd5g8lhcn129n3s7trl7r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3f5c66904355..a15df4be151f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -374,8 +374,11 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts, pebs; - int bts_active, pebs_active, pebs_broken; + int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; From eca26c9950f4d3e9c92ba275e9f4aee834aa1913 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Jun 2012 15:09:12 +0800 Subject: [PATCH 302/612] perf/x86: Use 0xff as pseudo code for fixed uncore event Stephane Eranian suggested using 0xff as pseudo code for fixed uncore event and using the umask value to determine which of the fixed events we want to map to. So far there is at most one fixed counter in an uncore PMU. So just change the definition of UNCORE_FIXED_EVENT to 0xff. Suggested-by: Stephane Eranian Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340780953-21130-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 6f43f9584e3d..c42a3f7b5233 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -179,7 +179,7 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { }; static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), { /* end: all zeroes */ }, @@ -616,7 +616,7 @@ static struct attribute_group nhm_uncore_format_group = { }; static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 4d52db0d1dfe..88498c7b3420 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -9,7 +9,7 @@ #define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) -#define UNCORE_FIXED_EVENT 0xffff +#define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) From 3b19e4c98c035c9ab218fc64ef26f4f7a30eafb9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Jun 2012 14:56:36 +0800 Subject: [PATCH 303/612] perf/x86: Fix event constraint for SandyBridge-EP C-Box The constraint 
for C-Box event 0x1f should have overlap flag set. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340866596-22502-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c42a3f7b5233..7d755d2e1c9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -239,7 +239,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), UNCORE_EVENT_CONSTRAINT(0x21, 0x3), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), UNCORE_EVENT_CONSTRAINT(0x31, 0x3), From 42089697244ba8e64fa43fb5e6d50d47a8e4cb00 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jul 2012 14:00:14 +0800 Subject: [PATCH 304/612] perf/x86: Detect number of instances of uncore CBox The CBox manages the interface between the core and the LLC, so the number of uncore CBox instances is equal to the number of cores. Reported-by: Andrew Cooks Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341381616-12229-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7d755d2e1c9c..4fecbd00ee75 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1605,8 +1605,9 @@ static void __init uncore_cpu_setup(void *dummy) static int __init uncore_cpu_init(void) { - int ret, cpu; + int ret, cpu, max_cores; + max_cores = boot_cpu_data.x86_max_cores; switch (boot_cpu_data.x86_model) { case 26: /* Nehalem */ case 30: @@ -1615,9 +1616,13 @@ static int __init uncore_cpu_init(void) msr_uncores = nhm_msr_uncores; break; case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; msr_uncores = snb_msr_uncores; break; case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; default: From 6a67943a18c264d5f3df436da38edb3e59adc905 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jul 2012 14:00:15 +0800 Subject: [PATCH 305/612] perf/x86: Uncore filter support for SandyBridge-EP This patch adds C-Box and PCU filter support for SandyBridge-EP uncore. We can filter C-Box events by thread/core ID and filter PCU events by frequency/voltage. 
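The core of the implementation is sharing one box-wide filter register between events: an event may use the register only if it is free or already programmed with the same filter value. A distilled sketch of that reservation rule (names illustrative; the real code below keeps a per-register raw spinlock and an atomic refcount):

	#include <stdbool.h>
	#include <stdint.h>

	struct extra_reg_state {
		int      ref;	/* events currently using the register */
		uint64_t config;	/* filter value programmed into it */
	};

	static bool extra_reg_get(struct extra_reg_state *er, uint64_t want)
	{
		if (er->ref == 0 || er->config == want) {
			er->ref++;
			er->config = want;
			return true;	/* filter compatible, claim a reference */
		}
		return false;		/* conflicting filter: scheduling fails */
	}

	static void extra_reg_put(struct extra_reg_state *er)
	{
		er->ref--;
	}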
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341381616-12229-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 228 ++++++++++++++---- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 24 +- 2 files changed, 206 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 4fecbd00ee75..19faffc60886 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -14,10 +14,13 @@ static cpumask_t uncore_cpu_mask; /* constraint for the fixed counter */ static struct event_constraint constraint_fixed = EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); @@ -26,8 +29,19 @@ DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); /* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; @@ -120,6 +134,10 @@ static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } @@ -149,6 +167,71 @@ static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); } +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + unsigned long flags; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config1 == reg1->config) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (box->phys_id >= 0) + reg1->alloc = 1; + return NULL; + } + return 
&constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + if (box->phys_id < 0 || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & + SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + reg1->idx = 0; + return 0; +} + static struct attribute *snbep_uncore_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -167,6 +250,20 @@ static struct attribute *snbep_uncore_ubox_formats_attr[] = { NULL, }; +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + static struct attribute *snbep_uncore_pcu_formats_attr[] = { &format_attr_event.attr, &format_attr_occ_sel.attr, @@ -175,6 +272,10 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { &format_attr_thresh5.attr, &format_attr_occ_invert.attr, &format_attr_occ_edge.attr, + &format_attr_filter_brand0.attr, + &format_attr_filter_brand1.attr, + &format_attr_filter_brand2.attr, + &format_attr_filter_brand3.attr, NULL, }; @@ -203,6 +304,11 @@ static struct attribute_group snbep_uncore_ubox_format_group = { .attrs = snbep_uncore_ubox_formats_attr, }; +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + static struct attribute_group snbep_uncore_pcu_format_group = { .name = "format", .attrs = snbep_uncore_pcu_formats_attr, @@ -215,6 +321,9 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { .disable_event = snbep_uncore_msr_disable_event, .enable_event = snbep_uncore_msr_enable_event, .read_counter = snbep_uncore_msr_read_counter, + .get_constraint = snbep_uncore_get_constraint, + .put_constraint = snbep_uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, }; static struct intel_uncore_ops snbep_uncore_pci_ops = { @@ -307,31 +416,33 @@ static struct intel_uncore_type snbep_uncore_ubox = { }; static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_format_group, + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + 
.box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, }; static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_pcu_format_group, + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, }; static struct intel_uncore_type *snbep_msr_uncores[] = { @@ -747,15 +858,22 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(int cpu) +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int cpu) { struct intel_uncore_box *box; + int i, size; - box = kmalloc_node(sizeof(*box), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + size = sizeof(*box) + type->num_shared_regs * + sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!box) return NULL; + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + uncore_pmu_init_hrtimer(box); atomic_set(&box->refcnt, 1); box->cpu = -1; @@ -834,11 +952,18 @@ static int uncore_collect_events(struct intel_uncore_box *box, } static struct event_constraint * -uncore_event_constraint(struct intel_uncore_type *type, - struct perf_event *event) +uncore_get_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) { + struct intel_uncore_type *type = box->pmu->type; struct event_constraint *c; + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + if (event->hw.config == ~0ULL) return &constraint_fixed; @@ -852,19 +977,25 @@ uncore_event_constraint(struct intel_uncore_type *type, return &type->unconstrainted; } +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) { unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; - int i, ret, wmin, wmax; + int i, wmin, wmax, ret = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = uncore_event_constraint(box->pmu->type, - box->event_list[i]); + c = uncore_get_event_constraint(box, box->event_list[i]); constraints[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -888,13 +1019,17 @@ static int uncore_assign_events(struct intel_uncore_box *box, break; __set_bit(hwc->idx, used_mask); - assign[i] = hwc->idx; + if (assign) + assign[i] = hwc->idx; } - if (i == n) - return 0; - /* slow path */ - ret = perf_assign_events(constraints, n, wmin, wmax, assign); + if (i != n) + 
ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } return ret ? -EINVAL : 0; } @@ -1021,6 +1156,8 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) for (i = 0; i < box->n_events; i++) { if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + while (++i < box->n_events) box->event_list[i - 1] = box->event_list[i]; @@ -1048,10 +1185,9 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, { struct perf_event *leader = event->group_leader; struct intel_uncore_box *fake_box; - int assign[UNCORE_PMC_IDX_MAX]; int ret = -EINVAL, n; - fake_box = uncore_alloc_box(smp_processor_id()); + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); if (!fake_box) return -ENOMEM; @@ -1073,7 +1209,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, fake_box->n_events = n; - ret = uncore_assign_events(fake_box, assign, n); + ret = uncore_assign_events(fake_box, NULL, n); out: kfree(fake_box); return ret; @@ -1117,6 +1253,10 @@ int uncore_pmu_event_init(struct perf_event *event) return -EINVAL; event->cpu = box->cpu; + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + if (event->attr.config == UNCORE_FIXED_EVENT) { /* no fixed counter */ if (!pmu->type->fixed_ctl) @@ -1130,11 +1270,13 @@ int uncore_pmu_event_init(struct perf_event *event) hwc->config = ~0ULL; } else { hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } } - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - if (event->group_leader != event) ret = uncore_validate_group(pmu, event); else @@ -1276,7 +1418,7 @@ static int __devinit uncore_pci_add(struct intel_uncore_type *type, if (phys_id < 0) return -ENODEV; - box = uncore_alloc_box(0); + box = uncore_alloc_box(type, 0); if (!box) return -ENOMEM; @@ -1458,7 +1600,7 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) if (pmu->func_id < 0) pmu->func_id = j; - box = uncore_alloc_box(cpu); + box = uncore_alloc_box(type, cpu); if (!box) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 88498c7b3420..b13e9ea81def 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,10 @@ SNBEP_PMON_CTL_INVERT | \ SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + /* SNB-EP PCU event control */ #define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 #define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 @@ -140,15 +144,17 @@ /* SNB-EP Cbo register */ #define SNBEP_C0_MSR_PMON_CTR0 0xd16 #define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 #define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f #define SNBEP_CBO_MSR_OFFSET 0x20 /* SNB-EP PCU register */ #define SNBEP_PCU_MSR_PMON_CTR0 0xc36 #define SNBEP_PCU_MSR_PMON_CTL0 0xc30 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 #define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff #define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc #define 
SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd @@ -163,7 +169,6 @@ struct intel_uncore_type { int num_boxes; int perf_ctr_bits; int fixed_ctr_bits; - int single_fixed; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -171,6 +176,8 @@ struct intel_uncore_type { unsigned fixed_ctl; unsigned box_ctl; unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -188,6 +195,10 @@ struct intel_uncore_ops { void (*disable_event)(struct intel_uncore_box *, struct perf_event *); void (*enable_event)(struct intel_uncore_box *, struct perf_event *); u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); }; struct intel_uncore_pmu { @@ -200,6 +211,12 @@ struct intel_uncore_pmu { struct list_head box_list; }; +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config1; + atomic_t ref; +}; + struct intel_uncore_box { int phys_id; int n_active; /* number of active events */ @@ -215,6 +232,7 @@ struct intel_uncore_box { struct intel_uncore_pmu *pmu; struct hrtimer hrtimer; struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; }; #define UNCORE_BOX_FLAG_INITIATED 0 From 2d3f6ccc4ea5c74d4b4af1b47c56b4cff4bbfcb7 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Wed, 4 Jul 2012 20:38:19 +0200 Subject: [PATCH 306/612] batman-adv: check incoming packet type for bla If the gateway functionality is used, some broadcast packets (DHCP requests) may be transmitted as unicast packets. As the bridge loop avoidance code now only considers the payload Ethernet destination, it may drop the DHCP request for clients which are claimed by other backbone gateways, because it falsely infers from the broadcast address that the right backbone gateway should havehandled the broadcast. Fix this by checking and delegating the batman-adv packet type used for transmission. Reported-by: Guido Iribarren Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 15 +++++++++++---- net/batman-adv/bridge_loop_avoidance.h | 5 +++-- net/batman-adv/soft-interface.c | 6 +++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 8bf97515a77d..c5863f499133 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1351,6 +1351,7 @@ void bla_free(struct bat_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame + * @is_bcast: the packet came in a broadcast packet type. * * bla_rx avoidance checks if: * * we have to race for a claim @@ -1361,7 +1362,8 @@ void bla_free(struct bat_priv *bat_priv) * process the skb. 
* */ -int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid) +int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid, + bool is_bcast) { struct ethhdr *ethhdr; struct claim search_claim, *claim = NULL; @@ -1380,7 +1382,7 @@ int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid) if (unlikely(atomic_read(&bat_priv->bla_num_requests))) /* don't allow broadcasts while requests are in flight */ - if (is_multicast_ether_addr(ethhdr->h_dest)) + if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) goto handled; memcpy(search_claim.addr, ethhdr->h_source, ETH_ALEN); @@ -1406,8 +1408,13 @@ int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid) } /* if it is a broadcast ... */ - if (is_multicast_ether_addr(ethhdr->h_dest)) { - /* ... drop it. the responsible gateway is in charge. */ + if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) { + /* ... drop it. the responsible gateway is in charge. + * + * We need to check is_bcast because with the gateway + * feature, broadcasts (like DHCP requests) may be sent + * using a unicast packet type. + */ goto handled; } else { /* seems the client considers us as its best gateway. diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index e39f93acc28f..dc5227b398d4 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -23,7 +23,8 @@ #define _NET_BATMAN_ADV_BLA_H_ #ifdef CONFIG_BATMAN_ADV_BLA -int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid); +int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid, + bool is_bcast); int bla_tx(struct bat_priv *bat_priv, struct sk_buff *skb, short vid); int bla_is_backbone_gw(struct sk_buff *skb, struct orig_node *orig_node, int hdr_size); @@ -41,7 +42,7 @@ void bla_free(struct bat_priv *bat_priv); #else /* ifdef CONFIG_BATMAN_ADV_BLA */ static inline int bla_rx(struct bat_priv *bat_priv, struct sk_buff *skb, - short vid) + short vid, bool is_bcast) { return 0; } diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6e2530b02043..a0ec0e4ada4c 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -256,7 +256,11 @@ void interface_rx(struct net_device *soft_iface, struct bat_priv *bat_priv = netdev_priv(soft_iface); struct ethhdr *ethhdr; struct vlan_ethhdr *vhdr; + struct batman_header *batadv_header = (struct batman_header *)skb->data; short vid __maybe_unused = -1; + bool is_bcast; + + is_bcast = (batadv_header->packet_type == BAT_BCAST); /* check if enough space is available for pulling, and pull */ if (!pskb_may_pull(skb, hdr_size)) @@ -302,7 +306,7 @@ void interface_rx(struct net_device *soft_iface, /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ - if (bla_rx(bat_priv, skb, vid)) + if (bla_rx(bat_priv, skb, vid, is_bcast)) goto out; netif_rx(skb); From 8e16e33c168a6efd0c9f7fa9dd4c1e1db9a74553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 2 Jul 2012 19:53:55 +0200 Subject: [PATCH 307/612] USB: option: add ZTE MF60 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switches into a composite device by ejecting the initial driver CD. The four interfaces are: QCDM, AT, QMI/wwan and mass storage. 
Let this driver manage the two serial interfaces: T: Bus=02 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 28 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=1402 Rev= 0.00 S: Manufacturer=ZTE,Incorporated S: Product=ZTE WCDMA Technologies MSM S: SerialNumber=xxxxx C:* #Ifs= 4 Cfg#= 1 Atr=c0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=usb-storage E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Bjørn Mork Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index adf8ce72be50..167aed8b2488 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -554,6 +554,10 @@ static const struct option_blacklist_info net_intf1_blacklist = { .reserved = BIT(1), }; +static const struct option_blacklist_info net_intf2_blacklist = { + .reserved = BIT(2), +}; + static const struct option_blacklist_info net_intf3_blacklist = { .reserved = BIT(3), }; @@ -1099,6 +1103,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1298, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1299, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1300, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1402, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2002, 0xff, 0xff, 0xff), .driver_info = (kernel_ulong_t)&zte_k3765_z_blacklist }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2003, 0xff, 0xff, 0xff) }, From aacef9c561a693341566a6850c451ce3df68cb9a Mon Sep 17 00:00:00 2001 From: Gaosen Zhang Date: Thu, 5 Jul 2012 21:49:00 +0800 Subject: [PATCH 308/612] USB: option: Add MEDIATEK product ids Signed-off-by: Gaosen Zhang Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 167aed8b2488..417ab1b0aa30 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -497,6 +497,15 @@ static void option_instat_callback(struct urb *urb); /* MediaTek products */ #define MEDIATEK_VENDOR_ID 0x0e8d +#define MEDIATEK_PRODUCT_DC_1COM 0x00a0 +#define MEDIATEK_PRODUCT_DC_4COM 0x00a5 +#define MEDIATEK_PRODUCT_DC_5COM 0x00a4 +#define MEDIATEK_PRODUCT_7208_1COM 0x7101 +#define MEDIATEK_PRODUCT_7208_2COM 0x7102 +#define MEDIATEK_PRODUCT_FP_1COM 0x0003 +#define MEDIATEK_PRODUCT_FP_2COM 0x0023 +#define MEDIATEK_PRODUCT_FPDC_1COM 0x0043 +#define MEDIATEK_PRODUCT_FPDC_2COM 0x0033 /* Cellient products */ #define CELLIENT_VENDOR_ID 0x2692 @@ -1246,6 +1255,17 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x02, 0x01) }, { 
USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x02, 0x01) }, /* MediaTek MT6276M modem & app port */ + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_1COM, 0x0a, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x02, 0x01) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x02, 0x01) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_1COM, 0x02, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_2COM, 0x02, 0x02, 0x01) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_1COM, 0x0a, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_2COM, 0x0a, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_1COM, 0x0a, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_2COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { } /* Terminating entry */ }; From b7d28e32c93801d60c1a7a817f774a02b7bdde43 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 2 Jul 2012 12:34:24 +0200 Subject: [PATCH 309/612] USB: metro-usb: fix tty_flip_buffer_push use Do not set low_latency flag at open as tty_flip_buffer_push must not be called in IRQ context with low_latency set. Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/metro-usb.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/usb/serial/metro-usb.c b/drivers/usb/serial/metro-usb.c index 81423f7361db..d47eb06fe463 100644 --- a/drivers/usb/serial/metro-usb.c +++ b/drivers/usb/serial/metro-usb.c @@ -222,14 +222,6 @@ static int metrousb_open(struct tty_struct *tty, struct usb_serial_port *port) metro_priv->throttled = 0; spin_unlock_irqrestore(&metro_priv->lock, flags); - /* - * Force low_latency on so that our tty_push actually forces the data - * through, otherwise it is scheduled, and with high data rates (like - * with OHCI) data can get lost. - */ - if (tty) - tty->low_latency = 1; - /* Clear the urb pipe. */ usb_clear_halt(serial->dev, port->interrupt_in_urb->pipe); From b086b6b10d9f182cd8d2f0dcfd7fd11edba93fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 2 Jul 2012 10:33:14 +0200 Subject: [PATCH 310/612] USB: cdc-wdm: fix lockup on error in wdm_read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clear the WDM_READ flag on empty reads to avoid running forever in an infinite tight loop, causing lockups: Jul 1 21:58:11 nemi kernel: [ 3658.898647] qmi_wwan 2-1:1.2: Unexpected error -71 Jul 1 21:58:36 nemi kernel: [ 3684.072021] BUG: soft lockup - CPU#0 stuck for 23s! 
[qmi.pl:12235] Jul 1 21:58:36 nemi kernel: [ 3684.072212] CPU 0 Jul 1 21:58:36 nemi kernel: [ 3684.072355] Jul 1 21:58:36 nemi kernel: [ 3684.072367] Pid: 12235, comm: qmi.pl Tainted: P O 3.5.0-rc2+ #13 LENOVO 2776LEG/2776LEG Jul 1 21:58:36 nemi kernel: [ 3684.072383] RIP: 0010:[] [] spin_unlock_irq+0x8/0xc [cdc_wdm] Jul 1 21:58:36 nemi kernel: [ 3684.072388] RSP: 0018:ffff88022dca1e70 EFLAGS: 00000282 Jul 1 21:58:36 nemi kernel: [ 3684.072393] RAX: ffff88022fc3f650 RBX: ffffffff811c56f7 RCX: 00000001000ce8c1 Jul 1 21:58:36 nemi kernel: [ 3684.072398] RDX: 0000000000000010 RSI: 000000000267d810 RDI: ffff88022fc3f650 Jul 1 21:58:36 nemi kernel: [ 3684.072403] RBP: ffff88022dca1eb0 R08: ffffffffa063578e R09: 0000000000000000 Jul 1 21:58:36 nemi kernel: [ 3684.072407] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000002 Jul 1 21:58:36 nemi kernel: [ 3684.072412] R13: 0000000000000246 R14: ffffffff00000002 R15: ffff8802281d8c88 Jul 1 21:58:36 nemi kernel: [ 3684.072418] FS: 00007f666a260700(0000) GS:ffff88023bc00000(0000) knlGS:0000000000000000 Jul 1 21:58:36 nemi kernel: [ 3684.072423] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Jul 1 21:58:36 nemi kernel: [ 3684.072428] CR2: 000000000270d9d8 CR3: 000000022e865000 CR4: 00000000000007f0 Jul 1 21:58:36 nemi kernel: [ 3684.072433] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 Jul 1 21:58:36 nemi kernel: [ 3684.072438] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Jul 1 21:58:36 nemi kernel: [ 3684.072444] Process qmi.pl (pid: 12235, threadinfo ffff88022dca0000, task ffff88022ff76380) Jul 1 21:58:36 nemi kernel: [ 3684.072448] Stack: Jul 1 21:58:36 nemi kernel: [ 3684.072458] ffffffffa063592e 0000000100020000 ffff88022fc3f650 ffff88022fc3f6a8 Jul 1 21:58:36 nemi kernel: [ 3684.072466] 0000000000000200 0000000100000000 000000000267d810 0000000000000000 Jul 1 21:58:36 nemi kernel: [ 3684.072475] 0000000000000000 ffff880212cfb6d0 0000000000000200 ffff880212cfb6c0 Jul 1 21:58:36 nemi kernel: [ 3684.072479] Call Trace: Jul 1 21:58:36 nemi kernel: [ 3684.072489] [] ? wdm_read+0x1a0/0x263 [cdc_wdm] Jul 1 21:58:36 nemi kernel: [ 3684.072500] [] ? vfs_read+0xa1/0xfb Jul 1 21:58:36 nemi kernel: [ 3684.072509] [] ? alarm_setitimer+0x35/0x64 Jul 1 21:58:36 nemi kernel: [ 3684.072517] [] ? sys_read+0x45/0x6e Jul 1 21:58:36 nemi kernel: [ 3684.072525] [] ? system_call_fastpath+0x16/0x1b Jul 1 21:58:36 nemi kernel: [ 3684.072557] Code: <66> 66 90 c3 83 ff ed 89 f8 74 16 7f 06 83 ff a1 75 0a c3 83 ff f4 The WDM_READ flag is normally cleared by wdm_int_callback before resubmitting the read urb, and set by wdm_in_callback when this urb returns with data or an error. But a crashing device may cause both a read error and cancelling all urbs. Make sure that the flag is cleared by wdm_read if the buffer is empty. We don't clear the flag on errors, as there may be pending data in the buffer which should be processed. The flag will instead be cleared on the next wdm_read call. 
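Reduced to its essence, the loop and the fix look like this (a standalone model, not the driver code; the real wdm_read() sleeps on a waitqueue between iterations and takes desc->iuspin around the checks):

	#include <stdbool.h>

	struct wdm_model {
		bool         read_ready;	/* stands in for WDM_READ */
		unsigned int reslength;		/* bytes left by the last urb */
	};

	static unsigned int read_once(struct wdm_model *d)
	{
		while (d->read_ready) {
			if (d->reslength)
				return d->reslength;	/* data to hand out */
			/*
			 * Zero-length read: without dropping the flag here the
			 * loop can never make progress; with it, the reader goes
			 * back to waiting for the next completion to set it.
			 */
			d->read_ready = false;
		}
		return 0;
	}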
Cc: stable Signed-off-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 8fd398dffced..ee469274a3fe 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -500,6 +500,8 @@ retry: goto retry; } if (!desc->reslength) { /* zero length read */ + dev_dbg(&desc->intf->dev, "%s: zero length - clearing WDM_READ\n", __func__); + clear_bit(WDM_READ, &desc->flags); spin_unlock_irq(&desc->iuspin); goto retry; } From 006c7f18449a06027b0165e938c67b3a029813c9 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Wed, 4 Jul 2012 05:22:53 -0600 Subject: [PATCH 311/612] ARM: OMAP2+: hwmod code/clockdomain data: fix 32K sync timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kevin discovered that commit c8d82ff68fb6873691536cf33021977efbf5593c ("ARM: OMAP2/3: hwmod data: Add 32k-sync timer data to hwmod database") broke CORE idle on OMAP3. This prevents device low power states. The root cause is that the 32K sync timer IP block does not support smart-idle mode[1], and so the hwmod code keeps the IP block in no-idle mode while it is active. This in turn prevents the WKUP clockdomain from transitioning to idle. There is a hardcoded sleep dependency that prevents the CORE_L3 and CORE_CM clockdomains from transitioning to idle when the WKUP clockdomain is active[2], so the chip cannot enter any device low power states. It turns out that there is no need to take the 32k sync timer out of idle. The IP block itself probably does not have any native idle handling at all, due to its simplicity. Furthermore, the PRCM will never request target idle for this IP block while the kernel is running, due to the sleep dependency that prevents the WKUP clockdomain from idling while the CORE_L3 clockdomain is active. So we can safely leave the 32k sync timer in target-force-idle mode, even while we continue to access it. This workaround is implemented by defining a new clockdomain flag, CLKDM_ACTIVE_WITH_MPU, that indicates that the clockdomain is guaranteed to be active whenever the MPU is inactive. If an IP block's main functional clock exists inside this clockdomain, and the IP block does not support smart-idle modes, then the hwmod code will place the IP block into target force-idle mode even when enabled. The WKUP clockdomains on OMAP3/4 are marked with this flag. (On OMAP2xxx, no OCP header existed on the 32k sync timer.) Other clockdomains also should be marked with this flag, but those changes are deferred until a later merge window, to create a minimal fix. Another theoretically clean fix for this problem would be to implement PM runtime-based control for 32k sync timer accesses. These PM runtime calls would need to located in a custom clocksource, since the 32k sync timer is currently used as an MMIO clocksource. But in practice, there would be little benefit to doing so; and there would be some cost, due to the addition of unnecessary lines of code and the additional CPU overhead of the PM runtime and hwmod code - unnecessary in this case. Another possible fix would have been to modify the pm34xx.c code to force the IP block idle before entering WFI. But this would not have been an acceptable approach: we are trying to remove this type of centralized IP block idle control from the PM code. This patch is a collaboration between Kevin Hilman and Paul Walmsley . 
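The decision the hwmod code makes at enable time, with this change, boils down to the following (a sketch with illustrative names; the real logic is in the _enable_sysc() hunk below):

	#include <stdbool.h>

	enum idlemode { IDLEMODE_FORCE, IDLEMODE_NO, IDLEMODE_SMART };

	static enum idlemode pick_enable_sidlemode(bool clkdm_active_with_mpu,
						   bool has_smart_idle,
						   bool swsup_sidle)
	{
		/*
		 * No usable smart-idle mode in a clockdomain that is always
		 * active with the MPU: leave the block force-idled even while
		 * it is in use, so the domain can still idle with the MPU.
		 */
		if (clkdm_active_with_mpu && !has_smart_idle)
			return IDLEMODE_FORCE;
		return swsup_sidle ? IDLEMODE_NO : IDLEMODE_SMART;
	}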
Thanks to Vaibhav Hiremath for providing comments on an earlier version of this patch. Thanks to Tero Kristo for identifying a bug in an earlier version of this patch. Thanks to Benoît Cousson for identifying some bugs in several versions of this patch and for implementation comments. References: 1. Table 16-96 "REG_32KSYNCNT_SYSCONFIG" of the OMAP34xx TRM Rev. ZU (SWPU223U), available from: http://www.ti.com/pdfs/wtbu/OMAP34x_ES3.1.x_PUBLIC_TRM_vzU.zip 2. Table 4-72 "Sleep Dependencies" of the OMAP34xx TRM Rev. ZU (SWPU223U) 3. ibid. Cc: Tony Lindgren Cc: Vaibhav Hiremath Cc: Benoît Cousson Cc: Tero Kristo Tested-by: Kevin Hilman Signed-off-by: Paul Walmsley Signed-off-by: Kevin Hilman --- arch/arm/mach-omap2/clockdomain.h | 4 +++ .../mach-omap2/clockdomains2xxx_3xxx_data.c | 1 + arch/arm/mach-omap2/clockdomains44xx_data.c | 2 +- arch/arm/mach-omap2/omap_hwmod.c | 32 ++++++++++++++----- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/arch/arm/mach-omap2/clockdomain.h b/arch/arm/mach-omap2/clockdomain.h index f7b58609bad8..6227e9505c2d 100644 --- a/arch/arm/mach-omap2/clockdomain.h +++ b/arch/arm/mach-omap2/clockdomain.h @@ -31,12 +31,16 @@ * * CLKDM_NO_AUTODEPS: Prevent "autodeps" from being added/removed from this * clockdomain. (Currently, this applies to OMAP3 clockdomains only.) + * CLKDM_ACTIVE_WITH_MPU: The PRCM guarantees that this clockdomain is + * active whenever the MPU is active. True for interconnects and + * the WKUP clockdomains. */ #define CLKDM_CAN_FORCE_SLEEP (1 << 0) #define CLKDM_CAN_FORCE_WAKEUP (1 << 1) #define CLKDM_CAN_ENABLE_AUTO (1 << 2) #define CLKDM_CAN_DISABLE_AUTO (1 << 3) #define CLKDM_NO_AUTODEPS (1 << 4) +#define CLKDM_ACTIVE_WITH_MPU (1 << 5) #define CLKDM_CAN_HWSUP (CLKDM_CAN_ENABLE_AUTO | CLKDM_CAN_DISABLE_AUTO) #define CLKDM_CAN_SWSUP (CLKDM_CAN_FORCE_SLEEP | CLKDM_CAN_FORCE_WAKEUP) diff --git a/arch/arm/mach-omap2/clockdomains2xxx_3xxx_data.c b/arch/arm/mach-omap2/clockdomains2xxx_3xxx_data.c index 839145e1cfbe..4972219653ce 100644 --- a/arch/arm/mach-omap2/clockdomains2xxx_3xxx_data.c +++ b/arch/arm/mach-omap2/clockdomains2xxx_3xxx_data.c @@ -88,4 +88,5 @@ struct clockdomain wkup_common_clkdm = { .name = "wkup_clkdm", .pwrdm = { .name = "wkup_pwrdm" }, .dep_bit = OMAP_EN_WKUP_SHIFT, + .flags = CLKDM_ACTIVE_WITH_MPU, }; diff --git a/arch/arm/mach-omap2/clockdomains44xx_data.c b/arch/arm/mach-omap2/clockdomains44xx_data.c index c53425847493..7f2133abe7d3 100644 --- a/arch/arm/mach-omap2/clockdomains44xx_data.c +++ b/arch/arm/mach-omap2/clockdomains44xx_data.c @@ -381,7 +381,7 @@ static struct clockdomain l4_wkup_44xx_clkdm = { .cm_inst = OMAP4430_PRM_WKUP_CM_INST, .clkdm_offs = OMAP4430_PRM_WKUP_CM_WKUP_CDOFFS, .dep_bit = OMAP4430_L4WKUP_STATDEP_SHIFT, - .flags = CLKDM_CAN_HWSUP, + .flags = CLKDM_CAN_HWSUP | CLKDM_ACTIVE_WITH_MPU, }; static struct clockdomain emu_sys_44xx_clkdm = { diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 773193670ea2..2d710f50fca2 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -1124,15 +1124,18 @@ static struct omap_hwmod_addr_space * __init _find_mpu_rt_addr_space(struct omap * _enable_sysc - try to bring a module out of idle via OCP_SYSCONFIG * @oh: struct omap_hwmod * * - * If module is marked as SWSUP_SIDLE, force the module out of slave - * idle; otherwise, configure it for smart-idle. If module is marked - * as SWSUP_MSUSPEND, force the module out of master standby; - * otherwise, configure it for smart-standby. No return value. 
+ * Ensure that the OCP_SYSCONFIG register for the IP block represented + * by @oh is set to indicate to the PRCM that the IP block is active. + * Usually this means placing the module into smart-idle mode and + * smart-standby, but if there is a bug in the automatic idle handling + * for the IP block, it may need to be placed into the force-idle or + * no-idle variants of these modes. No return value. */ static void _enable_sysc(struct omap_hwmod *oh) { u8 idlemode, sf; u32 v; + bool clkdm_act; if (!oh->class->sysc) return; @@ -1141,8 +1144,16 @@ static void _enable_sysc(struct omap_hwmod *oh) sf = oh->class->sysc->sysc_flags; if (sf & SYSC_HAS_SIDLEMODE) { - idlemode = (oh->flags & HWMOD_SWSUP_SIDLE) ? - HWMOD_IDLEMODE_NO : HWMOD_IDLEMODE_SMART; + clkdm_act = ((oh->clkdm && + oh->clkdm->flags & CLKDM_ACTIVE_WITH_MPU) || + (oh->_clk && oh->_clk->clkdm && + oh->_clk->clkdm->flags & CLKDM_ACTIVE_WITH_MPU)); + if (clkdm_act && !(oh->class->sysc->idlemodes & + (SIDLE_SMART | SIDLE_SMART_WKUP))) + idlemode = HWMOD_IDLEMODE_FORCE; + else + idlemode = (oh->flags & HWMOD_SWSUP_SIDLE) ? + HWMOD_IDLEMODE_NO : HWMOD_IDLEMODE_SMART; _set_slave_idlemode(oh, idlemode, &v); } @@ -1208,8 +1219,13 @@ static void _idle_sysc(struct omap_hwmod *oh) sf = oh->class->sysc->sysc_flags; if (sf & SYSC_HAS_SIDLEMODE) { - idlemode = (oh->flags & HWMOD_SWSUP_SIDLE) ? - HWMOD_IDLEMODE_FORCE : HWMOD_IDLEMODE_SMART; + /* XXX What about HWMOD_IDLEMODE_SMART_WKUP? */ + if (oh->flags & HWMOD_SWSUP_SIDLE || + !(oh->class->sysc->idlemodes & + (SIDLE_SMART | SIDLE_SMART_WKUP))) + idlemode = HWMOD_IDLEMODE_FORCE; + else + idlemode = HWMOD_IDLEMODE_SMART; _set_slave_idlemode(oh, idlemode, &v); } From 9e9b594661e8dee54955607277c937c3ff3a5f01 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 6 Jul 2012 08:11:43 +0200 Subject: [PATCH 312/612] ALSA: usb-audio: Fix the first PCM interface assignment In the new PCM streaming logic, the interface number is assigned to the USB stream instance (subs->interface) only after the format and rate setup has succeeded, but some code paths still pass subs->interface to helper functions. This leads to initializing with an invalid iface number (-1). This patch replaces the wrong references with the correct ones taken from the target fmt. Signed-off-by: Takashi Iwai --- sound/usb/pcm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 54607f8c4f66..f0ede134a111 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -387,7 +387,7 @@ add_sync_ep: subs->data_endpoint->sync_master = subs->sync_endpoint; } - if ((err = snd_usb_init_pitch(subs->stream->chip, subs->interface, alts, fmt)) < 0) + if ((err = snd_usb_init_pitch(subs->stream->chip, fmt->iface, alts, fmt)) < 0) return err; subs->cur_audiofmt = fmt; @@ -450,7 +450,7 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, struct usb_interface *iface; iface = usb_ifnum_to_if(subs->dev, fmt->iface); alts = &iface->altsetting[fmt->altset_idx]; - ret = snd_usb_init_sample_rate(subs->stream->chip, subs->interface, alts, fmt, rate); + ret = snd_usb_init_sample_rate(subs->stream->chip, fmt->iface, alts, fmt, rate); if (ret < 0) return ret; subs->cur_rate = rate; From 326f418b4c74ab4a6066f38b6144bc6461f9447b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Jun 2012 15:04:57 -0400 Subject: [PATCH 313/612] tracepoint: Use static_key_false(), since static_branch() is deprecated Convert the last user of static_branch() -> static_key_false().
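For illustration, a minimal sketch of the two interfaces (illustrative only, not kernel code; "my_key" and the two helpers are made-up names):

	extern void rare_slow_path(void);	/* made-up helpers */
	extern void common_work(void);

	/* A static key defaults to off; static_key_false() compiles to an
	 * out-of-line branch that is patched live when the key is enabled. */
	static struct static_key my_key = STATIC_KEY_INIT_FALSE;

	void hot_path(void)
	{
		if (static_key_false(&my_key))	/* was: static_branch(&my_key) */
			rare_slow_path();
		common_work();
	}

Callers enable the unlikely branch with static_key_slow_inc(&my_key) and disable it again with static_key_slow_dec(&my_key).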
Signed-off-by: Jason Baron Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/5fffcd40a6c063769badcdd74a7d90980500dbcb.1340909155.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index bd96ecd0e05c..802de56c41e8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) } \ static inline void trace_##name##_rcuidle(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ From 47fbc518a4b5c9a949f7cab8b14a00d3549bf138 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Jun 2012 15:05:02 -0400 Subject: [PATCH 314/612] jump label: Remove static_branch() Remove the obsolete static_branch() interface, since the supported interface is now static_key_false()/true() - which is used by all in-tree code. See commit: c5905afb0e ("static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]()"). Signed-off-by: Jason Baron Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/199332c47eef8005d5a5bf1018a80d25929a5746.1340909155.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index c513a40510f5..0976fc46d1e0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -42,8 +42,7 @@ * allowed. * * Not initializing the key (static data is initialized to 0s anyway) is the - * same as using STATIC_KEY_INIT_FALSE and static_key_false() is - * equivalent with static_branch(). + * same as using STATIC_KEY_INIT_FALSE. * */ @@ -107,12 +106,6 @@ static __always_inline bool static_key_true(struct static_key *key) return !static_key_false(key); } -/* Deprecated. Please use 'static_key_false() instead. */ -static __always_inline bool static_branch(struct static_key *key) -{ - return arch_static_branch(key); -} - extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; @@ -166,14 +159,6 @@ static __always_inline bool static_key_true(struct static_key *key) return false; } -/* Deprecated. Please use 'static_key_false() instead. */ -static __always_inline bool static_branch(struct static_key *key) -{ - if (unlikely(atomic_read(&key->enabled)) > 0) - return true; - return false; -} - static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); From b39f25a849d7677a7dbf183f2483fd41c201a5ce Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:27 -0700 Subject: [PATCH 315/612] x86/apic: Optimize cpu traversal in __assign_irq_vector() using domain membership Currently __assign_irq_vector() goes through each cpu in the specified mask until it finds a free vector in all the cpu's that are part of the same interrupt domain. We visit all the interrupt domain sibling cpus to reserve the free vector. So, when we fail to find a free vector in an interrupt domain, it is safe to continue our search with a cpu belonging to a new interrupt domain. 
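In rough pseudo-C, the optimized traversal looks like the following (a sketch only, not the actual __assign_irq_vector() code; try_all_vectors_in() is a made-up helper standing in for the inner vector scan):

	/* Sketch: cfg->old_domain is re-used as a "visited domains" set. */
	cpumask_clear(cfg->old_domain);
	cpu = cpumask_first_and(mask, cpu_online_mask);
	while (cpu < nr_cpu_ids) {
		apic->vector_allocation_domain(cpu, tmp_mask);
		if (try_all_vectors_in(tmp_mask))	/* made-up helper */
			break;				/* found a free vector */
		/* Whole domain is full: mark it visited and continue with
		 * a cpu from a domain we have not seen yet. */
		cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
		cpumask_andnot(tmp_mask, mask, cfg->old_domain);
		cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
	}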
There is no need to go through each cpu if the domain containing that cpu has already been visited. Use the irq_cfg's old_domain to track the visited domains and optimize the cpu traversal while finding a free vector in the given cpumask. NOTE: We can also optimize the search by using for_each_cpu() and skipping the current cpu if it is not the first cpu in the mask returned by the vector_allocation_domain(). But re-using the cfg->old_domain to track the visited domains will be slightly faster. Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 +++----- arch/x86/kernel/apic/apic_noop.c | 3 +-- arch/x86/kernel/apic/io_apic.c | 15 ++++++++------- arch/x86/kernel/apic/x2apic_cluster.c | 3 +-- arch/x86/kernel/vsmp_64.c | 3 +-- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eec240e12091..8bebeb8952fb 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,7 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -614,7 +614,7 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid); -static inline bool +static inline void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful.
Some cpus do not strictly honor the set of cpus @@ -627,14 +627,12 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) */ cpumask_clear(retmask); cpumask_bits(retmask)[0] = APIC_ALL_CPUS; - return false; } -static inline bool +static inline void default_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_copy(retmask, cpumask_of(cpu)); - return true; } static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 65c07fc630a1..08c337bc49ff 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,12 +100,11 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); cpumask_copy(retmask, cpumask_of(cpu)); - return true; } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a951ef7decb1..8a08f09aa505 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1134,12 +1134,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - for_each_cpu_and(cpu, mask, cpu_online_mask) { + cpumask_clear(cfg->old_domain); + cpu = cpumask_first_and(mask, cpu_online_mask); + while (cpu < nr_cpu_ids) { int new_cpu; int vector, offset; - bool more_domains; - more_domains = apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask); if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); @@ -1156,10 +1157,10 @@ next: } if (unlikely(current_vector == vector)) { - if (more_domains) - continue; - else - break; + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + continue; } if (test_bit(vector, used_vectors)) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 943d03fc6fc4..b5d889b5659a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -212,11 +212,10 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. */ -static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); - return true; } static struct apic apic_x2apic_cluster = { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index fa5adb7c228c..3f0285ac00fa 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,10 +208,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. 
*/ -static bool fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_setall(retmask); - return false; } static void vsmp_apic_post_init(void) From 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:28 -0700 Subject: [PATCH 316/612] x86/apic/x2apic: Limit the vector reservation to the user specified mask For the x2apic cluster mode, vector for an interrupt is currently reserved on all the cpu's that are part of the x2apic cluster. But the interrupts will be routed only to the cluster (derived from the first cpu in the mask) members specified in the mask. So there is no need to reserve the vector in the unused cluster members. Modify __assign_irq_vector() to reserve the vectors based on the user specified irq destination mask. If the new mask is a proper subset of the currently used mask, cleanup the vector allocation on the unused cpu members. Also, allow the apic driver to tune the vector domain based on the affinity mask (which in most cases is the user-specified mask). Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-3-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 9 +++++--- arch/x86/kernel/apic/apic_noop.c | 3 ++- arch/x86/kernel/apic/io_apic.c | 31 +++++++++++++-------------- arch/x86/kernel/apic/x2apic_cluster.c | 6 +++--- arch/x86/kernel/vsmp_64.c | 3 ++- 5 files changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 8bebeb8952fb..88093c1d44fd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,8 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, + const struct cpumask *mask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -615,7 +616,8 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, unsigned int *apicid); static inline void -flat_vector_allocation_domain(int cpu, struct cpumask *retmask) +flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { /* Careful. 
Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -630,7 +632,8 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) } static inline void -default_vector_allocation_domain(int cpu, struct cpumask *retmask) +default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { cpumask_copy(retmask, cpumask_of(cpu)); } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 08c337bc49ff..e145f28b4099 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,7 +100,8 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8a08f09aa505..9684f963befe 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1113,7 +1113,6 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; - unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; @@ -1123,28 +1122,28 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) return -ENOMEM; - old_vector = cfg->vector; - if (old_vector) { - cpumask_and(tmp_mask, mask, cpu_online_mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { - free_cpumask_var(tmp_mask); - return 0; - } - } - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; cpumask_clear(cfg->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { - int new_cpu; - int vector, offset; + int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask, mask); if (cpumask_subset(tmp_mask, cfg->domain)) { - free_cpumask_var(tmp_mask); - return 0; + err = 0; + if (cpumask_equal(tmp_mask, cfg->domain)) + break; + /* + * New cpumask using the vector is a proper subset of + * the current in use mask. So cleanup the vector + * allocation for the members that are not used anymore. + */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = 1; + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; } vector = current_vector; @@ -1172,7 +1171,7 @@ next: /* Found one! */ current_vector = vector; current_offset = offset; - if (old_vector) { + if (cfg->vector) { cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b5d889b5659a..bde78d0098a4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -212,10 +212,10 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. 
*/ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { - cpumask_clear(retmask); - cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } static struct apic apic_x2apic_cluster = { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 3f0285ac00fa..992f890283e9 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,7 +208,8 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. */ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { cpumask_setall(retmask); } From d872818dbbeed1bccf58c7f8c7db432154c802f9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:29 -0700 Subject: [PATCH 317/612] x86/apic/x2apic: Use multiple cluster members for the irq destination only with the explicit affinity During boot or driver load etc, the interrupt destination is set up using the default target cpu's. Later the user (irqbalance etc) or the driver (irq_set_affinity/ irq_set_affinity_hint) can request the interrupt to be migrated to some specific set of cpu's. In the x2apic cluster routing, for the default scenario use a single cpu as the interrupt destination, and when there is an explicit interrupt affinity request, route the interrupt to multiple members of an x2apic cluster specified in the cpumask of the migration request. This will minimize the vector pressure when there are a lot of interrupt sources and relatively few x2apic clusters (for example a single socket server). This will allow the performance-critical interrupts to be routed to multiple cpu's in the x2apic cluster (irqbalance for example uses the cache siblings etc while specifying the interrupt destination) and allow non-critical interrupts to be serviced by a single logical cpu. Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-4-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index bde78d0098a4..c88baa4ff0e5 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -209,13 +209,30 @@ static int x2apic_cluster_probe(void) return 0; } +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ + return cpu_all_mask; +} + /* * Each x2apic cluster is an allocation domain. */ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { - cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); + /* + * To minimize vector pressure, default case of boot, device bringup + * etc will use a single cpu for the interrupt destination. + * + * On explicit migration requests coming from irqbalance etc, + * interrupts will be routed to the x2apic cluster (cluster-id + * derived from the first cpu in the mask) members specified + * in the mask.
+ */ + if (mask == x2apic_cluster_target_cpus()) + cpumask_copy(retmask, cpumask_of(cpu)); + else + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } static struct apic apic_x2apic_cluster = { @@ -229,7 +246,7 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = online_target_cpus, + .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, From 6a6dccba2fdc2a69f1f36b8f1c0acc8598e7221b Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 5 Jul 2012 15:52:23 +0530 Subject: [PATCH 318/612] mm: cma: don't replace lowmem pages with highmem The filesystem layer expects pages in the block device's mapping to not be in highmem (the mapping's gfp mask is set in bdget()), but CMA can currently replace lowmem pages with highmem pages, leading to crashes in filesystem code such as the one below: Unable to handle kernel NULL pointer dereference at virtual address 00000400 pgd = c0c98000 [00000400] *pgd=00c91831, *pte=00000000, *ppte=00000000 Internal error: Oops: 817 [#1] PREEMPT SMP ARM CPU: 0 Not tainted (3.5.0-rc5+ #80) PC is at __memzero+0x24/0x80 ... Process fsstress (pid: 323, stack limit = 0xc0cbc2f0) Backtrace: [] (ext4_getblk+0x0/0x180) from [] (ext4_bread+0x1c/0x98) [] (ext4_bread+0x0/0x98) from [] (ext4_mkdir+0x160/0x3bc) r4:c15337f0 [] (ext4_mkdir+0x0/0x3bc) from [] (vfs_mkdir+0x8c/0x98) [] (vfs_mkdir+0x0/0x98) from [] (sys_mkdirat+0x74/0xac) r6:00000000 r5:c152eb40 r4:000001ff r3:c14b43f0 [] (sys_mkdirat+0x0/0xac) from [] (sys_mkdir+0x20/0x24) r6:beccdcf0 r5:00074000 r4:beccdbbc [] (sys_mkdir+0x0/0x24) from [] (ret_fast_syscall+0x0/0x30) Fix this by replacing only highmem pages with highmem. Reported-by: Laura Abbott Signed-off-by: Rabin Vincent Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- mm/page_alloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44030096da63..4a4f9219683f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5635,7 +5635,12 @@ static struct page * __alloc_contig_migrate_alloc(struct page *page, unsigned long private, int **resultp) { - return alloc_page(GFP_HIGHUSER_MOVABLE); + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); } /* [start, end) must belong to a single zone. */ From cc2caea5b6152b8ce66dc2bbe83dc72b60612da8 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 6 Jul 2012 12:02:04 +0200 Subject: [PATCH 319/612] mm: cma: fix condition check when setting global cma area dev_set_cma_area incorrectly assigned cma to global area on first call due to incorrect check. This patch fixes this issue. 
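Spelled out, the intent is the following (a sketch of the semantics only; the actual one-line fix is in the diff below):

	/* With the old "!dev || !dma_contiguous_default_area" test, a call
	 * that targeted one specific device would still overwrite the
	 * global default area whenever none had been set yet.  The fixed
	 * test touches the global default only when no device is given: */
	if (!dev && !dma_contiguous_default_area)
		dma_contiguous_default_area = cma;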
Signed-off-by: Marek Szyprowski --- include/asm-generic/dma-contiguous.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h index c544356b374b..294b1e755ab2 100644 --- a/include/asm-generic/dma-contiguous.h +++ b/include/asm-generic/dma-contiguous.h @@ -18,7 +18,7 @@ static inline void dev_set_cma_area(struct device *dev, struct cma *cma) { if (dev) dev->cma_area = cma; - if (!dev || !dma_contiguous_default_area) + if (!dev && !dma_contiguous_default_area) dma_contiguous_default_area = cma; } From 95c0d71dcb853c2eca5f2231ebbd4c1d3af775b7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Jul 2012 23:37:31 +0900 Subject: [PATCH 320/612] MAINTAINERS/sched: Update scheduler file pattern The commit 391e43da797a ("sched: Move all scheduler bits into kernel/sched/") moved all the scheduler code to the kernel/sched/ directory, but missed the MAINTAINERS file. Since it still points at the kernel/ directory, the get_maintainer script has to rely on the git (log) fallback mechanism. $ scripts/get_maintainer.pl -f kernel/sched/core.c --nogit-fallback linux-kernel@vger.kernel.org (open list) With this patch: $ scripts/get_maintainer.pl -f kernel/sched/core.c --nogit-fallback Ingo Molnar (maintainer:SCHEDULER) Peter Zijlstra (maintainer:SCHEDULER) linux-kernel@vger.kernel.org (open list) Signed-off-by: Namhyung Kim Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341326251-4140-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 03df1d15ebf3..214d75489640 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5909,7 +5909,7 @@ M: Ingo Molnar M: Peter Zijlstra T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core S: Maintained -F: kernel/sched* +F: kernel/sched/ F: include/linux/sched.h SCORE ARCHITECTURE From c3b7cdf180090d2686239a75bb0ae408108ed749 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 6 Jul 2012 12:59:46 +0300 Subject: [PATCH 321/612] perf/x86: Fix intel_perfmon_event_map formatting Use tabs for "intel_perfmon_event_map" formatting in perf_event_intel.c.
Signed-off-by: Pekka Enberg Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1341568786-7045-1-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5fdedb4bc3f1..1f4c8add6759 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -21,14 +21,14 @@ */ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = From cfca927972e31a5b3da49bf641c525732ff3c357 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Jun 2012 12:54:17 -0700 Subject: [PATCH 322/612] rcu: Introduce check for callback list/count mismatch The recent bug that introduced the RCU callback list/count mismatch showed the need for a diagnostic to check for this, which this commit adds. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 95c7b61e77e4..4154c9567a6d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1612,6 +1612,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); local_irq_restore(flags); From c701d5d9b384ff03ceb232ef21236364d784a411 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2012 08:08:25 -0700 Subject: [PATCH 323/612] rcu: Fix code-style issues involving "else" The Linux kernel coding style says that single-statement blocks should omit curly braces unless the other leg of the "if" statement has multiple statements, in which case the curly braces should be included. This commit fixes RCU's violations of this rule. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 ++++--- kernel/rcutorture.c | 3 ++- kernel/rcutree.c | 7 ++++--- kernel/rcutree_plugin.h | 14 ++++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 116725b5edfb..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -350,8 +350,9 @@ static int rcu_initiate_boost(void) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_callbacks(); - } else + } else { RCU_TRACE(rcu_initiate_boost_trace()); + } return 1; } @@ -778,9 +779,9 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = NULL; /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) + if (!rcu_preempted_readers_exp()) { local_irq_restore(flags); - else { + } else { rcu_initiate_boost(); local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c279ee920947..155fb129b641 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -408,8 +408,9 @@ rcu_torture_cb(struct rcu_head *p) if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; rcu_torture_free(rp); - } else + } else { cur_ops->deferred_free(rp); + } } static int rcu_no_completed(void) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 117218a43724..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -892,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; rdp->passed_quiesce = 0; - } else + } else { rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -2130,9 +2131,9 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_sched(); return; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a9194d5606c4..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -390,8 +390,9 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grphi, !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); - } else + } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); + } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ @@ -757,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) + if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - else { + } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; @@ -803,9 +804,9 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_rcu(); return; } @@ -2093,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_callbacks(cpu)) { trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { trace_rcu_prep_idle("Callbacks drained"); + } } /* From 5cf05ad758c30d17ff23c2be346b5de982bc2121 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 17 May 2012 15:12:45 -0700 Subject: [PATCH 324/612] rcu: Fix broken strings in RCU's source code. Although the C language allows you to break strings across lines, doing this makes it hard for people to find the Linux kernel code corresponding to a given console message. This commit therefore fixes broken strings throughout RCU's source code. Suggested-by: Josh Triplett Suggested-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +-- kernel/rcutorture.c | 33 ++++++++++++++------------------- kernel/rcutree_trace.c | 25 ++++++++++++------------- lib/list_debug.c | 6 ++---- 4 files changed, 29 insertions(+), 38 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c2c0d86dd3ac..115ead2b5155 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -427,8 +427,7 @@ extern int rcu_my_thread_group_empty(void); static inline void rcu_preempt_sleep_check(void) { rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal context switch in RCU read-side " - "critical section"); + "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 155fb129b641..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -49,8 +49,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and " - "Josh Triplett "); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -1200,27 +1199,27 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld " - "barrier: %ld/%ld:%ld", + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts, + n_offline_attempts); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); @@ -1462,8 +1461,7 @@ rcu_torture_shutdown(void *arg) delta = shutdown_time - jiffies_snap; if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", + "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); @@ -1515,8 +1513,7 @@ rcu_torture_onoff(void *arg) if (cpu_down(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", + "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; } @@ 
-1529,8 +1526,7 @@ rcu_torture_onoff(void *arg) if (cpu_up(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", + "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; } @@ -1952,8 +1948,7 @@ rcu_torture_init(void) return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a16ddbd6fdc4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -218,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], @@ -227,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) "B."[!rnp->boost_tasks], convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", (int)(jiffies & 0xffff), (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", rnp->n_balk_blkd_tasks, rnp->n_balk_exp_gp_tasks, rnp->n_balk_boost_tasks, @@ -287,11 +286,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), + (int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); @@ -378,16 +377,16 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending, + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", rdp->n_rp_qs_pending, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, rdp->n_rp_need_fqs, diff --git a/lib/list_debug.c b/lib/list_debug.c index 23a5e031cd8b..c24c2f7e296f 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -87,12 +87,10 @@ void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + "list_add_rcu corruption. next->prev should be prev (%p), but was %p. 
(next=%p).\n", prev, next->prev, next); WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); new->next = next; new->prev = prev; From 096bcc231fd263bc8df215f0d616b08e3696c6db Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 29 May 2012 10:16:09 +0200 Subject: [PATCH 325/612] mtd: mxc_nand: use 32bit copy functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following commit changes the function used to copy from/to the hardware buffer to memcpy_[from|to]io. This does not work since the hardware cannot handle the byte accesses used by these functions. Instead of reverting this patch introduce 32bit correspondents of these functions. | commit 5775ba36ea9c760c2d7e697dac04f2f7fc95aa62 | Author: Uwe Kleine-König | Date: Tue Apr 24 10:05:22 2012 +0200 | | mtd: mxc_nand: fix several sparse warnings about incorrect address space | | Signed-off-by: Uwe Kleine-König | Signed-off-by: Artem Bityutskiy Signed-off-by: Sascha Hauer Cc: Uwe Kleine-König Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/mxc_nand.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index c58e6a93f445..6acc790c2fbb 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -273,6 +273,26 @@ static struct nand_ecclayout nandv2_hw_eccoob_4k = { static const char *part_probes[] = { "RedBoot", "cmdlinepart", "ofpart", NULL }; +static void memcpy32_fromio(void *trg, const void __iomem *src, size_t size) +{ + int i; + u32 *t = trg; + const __iomem u32 *s = src; + + for (i = 0; i < (size >> 2); i++) + *t++ = __raw_readl(s++); +} + +static void memcpy32_toio(void __iomem *trg, const void *src, int size) +{ + int i; + u32 __iomem *t = trg; + const u32 *s = src; + + for (i = 0; i < (size >> 2); i++) + __raw_writel(*s++, t++); +} + static int check_int_v3(struct mxc_nand_host *host) { uint32_t tmp; @@ -519,7 +539,7 @@ static void send_read_id_v3(struct mxc_nand_host *host) wait_op_done(host, true); - memcpy_fromio(host->data_buf, host->main_area0, 16); + memcpy32_fromio(host->data_buf, host->main_area0, 16); } /* Request the NANDFC to perform a read of the NAND device ID. 
*/ @@ -535,7 +555,7 @@ static void send_read_id_v1_v2(struct mxc_nand_host *host) /* Wait for operation to complete */ wait_op_done(host, true); - memcpy_fromio(host->data_buf, host->main_area0, 16); + memcpy32_fromio(host->data_buf, host->main_area0, 16); if (this->options & NAND_BUSWIDTH_16) { /* compress the ID info */ @@ -797,16 +817,16 @@ static void copy_spare(struct mtd_info *mtd, bool bfrom) if (bfrom) { for (i = 0; i < n - 1; i++) - memcpy_fromio(d + i * j, s + i * t, j); + memcpy32_fromio(d + i * j, s + i * t, j); /* the last section */ - memcpy_fromio(d + i * j, s + i * t, mtd->oobsize - i * j); + memcpy32_fromio(d + i * j, s + i * t, mtd->oobsize - i * j); } else { for (i = 0; i < n - 1; i++) - memcpy_toio(&s[i * t], &d[i * j], j); + memcpy32_toio(&s[i * t], &d[i * j], j); /* the last section */ - memcpy_toio(&s[i * t], &d[i * j], mtd->oobsize - i * j); + memcpy32_toio(&s[i * t], &d[i * j], mtd->oobsize - i * j); } } @@ -1070,7 +1090,8 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, host->devtype_data->send_page(mtd, NFC_OUTPUT); - memcpy_fromio(host->data_buf, host->main_area0, mtd->writesize); + memcpy32_fromio(host->data_buf, host->main_area0, + mtd->writesize); copy_spare(mtd, true); break; @@ -1086,7 +1107,7 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, break; case NAND_CMD_PAGEPROG: - memcpy_toio(host->main_area0, host->data_buf, mtd->writesize); + memcpy32_toio(host->main_area0, host->data_buf, mtd->writesize); copy_spare(mtd, false); host->devtype_data->send_page(mtd, NFC_INPUT); host->devtype_data->send_cmd(host, command, true); From 6023813a2d5949ba368e7df464f2ccb649719777 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 26 Jun 2012 17:26:16 +0200 Subject: [PATCH 326/612] mtd: gpmi-nand: fix read page when reading to vmalloced area The gpmi-nand driver uses virt_addr_valid() to check whether a buffer is suitable for dma. If it's not, a driver allocated buffer is used instead. Then after a page read the driver allocated buffer must be copied to the user supplied buffer. This does not happen since commit 7725cc85932bd02dd12c23108e0ef748c551ccba. This patch fixes the issue. The bug is encountered with UBI which uses a vmalloced buffer for the volume table. 
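The underlying bounce-buffer pattern, in a simplified sketch (not the actual gpmi-nand code; do_dma_read_page() is a made-up helper and the structure is condensed for illustration):

	/* vmalloc'ed buffers cannot be handed to the DMA engine directly,
	 * so a driver-owned lowmem buffer is substituted for the transfer
	 * and copied back to the caller's buffer afterwards. */
	void *dma_buf = virt_addr_valid(buf) ? buf : this->payload_virt;

	do_dma_read_page(dma_buf);		/* made-up helper */

	/* The copy-back must run whenever the bounce buffer was used; the
	 * bug left it inside an unrelated conditional block, so callers
	 * with vmalloc'ed buffers got stale data. */
	if (dma_buf != buf)
		memcpy(buf, dma_buf, mtd->writesize);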
Signed-off-by: Sascha Hauer Tested-by: snijsure@grid-net.com Acked-by: Huang Shijie Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index a05b7b444d4f..a6cad5caba78 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -920,12 +920,12 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip, */ memset(chip->oob_poi, ~0, mtd->oobsize); chip->oob_poi[0] = ((uint8_t *) auxiliary_virt)[0]; - - read_page_swap_end(this, buf, mtd->writesize, - this->payload_virt, this->payload_phys, - nfc_geo->payload_size, - payload_virt, payload_phys); } + + read_page_swap_end(this, buf, mtd->writesize, + this->payload_virt, this->payload_phys, + nfc_geo->payload_size, + payload_virt, payload_phys); exit_nfc: return ret; } From 021796b892dcad45743f3196e3eef7222870dd55 Mon Sep 17 00:00:00 2001 From: Mike Dunn Date: Tue, 22 May 2012 11:03:42 -0700 Subject: [PATCH 327/612] mtd: ABI documentation: clarification of bitflip_threshold The -EUCLEAN return value applies to mtd_read_oob() as well as mtd_read(), but only mtd_read() was mentioned in the blurb on bitflip_threshold in the ABI documentation. Signed-off-by: Mike Dunn Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- Documentation/ABI/testing/sysfs-class-mtd | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-mtd b/Documentation/ABI/testing/sysfs-class-mtd index db1ad7e34fc3..938ef71e2035 100644 --- a/Documentation/ABI/testing/sysfs-class-mtd +++ b/Documentation/ABI/testing/sysfs-class-mtd @@ -142,13 +142,14 @@ KernelVersion: 3.4 Contact: linux-mtd@lists.infradead.org Description: This allows the user to examine and adjust the criteria by which - mtd returns -EUCLEAN from mtd_read(). If the maximum number of - bit errors that were corrected on any single region comprising - an ecc step (as reported by the driver) equals or exceeds this - value, -EUCLEAN is returned. Otherwise, absent an error, 0 is - returned. Higher layers (e.g., UBI) use this return code as an - indication that an erase block may be degrading and should be - scrutinized as a candidate for being marked as bad. + mtd returns -EUCLEAN from mtd_read() and mtd_read_oob(). If the + maximum number of bit errors that were corrected on any single + region comprising an ecc step (as reported by the driver) equals + or exceeds this value, -EUCLEAN is returned. Otherwise, absent + an error, 0 is returned. Higher layers (e.g., UBI) use this + return code as an indication that an erase block may be + degrading and should be scrutinized as a candidate for being + marked as bad. The initial value may be specified by the flash device driver. If not, then the default value is ecc_strength. @@ -167,7 +168,7 @@ Description: block degradation, but high enough to avoid the consequences of a persistent return value of -EUCLEAN on devices where sticky bitflips occur. Note that if bitflip_threshold exceeds - ecc_strength, -EUCLEAN is never returned by mtd_read(). + ecc_strength, -EUCLEAN is never returned by the read operations.
From 596fd46268634082314b3af1ded4612e1b7f3f03 Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Wed, 16 May 2012 16:21:52 -0300 Subject: [PATCH 328/612] mtd: nandsim: don't open code a do_div helper We don't need to open code the divide function, just use div_u64 that already exists and do the same job. While this is a straightforward clean up, there is more to that, the real motivation for this. While building on a cross compiling environment in armel, using gcc 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5), I was getting the following build error: ERROR: "__aeabi_uldivmod" [drivers/mtd/nand/nandsim.ko] undefined! After investigating with objdump and hand built assembly version generated with the compiler, I narrowed __aeabi_uldivmod as being generated from the divide function. When nandsim.c is built with -fno-inline-functions-called-once, that happens when CONFIG_DEBUG_SECTION_MISMATCH is enabled, the do_div optimization in arch/arm/include/asm/div64.h doesn't work as expected with the open coded divide function: even if the do_div we are using doesn't have a constant divisor, the compiler still includes the else parts of the optimized do_div macro, and translates the divisions there to use __aeabi_uldivmod, instead of only calling __do_div_asm -> __do_div64 and optimizing/removing everything else out. So to reproduce, gcc 4.6 plus CONFIG_DEBUG_SECTION_MISMATCH=y and CONFIG_MTD_NAND_NANDSIM=m should do it, building on armel. After this change, the compiler does the intended thing even with -fno-inline-functions-called-once, and optimizes out as expected the constant handling in the optimized do_div on arm. As this also avoids a build issue, I'm marking for Stable, as I think is applicable for this case. Signed-off-by: Herton Ronaldo Krzesinski Cc: stable@vger.kernel.org Acked-by: Nicolas Pitre Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/nandsim.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 6cc8fbfabb8e..cf0cd3146817 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -546,12 +546,6 @@ static char *get_partition_name(int i) return kstrdup(buf, GFP_KERNEL); } -static uint64_t divide(uint64_t n, uint32_t d) -{ - do_div(n, d); - return n; -} - /* * Initialize the nandsim structure. 
* @@ -580,7 +574,7 @@ static int init_nandsim(struct mtd_info *mtd) ns->geom.oobsz = mtd->oobsize; ns->geom.secsz = mtd->erasesize; ns->geom.pgszoob = ns->geom.pgsz + ns->geom.oobsz; - ns->geom.pgnum = divide(ns->geom.totsz, ns->geom.pgsz); + ns->geom.pgnum = div_u64(ns->geom.totsz, ns->geom.pgsz); ns->geom.totszoob = ns->geom.totsz + (uint64_t)ns->geom.pgnum * ns->geom.oobsz; ns->geom.secshift = ffs(ns->geom.secsz) - 1; ns->geom.pgshift = chip->page_shift; @@ -921,7 +915,7 @@ static int setup_wear_reporting(struct mtd_info *mtd) if (!rptwear) return 0; - wear_eb_count = divide(mtd->size, mtd->erasesize); + wear_eb_count = div_u64(mtd->size, mtd->erasesize); mem = wear_eb_count * sizeof(unsigned long); if (mem / sizeof(unsigned long) != wear_eb_count) { NS_ERR("Too many erase blocks for wear reporting\n"); From 5c53d819c71c63fdc91f30a59164583f68e2d63a Mon Sep 17 00:00:00 2001 From: liu chuansheng Date: Fri, 6 Jul 2012 09:50:08 -0700 Subject: [PATCH 329/612] printk: replacing the raw_spin_lock/unlock with raw_spin_lock/unlock_irq In devkmsg_read/writev/llseek/poll/open()..., raw_spin_lock/unlock is used, which leaves a potential deadlock: CPU1: thread1 doing the cat /dev/kmsg: raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { If an interrupt arrives on CPU1 while thread1 is running here, and the interrupt handler calls printk(), which also needs logbuf_lock, the CPU deadlocks. So we should use raw_spin_lock/unlock_irq here. Acked-by: Kay Sievers Signed-off-by: liu chuansheng Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685e..12886cd19cd9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -430,20 +430,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -451,7 +451,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } @@ -501,7 +501,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -528,7 +528,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -552,7 +552,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -566,14 +566,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); -
raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -597,10 +597,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; From e3f5a5f27153228569f3396049838e9727dae86e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: [PATCH 330/612] kmsg: escape the backslash character while exporting data Non-printable characters in the log data are hex-escaped to ensure safe post processing. We need to escape a backslash we find in the data, to be able to distinguish it from a backslash we add for the escaping. Also escape the non-printable character 127. Thanks to Miloslav Trmac for the heads up. Reported-by: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 12886cd19cd9..505863aa3a7f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -465,7 +465,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -489,7 +489,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } From 43a73a50b352cd3df25b3ced72033942a6a0f919 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: [PATCH 331/612] kmsg: add the facility number to the syslog prefix After the recent split of facility and level into separate variables, we miss the facility value (always 0 for kernel-originated messages) in the syslog prefix. On Tue, Jul 3, 2012 at 12:45 PM, Dan Carpenter wrote: > Static checkers complain about the impossible condition here. > > In 084681d14e ('printk: flush continuation lines immediately to > console'), we changed msg->level from being a u16 to being an unsigned > 3 bit bitfield. 
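For reference, the prefix encoding this change restores (a sketch only; syslog_prefix() is a made-up name, not a kernel function):

	/* The syslog "<N>" prefix is facility * 8 + severity, so a kernel
	 * message (facility 0) at level 6 prints as "<6>", while e.g. the
	 * daemon facility (3) at level 6 would print as "<30>". */
	static unsigned int syslog_prefix(unsigned int facility,
					  unsigned int level)
	{
		return (facility << 3) | (level & 7);
	}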
Cc: Dan Carpenter Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 505863aa3a7f..37cde752cb8a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -818,15 +818,18 @@ static size_t print_time(u64 ts, char *buf) static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } From cb424ffe9f45ad80267f2a98fbd9bf21caa0ce22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: [PATCH 332/612] kmsg: properly handle concurrent non-blocking read() from /proc/kmsg The /proc/kmsg read() interface is internally simply wired up to a sequence of syslog() syscalls, which may be racy between their checks and actions with regard to concurrency. In the (very uncommon) case of concurrent readers of /dev/kmsg, relying on usual O_NONBLOCK behavior, the recently introduced mutex might block an O_NONBLOCK reader in read(), when poll() returns for it, but another process has already read the data in the meantime. We've seen that while running artificial test setups and tools that "fight" about /proc/kmsg data. This restores the original /proc/kmsg behavior, where in case of concurrent read()s, poll() might wake up but the read() syscall will just return 0 to the caller, while another process has "stolen" the data. This is in the general case not the expected behavior, but it is the exact same one that can easily be triggered with a 3.4 kernel, and some tools might just rely on it. The mutex is not needed; the original integrity issue which introduced it is in the meantime covered by: "fill buffer with more than a single message for SYSLOG_ACTION_READ" 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Cc: Yuanhan Liu Acked-by: Jan Beulich Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 37cde752cb8a..be9a82b2f0b3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1021,7 +1021,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; - static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1048,17 +1047,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - error = mutex_lock_interruptible(&syslog_mutex); - if (error) - goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) { - mutex_unlock(&syslog_mutex); + if (error) goto out; - } error = syslog_print(buf, len); - mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: From 68b6507dc554ba015b5ed5e13b1ed4993cdf4024 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: [PATCH 333/612] kmsg: make sure all messages reach a newly registered boot console We suppress printing kmsg records to the console when they have already been printed immediately as their fragments were received.
Newly registered boot consoles print the entire kmsg buffer during registration. Clear the console-suppress flag after we skipped the record during its first storage, so any later print will see these records as usual. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/printk.c b/kernel/printk.c index be9a82b2f0b3..f02f1f5ddc30 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1953,6 +1953,12 @@ skip: */ console_idx = log_next(console_idx); console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. + */ + msg->flags &= ~LOG_NOCONS; goto skip; } From 84774e615749acfce6b2f679b95924dffb5f5de1 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 5 Jul 2012 15:15:36 +0100 Subject: [PATCH 334/612] of: address: Don't fail a lookup just because a node has no reg property Sometimes it doesn't make any sense for a node to have an address. In this case device lookup will always be unsuccessful because we currently assume every node will have a reg property. This patch changes the semantics so that the resource address and the lookup address will only be compared if one exists. Things like AUXDATA() rely on of_dev_lookup to return the lookup entry of a particular device in order to do things like apply platform_data to a device. However, this is currently broken for nodes which do not have a reg property, meaning that platform_data can not be passed in those cases. Acked-by: Rob Herring Acked-by: Arnd Bergmann Signed-off-by: Lee Jones --- drivers/of/platform.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 343ad29e211c..96004807dd28 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -317,10 +317,9 @@ static const struct of_dev_auxdata *of_dev_lookup(const struct of_dev_auxdata *l for(; lookup->compatible != NULL; lookup++) { if (!of_device_is_compatible(np, lookup->compatible)) continue; - if (of_address_to_resource(np, 0, &res)) - continue; - if (res.start != lookup->phys_addr) - continue; + if (!of_address_to_resource(np, 0, &res)) + if (res.start != lookup->phys_addr) + continue; pr_debug("%s: devname=%s\n", np->full_name, lookup->name); return lookup; } From c57920e6c23350b08c1b05606aafe356ebc6d8cc Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 15 Jun 2012 09:27:54 +0100 Subject: [PATCH 335/612] ARM: ux500: Fix build errors/warnings when MACH_UX500_DT is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MACH_UX500_DT and all related Device Tree configurations are forced off the warning and error below prevent the kernel from compiling. This simple patch fixes both issues and allows for full build and boot of ST-Ericsson's low-cost development board, Snowball. 
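The board-file half of the fix follows a common pattern, sketched here with the real config symbol but skeleton data: definitions referenced only from Device Tree code move under the same #ifdef as their single user, so they no longer trigger warnings in non-DT builds.

    struct platform_device;        /* opaque in this sketch */

    #ifdef CONFIG_MACH_UX500_DT
    /*
     * Referenced only from the DT machine description, so it must live
     * under the same guard; otherwise gcc warns "defined but not used"
     * whenever the option is forced off.
     */
    static struct platform_device *snowball_of_platform_devs[] = {
            0, /* &snowball_led_dev, &snowball_key_dev, ... */
    };
    #endif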
Warnings fixed: arch/arm/mach-ux500/board-mop500.c:680:32: warning: ‘snowball_of_platform_devs’ defined but not used Errors fixed: arch/arm/mach-ux500/timer.c: In function ‘ux500_timer_init’: arch/arm/mach-ux500/timer.c:66:3: error: implicit declaration of function ‘of_find_matching_node’ arch/arm/mach-ux500/timer.c:66:6: warning: assignment makes pointer from integer without a cast Cc: stable@vger.kernel.org Acked-by: Linus Walleij Signed-off-by: Lee Jones --- arch/arm/mach-ux500/board-mop500.c | 10 +++++----- arch/arm/mach-ux500/timer.c | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c index 1509a3cb5833..461012a16e74 100644 --- a/arch/arm/mach-ux500/board-mop500.c +++ b/arch/arm/mach-ux500/board-mop500.c @@ -625,11 +625,6 @@ static struct platform_device *snowball_platform_devs[] __initdata = { &ab8500_device, }; -static struct platform_device *snowball_of_platform_devs[] __initdata = { - &snowball_led_dev, - &snowball_key_dev, -}; - static void __init mop500_init_machine(void) { struct device *parent = NULL; @@ -769,6 +764,11 @@ MACHINE_END #ifdef CONFIG_MACH_UX500_DT +static struct platform_device *snowball_of_platform_devs[] __initdata = { + &snowball_led_dev, + &snowball_key_dev, +}; + struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = { /* Requires DMA and call-back bindings. */ OF_DEV_AUXDATA("arm,pl011", 0x80120000, "uart0", &uart0_plat), diff --git a/arch/arm/mach-ux500/timer.c b/arch/arm/mach-ux500/timer.c index 741e71feca78..66e7f00884ab 100644 --- a/arch/arm/mach-ux500/timer.c +++ b/arch/arm/mach-ux500/timer.c @@ -63,8 +63,10 @@ static void __init ux500_timer_init(void) /* TODO: Once MTU has been DT:ed place code above into else. */ if (of_have_populated_dt()) { +#ifdef CONFIG_OF np = of_find_matching_node(NULL, prcmu_timer_of_match); if (!np) +#endif goto dt_fail; tmp_base = of_iomap(np, 0); From 2b667a2d8005e7c8362a77365cedbcd71fa5d6c1 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 6 Jul 2012 15:59:01 +0100 Subject: [PATCH 336/612] ARM: ux500: Over-ride the DT device naming scheme for pinctrl When pin control mapping tables are written the registered device name is supplied for use in name-based searches within the pinctrl driver. In the case of the DB8500 the string "pinctrl-db8500" is used. However, when we register the driver with Device Tree, its naming convention uses something that looks more like "pinctrl.2". To work around the device naming inconsistencies between devices registered via platform code and the ones registered by Device Tree, we use AUXDATA to over-ride the Device Tree naming scheme. Acked-by: Linus Walleij Signed-off-by: Lee Jones --- arch/arm/mach-ux500/board-mop500.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c index 461012a16e74..4fd93f5c49ec 100644 --- a/arch/arm/mach-ux500/board-mop500.c +++ b/arch/arm/mach-ux500/board-mop500.c @@ -786,6 +786,8 @@ struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = { OF_DEV_AUXDATA("st,nomadik-gpio", 0x8011e000, "gpio.6", NULL), OF_DEV_AUXDATA("st,nomadik-gpio", 0x8011e080, "gpio.7", NULL), OF_DEV_AUXDATA("st,nomadik-gpio", 0xa03fe000, "gpio.8", NULL), + /* Requires device name bindings. 
*/ + OF_DEV_AUXDATA("stericsson,nmk_pinctrl", 0, "pinctrl-db8500", NULL), {}, }; From 873e9f7a3b8a5ca36085437da364654bfe3e5974 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 6 Jul 2012 17:20:05 +0900 Subject: [PATCH 337/612] ARM: shmobile: fix platsmp.c build when ARCH_SH73A0=n Fix build error in the case of SMP=y but ARCH_SH73A0=n introduced by: 9601e87 ARM: shmobile: fix smp build The use of of_machine_is_compatible() will link in the SoC-specific symbols: "sh73a0_get_core_count", "sh73a0_smp_prepare_cpus", "sh73a0_secondary_init" and "sh73a0_boot_secondary". This patch adds an ugly #ifdef wrapper as a stop-gap solution. Signed-off-by: Magnus Damm Tested-by: Simon Horman Acked-by: "Rafael J. Wysocki" Signed-off-by: Arnd Bergmann --- arch/arm/mach-shmobile/platsmp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/mach-shmobile/platsmp.c b/arch/arm/mach-shmobile/platsmp.c index e859fcdb3d58..fde0d23121dc 100644 --- a/arch/arm/mach-shmobile/platsmp.c +++ b/arch/arm/mach-shmobile/platsmp.c @@ -22,8 +22,13 @@ #include #include +#ifdef CONFIG_ARCH_SH73A0 #define is_sh73a0() (machine_is_ag5evm() || machine_is_kota2() || \ of_machine_is_compatible("renesas,sh73a0")) +#else +#define is_sh73a0() (0) +#endif + #define is_r8a7779() machine_is_marzen() #ifdef CONFIG_ARCH_EMEV2 From c6a4ebb9ae30ead7684bce623955f74b17df496d Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 6 Jul 2012 23:56:00 +0200 Subject: [PATCH 338/612] MIPS: Provide a symbol for the legacy performance counter interrupt. Based on https://patchwork.linux-mips.org/patch/3576 - but this really deserves its own patchset and the symbol should also be used :) Signed-off-by: Ralf Baechle --- arch/mips/include/asm/irq.h | 1 + arch/mips/kernel/traps.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index fb698dc09bc9..78dbb8a86da2 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -136,6 +136,7 @@ extern void free_irqno(unsigned int irq); * IE7. Since R2 their number has to be read from the c0_intctl register. */ #define CP0_LEGACY_COMPARE_IRQ 7 +#define CP0_LEGACY_PERFCNT_IRQ 7 extern int cp0_compare_irq; extern int cp0_compare_irq_shift; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 2d0c2a277f52..f985b7292cd9 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1597,7 +1597,7 @@ void __cpuinit per_cpu_trap_init(bool is_boot_cpu) cp0_perfcount_irq = -1; } else { cp0_compare_irq = CP0_LEGACY_COMPARE_IRQ; - cp0_compare_irq_shift = cp0_compare_irq; + cp0_compare_irq_shift = CP0_LEGACY_PERFCNT_IRQ; cp0_perfcount_irq = -1; } From f0b77f2c0eed1e37c96b9a995d5a45e9eb4aaca8 Mon Sep 17 00:00:00 2001 From: "Steven J. Hill" Date: Fri, 6 Jul 2012 23:56:00 +0200 Subject: [PATCH 339/612] MIPS: Clean-up GIC and vectored interrupts. This change adds macros for routing of GIC interrupts for EIC and non-EIC hardware modes. Also added Malta GIC macros having to do with performance and timer interrupts. Signed-off-by: Steven J.
Hill Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3576/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/gic.h | 15 ++++++++++++++- arch/mips/include/asm/mips-boards/maltaint.h | 10 ++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/gic.h b/arch/mips/include/asm/gic.h index 86548da650e7..991b659e2548 100644 --- a/arch/mips/include/asm/gic.h +++ b/arch/mips/include/asm/gic.h @@ -206,7 +206,7 @@ #define GIC_VPE_EIC_SHADOW_SET_BASE 0x0100 #define GIC_VPE_EIC_SS(intr) \ - (GIC_EIC_SHADOW_SET_BASE + (4 * intr)) + (GIC_VPE_EIC_SHADOW_SET_BASE + (4 * intr)) #define GIC_VPE_EIC_VEC_BASE 0x0800 #define GIC_VPE_EIC_VEC(intr) \ @@ -330,6 +330,17 @@ struct gic_intr_map { #define GIC_FLAG_TRANSPARENT 0x02 }; +/* + * This is only used in EIC mode. This helps to figure out which + * shared interrupts we need to process when we get a vector interrupt. + */ +#define GIC_MAX_SHARED_INTR 0x5 +struct gic_shared_intr_map { + unsigned int num_shared_intr; + unsigned int intr_list[GIC_MAX_SHARED_INTR]; + unsigned int local_intr_mask; +}; + extern void gic_init(unsigned long gic_base_addr, unsigned long gic_addrspace_size, struct gic_intr_map *intrmap, unsigned int intrmap_size, unsigned int irqbase); @@ -338,5 +349,7 @@ extern unsigned int gic_get_int(void); extern void gic_send_ipi(unsigned int intr); extern unsigned int plat_ipi_call_int_xlate(unsigned int); extern unsigned int plat_ipi_resched_int_xlate(unsigned int); +extern void gic_bind_eic_interrupt(int irq, int set); +extern unsigned int gic_get_timer_pending(void); #endif /* _ASM_GICREGS_H */ diff --git a/arch/mips/include/asm/mips-boards/maltaint.h b/arch/mips/include/asm/mips-boards/maltaint.h index d11aa02a956a..5447d9fc4219 100644 --- a/arch/mips/include/asm/mips-boards/maltaint.h +++ b/arch/mips/include/asm/mips-boards/maltaint.h @@ -86,6 +86,16 @@ #define GIC_CPU_INT4 4 /* . */ #define GIC_CPU_INT5 5 /* Core Interrupt 5 */ +/* MALTA GIC local interrupts */ +#define GIC_INT_TMR (GIC_CPU_INT5) +#define GIC_INT_PERFCTR (GIC_CPU_INT5) + +/* GIC constants */ +/* Add 2 to convert non-eic hw int # to eic vector # */ +#define GIC_CPU_TO_VEC_OFFSET (2) +/* If we map an intr to pin X, GIC will actually generate vector X+1 */ +#define GIC_PIN_TO_VEC_OFFSET (1) + #define GIC_EXT_INTR(x) x /* External Interrupts used for IPI */ From 839efb4ffbfba74857e3411413f7ca53c8ff1925 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 6 Jul 2012 23:56:00 +0200 Subject: [PATCH 340/612] MIPS: MT: Fix indentation damage. Split off from https://patchwork.linux-mips.org/patch/3603/. Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mipsmtregs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mipsmtregs.h b/arch/mips/include/asm/mipsmtregs.h index c9420aa97e32..e71ff4c317f2 100644 --- a/arch/mips/include/asm/mipsmtregs.h +++ b/arch/mips/include/asm/mipsmtregs.h @@ -48,7 +48,7 @@ #define CP0_VPECONF0 $1, 2 #define CP0_VPECONF1 $1, 3 #define CP0_YQMASK $1, 4 -#define CP0_VPESCHEDULE $1, 5 +#define CP0_VPESCHEDULE $1, 5 #define CP0_VPESCHEFBK $1, 6 #define CP0_TCSTATUS $2, 1 #define CP0_TCBIND $2, 2 From 113c62d9844d9037508fa156e47db1b5407a27c3 Mon Sep 17 00:00:00 2001 From: "Steven J. Hill" Date: Fri, 6 Jul 2012 23:56:00 +0200 Subject: [PATCH 341/612] MIPS: Add support for the M14Kc core. [ralf@linux-mips.org: Fixed whitespace damage.] Signed-off-by: Steven J. 
Hill Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3773/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cpu.h | 5 +++-- arch/mips/kernel/cpu-probe.c | 7 ++++++- arch/mips/mm/c-r4k.c | 1 + arch/mips/mm/tlbex.c | 2 ++ arch/mips/oprofile/common.c | 1 + arch/mips/oprofile/op_model_mipsxx.c | 4 ++++ 6 files changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index f9fa2a479dd0..c64910586b74 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -94,6 +94,7 @@ #define PRID_IMP_24KE 0x9600 #define PRID_IMP_74K 0x9700 #define PRID_IMP_1004K 0x9900 +#define PRID_IMP_M14KC 0x9c00 /* * These are the PRID's for when 23:16 == PRID_COMP_SIBYTE @@ -260,7 +261,7 @@ enum cpu_type_enum { */ CPU_4KC, CPU_4KEC, CPU_4KSC, CPU_24K, CPU_34K, CPU_1004K, CPU_74K, CPU_ALCHEMY, CPU_PR4450, CPU_BMIPS32, CPU_BMIPS3300, CPU_BMIPS4350, - CPU_BMIPS4380, CPU_BMIPS5000, CPU_JZRISC, + CPU_BMIPS4380, CPU_BMIPS5000, CPU_JZRISC, CPU_M14KC, /* * MIPS64 class processors @@ -288,7 +289,7 @@ enum cpu_type_enum { #define MIPS_CPU_ISA_M64R2 0x00000100 #define MIPS_CPU_ISA_32BIT (MIPS_CPU_ISA_I | MIPS_CPU_ISA_II | \ - MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M32R2 ) + MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M32R2) #define MIPS_CPU_ISA_64BIT (MIPS_CPU_ISA_III | MIPS_CPU_ISA_IV | \ MIPS_CPU_ISA_V | MIPS_CPU_ISA_M64R1 | MIPS_CPU_ISA_M64R2) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 6ae7ce4ac63e..aaf39f3eaa51 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -4,7 +4,7 @@ * Copyright (C) xxxx the Anonymous * Copyright (C) 1994 - 2006 Ralf Baechle * Copyright (C) 2003, 2004 Maciej W. Rozycki - * Copyright (C) 2001, 2004 MIPS Inc. + * Copyright (C) 2001, 2004, 2011, 2012 MIPS Technologies, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -199,6 +199,7 @@ void __init check_wait(void) cpu_wait = rm7k_wait_irqoff; break; + case CPU_M14KC: case CPU_24K: case CPU_34K: case CPU_1004K: @@ -831,6 +832,10 @@ static inline void cpu_probe_mips(struct cpuinfo_mips *c, unsigned int cpu) c->cputype = CPU_74K; __cpu_name[cpu] = "MIPS 74Kc"; break; + case PRID_IMP_M14KC: + c->cputype = CPU_M14KC; + __cpu_name[cpu] = "MIPS M14Kc"; + break; case PRID_IMP_1004K: c->cputype = CPU_1004K; __cpu_name[cpu] = "MIPS 1004Kc"; diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 5109be96d98d..e56efd059189 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1051,6 +1051,7 @@ static void __cpuinit probe_pcache(void) case CPU_R14000: break; + case CPU_M14KC: case CPU_24K: case CPU_34K: case CPU_74K: diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 0bc485b3cd60..03eb0ef91580 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -9,6 +9,7 @@ * Copyright (C) 2005, 2007, 2008, 2009 Maciej W. Rozycki * Copyright (C) 2006 Ralf Baechle (ralf@linux-mips.org) * Copyright (C) 2008, 2009 Cavium Networks, Inc. + * Copyright (C) 2011 MIPS Technologies, Inc. * * ... and the days got worse and worse and now you see * I've gone completly out of my mind. 
@@ -494,6 +495,7 @@ static void __cpuinit build_tlb_write_entry(u32 **p, struct uasm_label **l, case CPU_R14000: case CPU_4KC: case CPU_4KEC: + case CPU_M14KC: case CPU_SB1: case CPU_SB1A: case CPU_4KSC: diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index d1f2d4c52d42..b6e378211a2c 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -78,6 +78,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) switch (current_cpu_type()) { case CPU_5KC: + case CPU_M14KC: case CPU_20KC: case CPU_24K: case CPU_25KF: diff --git a/arch/mips/oprofile/op_model_mipsxx.c b/arch/mips/oprofile/op_model_mipsxx.c index baba3bcaa3c2..4d80a856048d 100644 --- a/arch/mips/oprofile/op_model_mipsxx.c +++ b/arch/mips/oprofile/op_model_mipsxx.c @@ -322,6 +322,10 @@ static int __init mipsxx_init(void) op_model_mipsxx_ops.num_counters = counters; switch (current_cpu_type()) { + case CPU_M14KC: + op_model_mipsxx_ops.cpu_type = "mips/M14Kc"; + break; + case CPU_20KC: op_model_mipsxx_ops.cpu_type = "mips/20K"; break; From 82163edcdfa4eb3d74516cc8e9f38dd3d039b67d Mon Sep 17 00:00:00 2001 From: Santosh Nayak Date: Sat, 23 Jun 2012 07:59:54 -0300 Subject: [PATCH 342/612] [media] dvb-core: Release semaphore on error path dvb_register_device() There is a missing "up_write()" here. The semaphore should be released before returning an error value. Cc: stable@kernel.org Signed-off-by: Santosh Nayak Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb/dvb-core/dvbdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c index 00a67326c193..39eab73b01ae 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.c +++ b/drivers/media/dvb/dvb-core/dvbdev.c @@ -243,6 +243,7 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev, if (minor == MAX_DVB_MINORS) { kfree(dvbdevfops); kfree(dvbdev); + up_write(&minor_rwsem); mutex_unlock(&dvbdev_register_lock); return -EINVAL; } From 4dab0e5fe8ceaaf267003b5d7aa0c31c1092bee0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Jul 2012 22:52:34 -0300 Subject: [PATCH 343/612] [media] em28xx: fix em28xx-rc load The logic that checks if a device has remote control is wrong. Due to that, the em28xx RC module is not loaded by default. Fix the logic in order to make it work properly. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/em28xx/em28xx-cards.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/em28xx/em28xx-cards.c b/drivers/media/video/em28xx/em28xx-cards.c index 92da7c28b6f0..862c6575c557 100644 --- a/drivers/media/video/em28xx/em28xx-cards.c +++ b/drivers/media/video/em28xx/em28xx-cards.c @@ -2893,7 +2893,7 @@ static void request_module_async(struct work_struct *work) if (dev->board.has_dvb) request_module("em28xx-dvb"); - if (dev->board.has_ir_i2c && !disable_ir) + if (dev->board.ir_codes && !disable_ir) request_module("em28xx-rc"); } From a7deca6fa79d5c65575532e780f3c93f6bf8ddad Mon Sep 17 00:00:00 2001 From: David Dillow Date: Mon, 18 Jun 2012 00:15:21 -0300 Subject: [PATCH 344/612] [media] cx231xx: don't DMA to random addresses Commit 7a6f6c29d264cdd2fe0eb3d923217eed5f0ad134 (cx231xx: use URB_NO_TRANSFER_DMA_MAP) was intended to avoid mapping the DMA buffer for a URB twice. This works for the URBs allocated with usb_alloc_urb(), as those are allocated from coherent DMA pools, but the flag was also added for the VBI and audio URBs, which have a manually allocated area.
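For reference, the two valid ways to hand usbcore a transfer buffer, as a sketch (standard kernel USB API; error unwinding trimmed) — the series effectively moves the VBI and audio URBs from the first form, whose requirements they never met, to the second:

    #include <linux/usb.h>
    #include <linux/slab.h>

    static int prepare_urb(struct usb_device *udev, struct urb *urb,
                           size_t size, bool coherent)
    {
            if (coherent) {
                    /* Driver supplies transfer_dma; core must not remap. */
                    urb->transfer_buffer = usb_alloc_coherent(udev, size,
                                            GFP_KERNEL, &urb->transfer_dma);
                    if (!urb->transfer_buffer)
                            return -ENOMEM;
                    urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
            } else {
                    /* Plain buffer: leave the flag clear and let usbcore
                     * map and unmap it around each submission. */
                    urb->transfer_buffer = kmalloc(size, GFP_KERNEL);
                    if (!urb->transfer_buffer)
                            return -ENOMEM;
                    urb->transfer_flags &= ~URB_NO_TRANSFER_DMA_MAP;
            }
            urb->transfer_buffer_length = size;
            return 0;
    }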
This leaves the random trash in the structure after allocation as the DMA address, corrupting memory and preventing VBI and audio from working. Letting the USB core map the buffers solves the problem. Cc: stable@kernel.org Signed-off-by: David Dillow Cc: Sri Deevi Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx231xx/cx231xx-audio.c | 4 ++-- drivers/media/video/cx231xx/cx231xx-vbi.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/video/cx231xx/cx231xx-audio.c b/drivers/media/video/cx231xx/cx231xx-audio.c index 068f78dc5d13..b4c99c7270cf 100644 --- a/drivers/media/video/cx231xx/cx231xx-audio.c +++ b/drivers/media/video/cx231xx/cx231xx-audio.c @@ -307,7 +307,7 @@ static int cx231xx_init_audio_isoc(struct cx231xx *dev) urb->context = dev; urb->pipe = usb_rcvisocpipe(dev->udev, dev->adev.end_point_addr); - urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; + urb->transfer_flags = URB_ISO_ASAP; urb->transfer_buffer = dev->adev.transfer_buffer[i]; urb->interval = 1; urb->complete = cx231xx_audio_isocirq; @@ -368,7 +368,7 @@ static int cx231xx_init_audio_bulk(struct cx231xx *dev) urb->context = dev; urb->pipe = usb_rcvbulkpipe(dev->udev, dev->adev.end_point_addr); - urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; + urb->transfer_flags = 0; urb->transfer_buffer = dev->adev.transfer_buffer[i]; urb->complete = cx231xx_audio_bulkirq; urb->transfer_buffer_length = sb_size; diff --git a/drivers/media/video/cx231xx/cx231xx-vbi.c b/drivers/media/video/cx231xx/cx231xx-vbi.c index 3d15314e1f88..ac7db52f404f 100644 --- a/drivers/media/video/cx231xx/cx231xx-vbi.c +++ b/drivers/media/video/cx231xx/cx231xx-vbi.c @@ -448,7 +448,7 @@ int cx231xx_init_vbi_isoc(struct cx231xx *dev, int max_packets, return -ENOMEM; } dev->vbi_mode.bulk_ctl.urb[i] = urb; - urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; + urb->transfer_flags = 0; dev->vbi_mode.bulk_ctl.transfer_buffer[i] = kzalloc(sb_size, GFP_KERNEL); From a65c3262a766311d4704a9ea29354480c5e8590d Mon Sep 17 00:00:00 2001 From: Kamil Debski Date: Tue, 26 Jun 2012 04:44:40 -0300 Subject: [PATCH 345/612] [media] s5p-mfc: Fixed setup of custom controls in decoder and encoder Fixed bugs in functions that initialize custom controls. 
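The bug class behind both fixes, reduced to plain C (hypothetical structure, not the driver's): a stack config struct filled field-by-field inside a loop silently carries stale fields from the previous iteration unless it is zeroed each time around.

    #include <string.h>

    struct ctrl_cfg {
            int id, min, max, step, def;
    };

    static void setup_ctrls(struct ctrl_cfg *out, const int *ids, int n)
    {
            struct ctrl_cfg cfg;
            int i;

            for (i = 0; i < n; i++) {
                    /* Without this, any field not assigned below keeps
                     * whatever the previous iteration left in it. */
                    memset(&cfg, 0, sizeof(cfg));
                    cfg.id  = ids[i];
                    cfg.min = 0;
                    cfg.max = 255;
                    out[i]  = cfg;
            }
    }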
Signed-off-by: Kamil Debski Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-mfc/s5p_mfc_dec.c | 1 + drivers/media/video/s5p-mfc/s5p_mfc_enc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/media/video/s5p-mfc/s5p_mfc_dec.c b/drivers/media/video/s5p-mfc/s5p_mfc_dec.c index 4dd32fc8fd82..feea867f318c 100644 --- a/drivers/media/video/s5p-mfc/s5p_mfc_dec.c +++ b/drivers/media/video/s5p-mfc/s5p_mfc_dec.c @@ -996,6 +996,7 @@ int s5p_mfc_dec_ctrls_setup(struct s5p_mfc_ctx *ctx) for (i = 0; i < NUM_CTRLS; i++) { if (IS_MFC51_PRIV(controls[i].id)) { + memset(&cfg, 0, sizeof(struct v4l2_ctrl_config)); cfg.ops = &s5p_mfc_dec_ctrl_ops; cfg.id = controls[i].id; cfg.min = controls[i].minimum; diff --git a/drivers/media/video/s5p-mfc/s5p_mfc_enc.c b/drivers/media/video/s5p-mfc/s5p_mfc_enc.c index 03d83340e7fb..158b78989b89 100644 --- a/drivers/media/video/s5p-mfc/s5p_mfc_enc.c +++ b/drivers/media/video/s5p-mfc/s5p_mfc_enc.c @@ -1773,6 +1773,7 @@ int s5p_mfc_enc_ctrls_setup(struct s5p_mfc_ctx *ctx) } for (i = 0; i < NUM_CTRLS; i++) { if (IS_MFC51_PRIV(controls[i].id)) { + memset(&cfg, 0, sizeof(struct v4l2_ctrl_config)); cfg.ops = &s5p_mfc_enc_ctrl_ops; cfg.id = controls[i].id; cfg.min = controls[i].minimum; From ba50e7e16b86e25048290b682f7a4e09e069d321 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:09 -0300 Subject: [PATCH 346/612] [media] cx25840: fix regression in HVR-1800 analog support The refactoring of the cx25840 driver to support the cx23888 caused breakage with the existing support for cx23885/cx23887 analog support. Rework the routines such that the new code is only used for the 888. Validated with the following boards: HVR-1800 retail (0070:7801) HVR-1800 OEM (0070:7809) HVR_1850 retail (0070:8541) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. Reported-by: Jonathan Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx25840/cx25840-core.c | 23 ++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index fc1ff69cffd0..a82b7045f8c9 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -84,7 +84,7 @@ MODULE_PARM_DESC(debug, "Debugging messages [0=Off (default) 1=On]"); /* ----------------------------------------------------------------------- */ -static void cx23885_std_setup(struct i2c_client *client); +static void cx23888_std_setup(struct i2c_client *client); int cx25840_write(struct i2c_client *client, u16 addr, u8 value) { @@ -638,10 +638,13 @@ static void cx23885_initialize(struct i2c_client *client) finish_wait(&state->fw_wait, &wait); destroy_workqueue(q); - /* Call the cx23885 specific std setup func, we no longer rely on + /* Call the cx23888 specific std setup func, we no longer rely on * the generic cx24840 func. 
*/ - cx23885_std_setup(client); + if (is_cx23888(state)) + cx23888_std_setup(client); + else + cx25840_std_setup(client); /* (re)set input */ set_input(client, state->vid_input, state->aud_input); @@ -1298,8 +1301,8 @@ static int set_v4lstd(struct i2c_client *client) } cx25840_and_or(client, 0x400, ~0xf, fmt); cx25840_and_or(client, 0x403, ~0x3, pal_m); - if (is_cx2388x(state)) - cx23885_std_setup(client); + if (is_cx23888(state)) + cx23888_std_setup(client); else cx25840_std_setup(client); if (!is_cx2583x(state)) @@ -1782,8 +1785,8 @@ static int cx25840_s_video_routing(struct v4l2_subdev *sd, struct cx25840_state *state = to_state(sd); struct i2c_client *client = v4l2_get_subdevdata(sd); - if (is_cx2388x(state)) - cx23885_std_setup(client); + if (is_cx23888(state)) + cx23888_std_setup(client); return set_input(client, input, state->aud_input); } @@ -1794,8 +1797,8 @@ static int cx25840_s_audio_routing(struct v4l2_subdev *sd, struct cx25840_state *state = to_state(sd); struct i2c_client *client = v4l2_get_subdevdata(sd); - if (is_cx2388x(state)) - cx23885_std_setup(client); + if (is_cx23888(state)) + cx23888_std_setup(client); return set_input(client, state->vid_input, input); } @@ -4939,7 +4942,7 @@ void cx23885_dif_setup(struct i2c_client *client, u32 ifHz) } } -static void cx23885_std_setup(struct i2c_client *client) +static void cx23888_std_setup(struct i2c_client *client) { struct cx25840_state *state = to_state(i2c_get_clientdata(client)); v4l2_std_id std = state->std; From e6d0db1d47c0ac4d61c842a95988cbe8712df447 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:10 -0300 Subject: [PATCH 347/612] [media] cx25840: fix regression in analog support hue/saturation controls Fix regression in HVR-1800 analog support hue/saturation controls. The changes made for the cx23888 caused regressions in the analog support for cx23885/cx23887 based boards (partly due to changes in the locations of the hue/saturation controls). As a result the wrong registers were being overwritten. Add code to use the correct registers if it's a cx23888 Validated with the following boards: HVR-1800 retail (0070:7801) HVR-1800 OEM (0070:7809) HVR-1850 retail (0070:8541) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. 
Reported-by: Jonathan Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx25840/cx25840-core.c | 33 ++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index a82b7045f8c9..e12b3b0d7f35 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -1106,9 +1106,23 @@ static int set_input(struct i2c_client *client, enum cx25840_video_input vid_inp cx25840_write4(client, 0x410, 0xffff0dbf); cx25840_write4(client, 0x414, 0x00137d03); - cx25840_write4(client, 0x418, 0x01008080); + + /* on the 887, 0x418 is HSCALE_CTRL, on the 888 it is + CHROMA_CTRL */ + if (is_cx23888(state)) + cx25840_write4(client, 0x418, 0x01008080); + else + cx25840_write4(client, 0x418, 0x01000000); + cx25840_write4(client, 0x41c, 0x00000000); - cx25840_write4(client, 0x420, 0x001c3e0f); + + /* on the 887, 0x420 is CHROMA_CTRL, on the 888 it is + CRUSH_CTRL */ + if (is_cx23888(state)) + cx25840_write4(client, 0x420, 0x001c3e0f); + else + cx25840_write4(client, 0x420, 0x001c8282); + cx25840_write4(client, 0x42c, 0x42600000); cx25840_write4(client, 0x430, 0x0000039b); cx25840_write4(client, 0x438, 0x00000000); @@ -1315,6 +1329,7 @@ static int set_v4lstd(struct i2c_client *client) static int cx25840_s_ctrl(struct v4l2_ctrl *ctrl) { struct v4l2_subdev *sd = to_sd(ctrl); + struct cx25840_state *state = to_state(sd); struct i2c_client *client = v4l2_get_subdevdata(sd); switch (ctrl->id) { @@ -1327,12 +1342,20 @@ static int cx25840_s_ctrl(struct v4l2_ctrl *ctrl) break; case V4L2_CID_SATURATION: - cx25840_write(client, 0x420, ctrl->val << 1); - cx25840_write(client, 0x421, ctrl->val << 1); + if (is_cx23888(state)) { + cx25840_write(client, 0x418, ctrl->val << 1); + cx25840_write(client, 0x419, ctrl->val << 1); + } else { + cx25840_write(client, 0x420, ctrl->val << 1); + cx25840_write(client, 0x421, ctrl->val << 1); + } break; case V4L2_CID_HUE: - cx25840_write(client, 0x422, ctrl->val); + if (is_cx23888(state)) + cx25840_write(client, 0x41a, ctrl->val); + else + cx25840_write(client, 0x422, ctrl->val); break; default: From d90133ec58d4fad822ff322557b6885c5b77f127 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:11 -0300 Subject: [PATCH 348/612] [media] cx25840: fix regression in HVR-1800 analog audio The refactoring of the cx25840 driver to support the cx23888 caused breakage with the existing support for cx23885/cx23887 analog audio support. Tweak the code so that it only uses the code if it really is a cx23888 instead of applying it to all cx2388x based devices. Validated with the following boards: HVR-1800 retail (0070:7801) HVR-1800 OEM (0070:7809) HVR_1850 retail (0070:8541) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. 
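The recurring pitfall in this group of regressions, sketched with hypothetical predicates (the driver's real helpers take a state pointer; the chip-ID names here are made up): is_cx2388x() matches both the cx23887 and the cx23888, so any code path specific to the 888 register layout must use the narrower test.

    /* Illustrative only -- IDs and fields are not the driver's: */
    #define is_cx2388x(state) ((state)->id == ID_CX23887 || \
                               (state)->id == ID_CX23888)
    #define is_cx23888(state) ((state)->id == ID_CX23888)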
Reported-by: Jonathan Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx25840/cx25840-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index e12b3b0d7f35..7dc7bb155dff 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -1250,7 +1250,7 @@ static int set_input(struct i2c_client *client, enum cx25840_video_input vid_inp cx25840_write4(client, 0x8d0, 0x1f063870); } - if (is_cx2388x(state)) { + if (is_cx23888(state)) { /* HVR1850 */ /* AUD_IO_CTRL - I2S Input, Parallel1*/ /* - Channel 1 src - Parallel1 (Merlin out) */ From b5c5c17babfaf83f274ca8f0a7660284882d8d4d Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:12 -0300 Subject: [PATCH 349/612] [media] cx25840: fix vsrc/hsrc usage on cx23888 designs The location of the vsrc/hsrc registers moved in the cx23888, causing the s_mbus call to fail prematurely indicating that "720x480 is not a valid size". The function bailed out before many pertinent registers were set related to the scaler (causing unexpected results in video rendering when doing raw video capture). Use the correct registers for the cx23888. Validated with the following boards: HVR-1800 retail (0070:7801) HVR-1800 OEM (0070:7809) HVR-1850 retail (0070:8541) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. Reported-by: Jonathan Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx25840/cx25840-core.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index 7dc7bb155dff..d8eac3e30a7e 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -1380,11 +1380,21 @@ static int cx25840_s_mbus_fmt(struct v4l2_subdev *sd, struct v4l2_mbus_framefmt fmt->field = V4L2_FIELD_INTERLACED; fmt->colorspace = V4L2_COLORSPACE_SMPTE170M; - Vsrc = (cx25840_read(client, 0x476) & 0x3f) << 4; - Vsrc |= (cx25840_read(client, 0x475) & 0xf0) >> 4; + if (is_cx23888(state)) { + Vsrc = (cx25840_read(client, 0x42a) & 0x3f) << 4; + Vsrc |= (cx25840_read(client, 0x429) & 0xf0) >> 4; + } else { + Vsrc = (cx25840_read(client, 0x476) & 0x3f) << 4; + Vsrc |= (cx25840_read(client, 0x475) & 0xf0) >> 4; + } - Hsrc = (cx25840_read(client, 0x472) & 0x3f) << 4; - Hsrc |= (cx25840_read(client, 0x471) & 0xf0) >> 4; + if (is_cx23888(state)) { + Hsrc = (cx25840_read(client, 0x426) & 0x3f) << 4; + Hsrc |= (cx25840_read(client, 0x425) & 0xf0) >> 4; + } else { + Hsrc = (cx25840_read(client, 0x472) & 0x3f) << 4; + Hsrc |= (cx25840_read(client, 0x471) & 0xf0) >> 4; + } Vlines = fmt->height + (is_50Hz ? 4 : 7); From d214ddc86807fb1995fd3400347a202826332710 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:13 -0300 Subject: [PATCH 350/612] [media] cx23885: make analog support work for HVR_1250 (cx23885 variant) The analog support in the cx23885 driver was completely broken for the HVR-1250. Add the necessary code. Note that this only implements analog for the composite and s-video inputs. The tuner input continues to be non-functional due to a lack of analog support in the mt2131 driver. 
Validated with the following boards: HVR-1250 (0070:7911) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx23885/cx23885-cards.c | 31 +++++++++++++++------ drivers/media/video/cx23885/cx23885-video.c | 1 + 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/media/video/cx23885/cx23885-cards.c b/drivers/media/video/cx23885/cx23885-cards.c index 13739e002a63..f8664f2ccfda 100644 --- a/drivers/media/video/cx23885/cx23885-cards.c +++ b/drivers/media/video/cx23885/cx23885-cards.c @@ -127,22 +127,37 @@ struct cx23885_board cx23885_boards[] = { }, [CX23885_BOARD_HAUPPAUGE_HVR1250] = { .name = "Hauppauge WinTV-HVR1250", + .porta = CX23885_ANALOG_VIDEO, .portc = CX23885_MPEG_DVB, +#ifdef MT2131_NO_ANALOG_SUPPORT_YET + .tuner_type = TUNER_PHILIPS_TDA8290, + .tuner_addr = 0x42, /* 0x84 >> 1 */ + .tuner_bus = 1, +#endif + .force_bff = 1, .input = {{ +#ifdef MT2131_NO_ANALOG_SUPPORT_YET .type = CX23885_VMUX_TELEVISION, - .vmux = 0, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN5_CH2 | + CX25840_VIN2_CH1, + .amux = CX25840_AUDIO8, .gpio0 = 0xff00, }, { - .type = CX23885_VMUX_DEBUG, - .vmux = 0, - .gpio0 = 0xff01, - }, { +#endif .type = CX23885_VMUX_COMPOSITE1, - .vmux = 1, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN4_CH2 | + CX25840_VIN6_CH1, + .amux = CX25840_AUDIO7, .gpio0 = 0xff02, }, { .type = CX23885_VMUX_SVIDEO, - .vmux = 2, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN4_CH2 | + CX25840_VIN8_CH1 | + CX25840_SVIDEO_ON, + .amux = CX25840_AUDIO7, .gpio0 = 0xff02, } }, }, @@ -1526,10 +1541,10 @@ void cx23885_card_setup(struct cx23885_dev *dev) */ switch (dev->board) { case CX23885_BOARD_TEVII_S470: - case CX23885_BOARD_HAUPPAUGE_HVR1250: /* Currently only enabled for the integrated IR controller */ if (!enable_885_ir) break; + case CX23885_BOARD_HAUPPAUGE_HVR1250: case CX23885_BOARD_HAUPPAUGE_HVR1800: case CX23885_BOARD_HAUPPAUGE_HVR1800lp: case CX23885_BOARD_HAUPPAUGE_HVR1700: diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c index c654bdc7ccb2..4d05689df426 100644 --- a/drivers/media/video/cx23885/cx23885-video.c +++ b/drivers/media/video/cx23885/cx23885-video.c @@ -505,6 +505,7 @@ static int cx23885_video_mux(struct cx23885_dev *dev, unsigned int input) if ((dev->board == CX23885_BOARD_HAUPPAUGE_HVR1800) || (dev->board == CX23885_BOARD_MPX885) || + (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1250) || (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1850)) { /* Configure audio routing */ v4l2_subdev_call(dev->sd_cx25840, audio, s_routing, From 0ac60acb5491df565141c0e3a87d7148a865fe36 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Sun, 1 Jul 2012 16:15:14 -0300 Subject: [PATCH 351/612] [media] cx23885: add support for HVR-1255 analog (cx23888 variant) Get the HVR-1255 analog support working for all supported inputs. This includes introduction of a new board profile for an OEM variant which doesn't have all the same inputs as the retail version of the board. Validated with the following boards: HVR-1255 (0070:2259) Thanks to Steven Toth and Hauppauge for loaning me various boards to regression test with. 
Thanks-to: Steven Toth Signed-off-by: Devin Heitmueler Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx23885/cx23885-cards.c | 56 ++++++++++++++++++++- drivers/media/video/cx23885/cx23885-dvb.c | 6 +++ drivers/media/video/cx23885/cx23885-video.c | 8 ++- drivers/media/video/cx23885/cx23885.h | 1 + 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/cx23885/cx23885-cards.c b/drivers/media/video/cx23885/cx23885-cards.c index f8664f2ccfda..bf79003bd8f6 100644 --- a/drivers/media/video/cx23885/cx23885-cards.c +++ b/drivers/media/video/cx23885/cx23885-cards.c @@ -282,7 +282,55 @@ struct cx23885_board cx23885_boards[] = { }, [CX23885_BOARD_HAUPPAUGE_HVR1255] = { .name = "Hauppauge WinTV-HVR1255", + .porta = CX23885_ANALOG_VIDEO, .portc = CX23885_MPEG_DVB, + .tuner_type = TUNER_ABSENT, + .tuner_addr = 0x42, /* 0x84 >> 1 */ + .force_bff = 1, + .input = {{ + .type = CX23885_VMUX_TELEVISION, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN5_CH2 | + CX25840_VIN2_CH1 | + CX25840_DIF_ON, + .amux = CX25840_AUDIO8, + }, { + .type = CX23885_VMUX_COMPOSITE1, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN4_CH2 | + CX25840_VIN6_CH1, + .amux = CX25840_AUDIO7, + }, { + .type = CX23885_VMUX_SVIDEO, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN4_CH2 | + CX25840_VIN8_CH1 | + CX25840_SVIDEO_ON, + .amux = CX25840_AUDIO7, + } }, + }, + [CX23885_BOARD_HAUPPAUGE_HVR1255_22111] = { + .name = "Hauppauge WinTV-HVR1255", + .porta = CX23885_ANALOG_VIDEO, + .portc = CX23885_MPEG_DVB, + .tuner_type = TUNER_ABSENT, + .tuner_addr = 0x42, /* 0x84 >> 1 */ + .force_bff = 1, + .input = {{ + .type = CX23885_VMUX_TELEVISION, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN5_CH2 | + CX25840_VIN2_CH1 | + CX25840_DIF_ON, + .amux = CX25840_AUDIO8, + }, { + .type = CX23885_VMUX_SVIDEO, + .vmux = CX25840_VIN7_CH3 | + CX25840_VIN4_CH2 | + CX25840_VIN8_CH1 | + CX25840_SVIDEO_ON, + .amux = CX25840_AUDIO7, + } }, }, [CX23885_BOARD_HAUPPAUGE_HVR1210] = { .name = "Hauppauge WinTV-HVR1210", @@ -639,7 +687,7 @@ struct cx23885_subid cx23885_subids[] = { }, { .subvendor = 0x0070, .subdevice = 0x2259, - .card = CX23885_BOARD_HAUPPAUGE_HVR1255, + .card = CX23885_BOARD_HAUPPAUGE_HVR1255_22111, }, { .subvendor = 0x0070, .subdevice = 0x2291, @@ -1145,6 +1193,7 @@ void cx23885_gpio_setup(struct cx23885_dev *dev) case CX23885_BOARD_HAUPPAUGE_HVR1270: case CX23885_BOARD_HAUPPAUGE_HVR1275: case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1210: /* GPIO-5 RF Control: 0 = RF1 Terrestrial, 1 = RF2 Cable */ /* GPIO-6 I2C Gate which can isolate the demod from the bus */ @@ -1282,6 +1331,7 @@ int cx23885_ir_init(struct cx23885_dev *dev) case CX23885_BOARD_HAUPPAUGE_HVR1400: case CX23885_BOARD_HAUPPAUGE_HVR1275: case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1210: /* FIXME: Implement me */ break; @@ -1439,6 +1489,7 @@ void cx23885_card_setup(struct cx23885_dev *dev) case CX23885_BOARD_HAUPPAUGE_HVR1270: case CX23885_BOARD_HAUPPAUGE_HVR1275: case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1210: case CX23885_BOARD_HAUPPAUGE_HVR1850: case CX23885_BOARD_HAUPPAUGE_HVR1290: @@ -1526,6 +1577,7 @@ void cx23885_card_setup(struct cx23885_dev *dev) case CX23885_BOARD_HAUPPAUGE_HVR1270: case CX23885_BOARD_HAUPPAUGE_HVR1275: case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1210: case 
CX23885_BOARD_COMPRO_VIDEOMATE_E800: case CX23885_BOARD_HAUPPAUGE_HVR1290: @@ -1554,6 +1606,8 @@ void cx23885_card_setup(struct cx23885_dev *dev) case CX23885_BOARD_NETUP_DUAL_DVBS2_CI: case CX23885_BOARD_NETUP_DUAL_DVB_T_C_CI_RF: case CX23885_BOARD_COMPRO_VIDEOMATE_E800: + case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1270: case CX23885_BOARD_HAUPPAUGE_HVR1850: case CX23885_BOARD_MYGICA_X8506: diff --git a/drivers/media/video/cx23885/cx23885-dvb.c b/drivers/media/video/cx23885/cx23885-dvb.c index a80a92c47455..cd542684ba02 100644 --- a/drivers/media/video/cx23885/cx23885-dvb.c +++ b/drivers/media/video/cx23885/cx23885-dvb.c @@ -712,6 +712,7 @@ static int dvb_register(struct cx23885_tsport *port) } break; case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: i2c_bus = &dev->i2c_bus[0]; fe0->dvb.frontend = dvb_attach(s5h1411_attach, &hcw_s5h1411_config, @@ -721,6 +722,11 @@ static int dvb_register(struct cx23885_tsport *port) 0x60, &dev->i2c_bus[1].i2c_adap, &hauppauge_tda18271_config); } + + tda18271_attach(&dev->ts1.analog_fe, + 0x60, &dev->i2c_bus[1].i2c_adap, + &hauppauge_tda18271_config); + break; case CX23885_BOARD_HAUPPAUGE_HVR1800: i2c_bus = &dev->i2c_bus[0]; diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c index 4d05689df426..22f8e7fbd665 100644 --- a/drivers/media/video/cx23885/cx23885-video.c +++ b/drivers/media/video/cx23885/cx23885-video.c @@ -506,6 +506,8 @@ static int cx23885_video_mux(struct cx23885_dev *dev, unsigned int input) if ((dev->board == CX23885_BOARD_HAUPPAUGE_HVR1800) || (dev->board == CX23885_BOARD_MPX885) || (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1250) || + (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1255) || + (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1255_22111) || (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1850)) { /* Configure audio routing */ v4l2_subdev_call(dev->sd_cx25840, audio, s_routing, @@ -1579,7 +1581,9 @@ static int cx23885_set_freq_via_ops(struct cx23885_dev *dev, fe = vfe->dvb.frontend; - if (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1850) + if ((dev->board == CX23885_BOARD_HAUPPAUGE_HVR1850) || + (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1255) || + (dev->board == CX23885_BOARD_HAUPPAUGE_HVR1255_22111)) fe = &dev->ts1.analog_fe; if (fe && fe->ops.tuner_ops.set_analog_params) { @@ -1609,6 +1613,8 @@ int cx23885_set_frequency(struct file *file, void *priv, int ret; switch (dev->board) { + case CX23885_BOARD_HAUPPAUGE_HVR1255: + case CX23885_BOARD_HAUPPAUGE_HVR1255_22111: case CX23885_BOARD_HAUPPAUGE_HVR1850: ret = cx23885_set_freq_via_ops(dev, f); break; diff --git a/drivers/media/video/cx23885/cx23885.h b/drivers/media/video/cx23885/cx23885.h index d884784a1c85..13c37ec07ae7 100644 --- a/drivers/media/video/cx23885/cx23885.h +++ b/drivers/media/video/cx23885/cx23885.h @@ -90,6 +90,7 @@ #define CX23885_BOARD_MYGICA_X8507 33 #define CX23885_BOARD_TERRATEC_CINERGY_T_PCIE_DUAL 34 #define CX23885_BOARD_TEVII_S471 35 +#define CX23885_BOARD_HAUPPAUGE_HVR1255_22111 36 #define GPIO_0 0x00000001 #define GPIO_1 0x00000002 From c6cff169268bba9de687acf317ac24aa038cc263 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 1 Jul 2012 21:38:03 -0300 Subject: [PATCH 352/612] [media] cx23885: Silence unknown command warnings I am seeing a constant stream of warnings on my cx23885 based card: cx23885_tuner_callback(): Unknown command 0x2. Add a check in cx23885_tuner_callback to silence it. 
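The shape of the fix, sketched as a switch (command constants from the xc2028 tuner header; the patch itself extends an if): commands that need no action on this hardware are acknowledged instead of producing a warning on every call.

    switch (command) {
    case XC2028_RESET_CLK:
    case XC2028_I2C_FLUSH:
            return 0;       /* known, nothing to do here */
    default:
            break;          /* fall through to the real handling */
    }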
Signed-off-by: Anton Blanchard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx23885/cx23885-cards.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/cx23885/cx23885-cards.c b/drivers/media/video/cx23885/cx23885-cards.c index bf79003bd8f6..080e11157e5f 100644 --- a/drivers/media/video/cx23885/cx23885-cards.c +++ b/drivers/media/video/cx23885/cx23885-cards.c @@ -963,7 +963,7 @@ int cx23885_tuner_callback(void *priv, int component, int command, int arg) struct cx23885_dev *dev = port->dev; u32 bitmask = 0; - if (command == XC2028_RESET_CLK) + if ((command == XC2028_RESET_CLK) || (command == XC2028_I2C_FLUSH)) return 0; if (command != 0) { From 57f4422f7bd83bb0a092e3f9aea0d9c8ac59045e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 1 Jul 2012 21:58:00 -0300 Subject: [PATCH 353/612] [media] winbond-cir: Fix txandrx module info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We aren't getting any module info for the txandrx option because of a typo: parm: txandrx:bool Signed-off-by: Anton Blanchard Acked-by: David Härdeman Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/winbond-cir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index 342c2c8c1ddf..eca17b564cc2 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -232,7 +232,7 @@ MODULE_PARM_DESC(invert, "Invert the signal from the IR receiver"); static bool txandrx; /* default = 0 */ module_param(txandrx, bool, 0444); -MODULE_PARM_DESC(invert, "Allow simultaneous TX and RX"); +MODULE_PARM_DESC(txandrx, "Allow simultaneous TX and RX"); static unsigned int wake_sc = 0x800F040C; module_param(wake_sc, uint, 0644); From 8299d62843d4dcc5d769ce901799b8e7806ec93f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 1 Jul 2012 21:58:52 -0300 Subject: [PATCH 354/612] [media] winbond-cir: Initialise timeout, driver_type and allowed_protos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to set a timeout so we can go idle on no activity. Signed-off-by: Anton Blanchard Acked-by: David Härdeman Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/winbond-cir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index eca17b564cc2..54ee34872d14 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -1032,6 +1032,8 @@ wbcir_probe(struct pnp_dev *device, const struct pnp_device_id *dev_id) data->dev->tx_ir = wbcir_tx; data->dev->priv = data; data->dev->dev.parent = &device->dev; + data->dev->timeout = MS_TO_NS(100); + data->dev->allowed_protos = RC_TYPE_ALL; if (!request_region(data->wbase, WAKEUP_IOMEM_LEN, DRVNAME)) { dev_err(dev, "Region 0x%lx-0x%lx already in use!\n", From 7103180b4385de908d69815dbd6807879d371cbf Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 5 Jul 2012 14:31:33 -0300 Subject: [PATCH 355/612] [media] omap3isp: preview: Fix output size computation depending on input format The preview engine crops 4 columns and 4 lines when CFA is enabled. Commit b2da46e52fe7871cba36e1a435844502c0eccf39 ("omap3isp: preview: Add support for greyscale input") inverted the condition by mistake, fix this.
Reported-by: Florian Neuhaus Signed-off-by: Laurent Pinchart Tested-by: Florian Neuhaus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/omap3isp/isppreview.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/omap3isp/isppreview.c b/drivers/media/video/omap3isp/isppreview.c index 8a4935ecc655..a48a747b6fcc 100644 --- a/drivers/media/video/omap3isp/isppreview.c +++ b/drivers/media/video/omap3isp/isppreview.c @@ -1102,7 +1102,7 @@ static void preview_config_input_size(struct isp_prev_device *prev, u32 active) unsigned int elv = prev->crop.top + prev->crop.height - 1; u32 features; - if (format->code == V4L2_MBUS_FMT_Y10_1X10) { + if (format->code != V4L2_MBUS_FMT_Y10_1X10) { sph -= 2; eph += 2; slv -= 2; From 5aedc1094041335598c6aa73c5cf882f30886cd7 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 6 Jul 2012 08:41:25 -0300 Subject: [PATCH 356/612] [media] omap3isp: preview: Fix contrast and brightness handling Commit bac387efbb88cf0e8df6f46a38387897cea464ee ("omap3isp: preview: Simplify configuration parameters access") added three fields to the preview_update structure, but failed to properly update the related initializers. Fix this. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/omap3isp/isppreview.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/omap3isp/isppreview.c b/drivers/media/video/omap3isp/isppreview.c index a48a747b6fcc..dd91da26f1b0 100644 --- a/drivers/media/video/omap3isp/isppreview.c +++ b/drivers/media/video/omap3isp/isppreview.c @@ -888,12 +888,12 @@ static const struct preview_update update_attrs[] = { preview_config_contrast, NULL, offsetof(struct prev_params, contrast), - 0, true, + 0, 0, true, }, /* OMAP3ISP_PREV_BRIGHTNESS */ { preview_config_brightness, NULL, offsetof(struct prev_params, brightness), - 0, true, + 0, 0, true, }, }; From 4e39da01027eb73556c1ad416945e37f2771464e Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Mon, 4 Jun 2012 13:15:56 -0300 Subject: [PATCH 357/612] [media] s5p-fimc: Add missing FIMC-LITE file operations locking commit 5126f2590bee412e3053de851cb07f531e4be36a "v4l2-dev: add flag to have the core lock all file operations" introduced an additional bit flag (V4L2_FL_LOCK_ALL_FOPS) that should be set by drivers that use the v4l2 core lock for all file operations. Since this driver has been merged at the same time as the core changes it doesn't set this flags and thus its all file operations except IOCTL are not properly serialized. Fix this by adding file ops locking in the driver. 
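The locking pattern applied to each file operation, reduced to its core (a sketch; the real handlers also manage the reference count and runtime PM):

    #include <linux/mutex.h>
    #include <linux/errno.h>

    static int guarded_fop(struct mutex *lock, int (*op)(void *),
                           void *priv)
    {
            int ret;

            if (mutex_lock_interruptible(lock))
                    return -ERESTARTSYS;
            ret = op(priv);
            mutex_unlock(lock);
            return ret;
    }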
Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/s5p-fimc/fimc-lite.c | 61 +++++++++++++++++------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/drivers/media/video/s5p-fimc/fimc-lite.c b/drivers/media/video/s5p-fimc/fimc-lite.c index 4d269b8fa1ef..74ff310db30c 100644 --- a/drivers/media/video/s5p-fimc/fimc-lite.c +++ b/drivers/media/video/s5p-fimc/fimc-lite.c @@ -453,34 +453,42 @@ static int fimc_lite_open(struct file *file) struct fimc_lite *fimc = video_drvdata(file); int ret; + if (mutex_lock_interruptible(&fimc->lock)) + return -ERESTARTSYS; + set_bit(ST_FLITE_IN_USE, &fimc->state); ret = pm_runtime_get_sync(&fimc->pdev->dev); if (ret < 0) - return ret; - - if (++fimc->ref_count != 1 || fimc->out_path != FIMC_IO_DMA) - return 0; + goto done; ret = v4l2_fh_open(file); if (ret < 0) - return ret; + goto done; - ret = fimc_pipeline_initialize(&fimc->pipeline, &fimc->vfd->entity, - true); - if (ret < 0) { - pm_runtime_put_sync(&fimc->pdev->dev); - fimc->ref_count--; - v4l2_fh_release(file); - clear_bit(ST_FLITE_IN_USE, &fimc->state); + if (++fimc->ref_count == 1 && fimc->out_path == FIMC_IO_DMA) { + ret = fimc_pipeline_initialize(&fimc->pipeline, + &fimc->vfd->entity, true); + if (ret < 0) { + pm_runtime_put_sync(&fimc->pdev->dev); + fimc->ref_count--; + v4l2_fh_release(file); + clear_bit(ST_FLITE_IN_USE, &fimc->state); + } + + fimc_lite_clear_event_counters(fimc); } - - fimc_lite_clear_event_counters(fimc); +done: + mutex_unlock(&fimc->lock); return ret; } static int fimc_lite_close(struct file *file) { struct fimc_lite *fimc = video_drvdata(file); + int ret; + + if (mutex_lock_interruptible(&fimc->lock)) + return -ERESTARTSYS; if (--fimc->ref_count == 0 && fimc->out_path == FIMC_IO_DMA) { clear_bit(ST_FLITE_IN_USE, &fimc->state); @@ -494,20 +502,39 @@ static int fimc_lite_close(struct file *file) if (fimc->ref_count == 0) vb2_queue_release(&fimc->vb_queue); - return v4l2_fh_release(file); + ret = v4l2_fh_release(file); + + mutex_unlock(&fimc->lock); + return ret; } static unsigned int fimc_lite_poll(struct file *file, struct poll_table_struct *wait) { struct fimc_lite *fimc = video_drvdata(file); - return vb2_poll(&fimc->vb_queue, file, wait); + int ret; + + if (mutex_lock_interruptible(&fimc->lock)) + return POLL_ERR; + + ret = vb2_poll(&fimc->vb_queue, file, wait); + mutex_unlock(&fimc->lock); + + return ret; } static int fimc_lite_mmap(struct file *file, struct vm_area_struct *vma) { struct fimc_lite *fimc = video_drvdata(file); - return vb2_mmap(&fimc->vb_queue, vma); + int ret; + + if (mutex_lock_interruptible(&fimc->lock)) + return -ERESTARTSYS; + + ret = vb2_mmap(&fimc->vb_queue, vma); + mutex_unlock(&fimc->lock); + + return ret; } static const struct v4l2_file_operations fimc_lite_fops = { From ec3ed85f926f4e900bc48cec6e72abbe6475321f Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 27 Jun 2012 10:12:31 -0300 Subject: [PATCH 358/612] [media] Revert "[media] V4L: JPEG class documentation corrections" This reverts commit feed0258e11e04b7e0, as the same issues are already covered in another version of that patch that was also applied (579e92ffac65c717c9c8a50feb755a). 
Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/controls.xml | 2 +- Documentation/DocBook/media/v4l/vidioc-g-ext-ctrls.xml | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index 89941329290e..43ca749fc5e3 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml @@ -3988,7 +3988,7 @@ interface and may change in the future. from RGB to Y'CbCr color space. - + diff --git a/Documentation/DocBook/media/v4l/vidioc-g-ext-ctrls.xml b/Documentation/DocBook/media/v4l/vidioc-g-ext-ctrls.xml index e3d5afcdafbb..0a4b90fcf2da 100644 --- a/Documentation/DocBook/media/v4l/vidioc-g-ext-ctrls.xml +++ b/Documentation/DocBook/media/v4l/vidioc-g-ext-ctrls.xml @@ -284,13 +284,6 @@ These controls are described in . - - V4L2_CTRL_CLASS_JPEG - 0x9d0000 - The class containing JPEG compression controls. -These controls are described in . - From 476a7eeb60e70ddab138e7cb4bc44ef5ac20782e Mon Sep 17 00:00:00 2001 From: Shinya Kuribayashi Date: Sat, 7 Jul 2012 13:37:42 +0300 Subject: [PATCH 359/612] hwspinlock/core: use global ID to register hwspinlocks on multiple devices Commit 300bab9770 (hwspinlock/core: register a bank of hwspinlocks in a single API call, 2011-09-06) introduced 'hwspin_lock_register_single()' to register numerous (a bank of) hwspinlock instances in a single API, 'hwspin_lock_register()'. At which time, 'hwspin_lock_register()' accidentally passes 'local IDs' to 'hwspin_lock_register_single()', despite that ..._single() requires 'global IDs' to register hwspinlocks. We have to convert into global IDs by supplying the missing 'base_id'. Cc: stable Signed-off-by: Shinya Kuribayashi [ohad: fix error path of hwspin_lock_register, too] Signed-off-by: Ohad Ben-Cohen --- drivers/hwspinlock/hwspinlock_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c index 61c9cf15fa52..1201a15784c3 100644 --- a/drivers/hwspinlock/hwspinlock_core.c +++ b/drivers/hwspinlock/hwspinlock_core.c @@ -345,7 +345,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, spin_lock_init(&hwlock->lock); hwlock->bank = bank; - ret = hwspin_lock_register_single(hwlock, i); + ret = hwspin_lock_register_single(hwlock, base_id + i); if (ret) goto reg_failed; } @@ -354,7 +354,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, reg_failed: while (--i >= 0) - hwspin_lock_unregister_single(i); + hwspin_lock_unregister_single(base_id + i); return ret; } EXPORT_SYMBOL_GPL(hwspin_lock_register); From 222a806af830fda34ad1f6bc991cd226916de060 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Thu, 21 Jun 2012 12:23:42 -0700 Subject: [PATCH 360/612] [SCSI] Fix NULL dereferences in scsi_cmd_to_driver Avoid crashing if the private_data pointer happens to be NULL. 
This has been seen sometimes when a host reset happens, notably when there are many LUNs: host3: Assigned Port ID 0c1601 scsi host3: libfc: Host reset succeeded on port (0c1601) BUG: unable to handle kernel NULL pointer dereference at 0000000000000350 IP: [] scsi_send_eh_cmnd+0x58/0x3a0 Process scsi_eh_3 (pid: 4144, threadinfo ffff88030920c000, task ffff880326b160c0) Stack: 000000010372e6ba 0000000000000282 000027100920dca0 ffffffffa0038ee0 0000000000000000 0000000000030003 ffff88030920dc80 ffff88030920dc80 00000002000e0000 0000000a00004000 ffff8803242f7760 ffff88031326ed80 Call Trace: [] ? lock_timer_base+0x70/0x70 [] scsi_eh_tur+0x3e/0xc0 [] scsi_eh_test_devices+0x76/0x170 [] scsi_eh_host_reset+0x85/0x160 [] scsi_eh_ready_devs+0x91/0x110 [] scsi_unjam_host+0xed/0x1f0 [] scsi_error_handler+0x1a8/0x200 [] ? scsi_unjam_host+0x1f0/0x1f0 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? gs_change+0x13/0x13 Code: 25 28 00 00 00 48 89 45 c8 31 c0 48 8b 87 80 00 00 00 48 8d b5 60 ff ff ff 89 d1 48 89 fb 41 89 d6 4c 89 fa 48 8b 80 b8 00 00 00 <48> 8b 80 50 03 00 00 48 8b 00 48 89 85 38 ff ff ff 48 8b 07 4c RIP [] scsi_send_eh_cmnd+0x58/0x3a0 RSP CR2: 0000000000000350 Signed-off-by: Mark Rustad Tested-by: Marcus Dennis Cc: Signed-off-by: James Bottomley --- include/scsi/scsi_cmnd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 1e1198546c72..ac06cc595890 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -134,10 +134,16 @@ struct scsi_cmnd { static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { + struct scsi_driver **sdp; + if (!cmd->request->rq_disk) return NULL; - return *(struct scsi_driver **)cmd->request->rq_disk->private_data; + sdp = (struct scsi_driver **)cmd->request->rq_disk->private_data; + if (!sdp) + return NULL; + + return *sdp; } extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t); From 6ef1b512f4e6f936d89aa20be3d97a7ec7c290ac Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jun 2012 10:52:34 -0700 Subject: [PATCH 361/612] [SCSI] libsas: fix taskfile corruption in sas_ata_qc_fill_rtf fill_result_tf() grabs the taskfile flags from the originating qc which sas_ata_qc_fill_rtf() promptly overwrites. The presence of an ata_taskfile in the sata_device makes it tempting to just copy the full contents in sas_ata_qc_fill_rtf(). However, libata really only wants the fis contents and expects the other portions of the taskfile to not be touched by ->qc_fill_rtf. To that end store a fis buffer in the sata_device and use ata_tf_from_fis() like every other ->qc_fill_rtf() implementation. 
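For reference, the byte positions this relies on in a device-to-host register FIS, per the SATA specification:

    /*
     *   fis[0] = 0x34  FIS type: register, device to host
     *   fis[1] =       flags (interrupt bit, PM port)
     *   fis[2] =       ATA status register
     *   fis[3] =       ATA error register
     */
    static inline unsigned char fis_status(const unsigned char *fis)
    {
            return fis[2];
    }

    static inline unsigned char fis_error(const unsigned char *fis)
    {
            return fis[3];
    }

This is why the error path above synthesizes a failed command by poking ATA_ERR into byte 2 and an abort code into byte 3, and why ac_err_mask() can be fed fis[2] directly.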
Cc:
Reported-by: Praveen Murali
Tested-by: Praveen Murali
Signed-off-by: Dan Williams
Signed-off-by: James Bottomley
---
 drivers/scsi/aic94xx/aic94xx_task.c |  2 +-
 drivers/scsi/libsas/sas_ata.c       | 12 ++++++------
 include/scsi/libsas.h               |  6 ++++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c
index 532d212b6b2c..393e7ce8e95a 100644
--- a/drivers/scsi/aic94xx/aic94xx_task.c
+++ b/drivers/scsi/aic94xx/aic94xx_task.c
@@ -201,7 +201,7 @@ static void asd_get_response_tasklet(struct asd_ascb *ascb,

 	if (SAS_STATUS_BUF_SIZE >= sizeof(*resp)) {
 		resp->frame_len = le16_to_cpu(*(__le16 *)(r+6));
-		memcpy(&resp->ending_fis[0], r+16, 24);
+		memcpy(&resp->ending_fis[0], r+16, ATA_RESP_FIS_SIZE);
 		ts->buf_valid_size = sizeof(*resp);
 	}
 }

diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 441d88ad99a7..d109cc3a17b6 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -139,12 +139,12 @@ static void sas_ata_task_done(struct sas_task *task)
 	if (stat->stat == SAS_PROTO_RESPONSE || stat->stat == SAM_STAT_GOOD ||
 	    ((stat->stat == SAM_STAT_CHECK_CONDITION &&
 	      dev->sata_dev.command_set == ATAPI_COMMAND_SET))) {
-		ata_tf_from_fis(resp->ending_fis, &dev->sata_dev.tf);
+		memcpy(dev->sata_dev.fis, resp->ending_fis, ATA_RESP_FIS_SIZE);

 		if (!link->sactive) {
-			qc->err_mask |= ac_err_mask(dev->sata_dev.tf.command);
+			qc->err_mask |= ac_err_mask(dev->sata_dev.fis[2]);
 		} else {
-			link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.tf.command);
+			link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.fis[2]);
 			if (unlikely(link->eh_info.err_mask))
 				qc->flags |= ATA_QCFLAG_FAILED;
 		}
@@ -161,8 +161,8 @@ static void sas_ata_task_done(struct sas_task *task)
 			qc->flags |= ATA_QCFLAG_FAILED;
 		}

-		dev->sata_dev.tf.feature = 0x04;	/* status err */
-		dev->sata_dev.tf.command = ATA_ERR;
+		dev->sata_dev.fis[3] = 0x04;	/* status err */
+		dev->sata_dev.fis[2] = ATA_ERR;
 	}
 }
@@ -269,7 +269,7 @@ static bool sas_ata_qc_fill_rtf(struct ata_queued_cmd *qc)
 {
 	struct domain_device *dev = qc->ap->private_data;

-	memcpy(&qc->result_tf, &dev->sata_dev.tf, sizeof(qc->result_tf));
+	ata_tf_from_fis(dev->sata_dev.fis, &qc->result_tf);
 	return true;
 }

diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index f4f1c96dca72..10ce74f589c5 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -163,6 +163,8 @@ enum ata_command_set {
 	ATAPI_COMMAND_SET = 1,
 };

+#define ATA_RESP_FIS_SIZE 24
+
 struct sata_device {
 	enum ata_command_set command_set;
 	struct smp_resp rps_resp; /* report_phy_sata_resp */
@@ -171,7 +173,7 @@ struct sata_device {

 	struct ata_port *ap;
 	struct ata_host ata_host;
-	struct ata_taskfile tf;
+	u8 fis[ATA_RESP_FIS_SIZE];
 };

 enum {
@@ -537,7 +539,7 @@ enum exec_status {
  */
 struct ata_task_resp {
 	u16 frame_len;
-	u8 ending_fis[24];	/* dev to host or data-in */
+	u8 ending_fis[ATA_RESP_FIS_SIZE];	/* dev to host or data-in */
 };

 #define SAS_STATUS_BUF_SIZE 96

From a77171806515fb5e2288219ddb47af1f0b1328e7 Mon Sep 17 00:00:00 2001
From: Eddie Wai
Date: Fri, 29 Jun 2012 16:37:35 -0700
Subject: [PATCH 362/612] [SCSI] bnx2i: Removed the reference to the netdev->base_addr

The netdev->base_addr parameter has been deprecated in the L2 bnx2
driver; bnx2i was using it for the BARn iomapping. This patch references
pci_resource_start() directly instead of the deprecated
netdev->base_addr.
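In outline, the new lookup pattern is the following (a simplified sketch
of the idea, not the exact driver code):

	/* Resolve BAR0 from the PCI device itself instead of trusting
	 * the no-longer-maintained netdev->base_addr copy. */
	hba->reg_base = pci_resource_start(hba->pcidev, 0); /* raw BAR0 address */
	hba->regview = pci_iomap(hba->pcidev, 0, 4096);     /* map first 4K of BAR0 */
	if (!hba->regview)
		goto ioreg_map_err;

	/* ... and on teardown, use the pci_iomap() counterpart: */
	pci_iounmap(hba->pcidev, hba->regview);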
This patch is actually a critical bug fix: the 1G bnx2 driver no longer
supports netdev->base_addr in the current kernel of the SCSI tree, so
Broadcom's 1G Linux iSCSI offload solution would not work at all without
this patch.

Signed-off-by: Eddie Wai
Reviewed-by: Mike Christie
Signed-off-by: James Bottomley
---
 drivers/scsi/bnx2i/bnx2i.h       |  1 +
 drivers/scsi/bnx2i/bnx2i_hwi.c   |  3 +--
 drivers/scsi/bnx2i/bnx2i_iscsi.c | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/bnx2i/bnx2i.h b/drivers/scsi/bnx2i/bnx2i.h
index 0c53c28dc3d3..7e77cf620291 100644
--- a/drivers/scsi/bnx2i/bnx2i.h
+++ b/drivers/scsi/bnx2i/bnx2i.h
@@ -350,6 +350,7 @@ struct bnx2i_hba {
 	struct pci_dev *pcidev;
 	struct net_device *netdev;
 	void __iomem *regview;
+	resource_size_t reg_base;

 	u32 age;
 	unsigned long cnic_dev_type;

diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index ece47e502282..86a12b48e477 100644
--- a/drivers/scsi/bnx2i/bnx2i_hwi.c
+++ b/drivers/scsi/bnx2i/bnx2i_hwi.c
@@ -2724,7 +2724,6 @@ int bnx2i_map_ep_dbell_regs(struct bnx2i_endpoint *ep)
 		goto arm_cq;
 	}

-	reg_base = ep->hba->netdev->base_addr;
 	if ((test_bit(BNX2I_NX2_DEV_5709, &ep->hba->cnic_dev_type)) &&
 	    (ep->hba->mail_queue_access == BNX2I_MQ_BIN_MODE)) {
 		config2 = REG_RD(ep->hba, BNX2_MQ_CONFIG2);
@@ -2740,7 +2739,7 @@ int bnx2i_map_ep_dbell_regs(struct bnx2i_endpoint *ep)
 		/* 5709 device in normal node and 5706/5708 devices */
 		reg_off = CTX_OFFSET + (MB_KERNEL_CTX_SIZE * cid_num);

-	ep->qp.ctx_base = ioremap_nocache(reg_base + reg_off,
+	ep->qp.ctx_base = ioremap_nocache(ep->hba->reg_base + reg_off,
 					  MB_KERNEL_CTX_SIZE);
 	if (!ep->qp.ctx_base)
 		return -ENOMEM;

diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index f8d516b53161..621538b8b544 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -811,13 +811,13 @@ struct bnx2i_hba *bnx2i_alloc_hba(struct cnic_dev *cnic)
 	bnx2i_identify_device(hba);
 	bnx2i_setup_host_queue_size(hba, shost);

+	hba->reg_base = pci_resource_start(hba->pcidev, 0);
 	if (test_bit(BNX2I_NX2_DEV_5709, &hba->cnic_dev_type)) {
-		hba->regview = ioremap_nocache(hba->netdev->base_addr,
-					       BNX2_MQ_CONFIG2);
+		hba->regview = pci_iomap(hba->pcidev, 0, BNX2_MQ_CONFIG2);
 		if (!hba->regview)
 			goto ioreg_map_err;
 	} else if (test_bit(BNX2I_NX2_DEV_57710, &hba->cnic_dev_type)) {
-		hba->regview = ioremap_nocache(hba->netdev->base_addr, 4096);
+		hba->regview = pci_iomap(hba->pcidev, 0, 4096);
 		if (!hba->regview)
 			goto ioreg_map_err;
 	}
@@ -884,7 +884,7 @@ cid_que_err:
 	bnx2i_free_mp_bdt(hba);
 mp_bdt_mem_err:
 	if (hba->regview) {
-		iounmap(hba->regview);
+		pci_iounmap(hba->pcidev, hba->regview);
 		hba->regview = NULL;
 	}
 ioreg_map_err:
@@ -910,7 +910,7 @@ void bnx2i_free_hba(struct bnx2i_hba *hba)
 	pci_dev_put(hba->pcidev);

 	if (hba->regview) {
-		iounmap(hba->regview);
+		pci_iounmap(hba->pcidev, hba->regview);
 		hba->regview = NULL;
 	}
 	bnx2i_free_mp_bdt(hba);

From 736f29cd6b7af9646a21a34866cd16277e03ee69 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen
Date: Wed, 4 Jul 2012 18:13:48 +0530
Subject: [PATCH 363/612] OMAPDSS: Use PM notifiers for system suspend

Currently, omapdss handles system suspend and resume by having the
omapdss device (a platform device which is not part of the device
hierarchy of the DSS HW devices, such as DISPC and DSI, or of the
panels) use the suspend and resume callbacks of its platform_driver,
roughly as sketched below.
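(Condensed; the callback bodies are reduced to the calls that matter, so
this is illustrative rather than the verbatim driver code:)

	static int omap_dss_suspend(struct platform_device *pdev,
				    pm_message_t state)
	{
		/* disables all currently enabled panels */
		return dss_suspend_all_devices();
	}

	static int omap_dss_resume(struct platform_device *pdev)
	{
		/* re-enables the panels disabled above */
		return dss_resume_all_devices();
	}

	static struct platform_driver omap_dss_driver = {
		.suspend = omap_dss_suspend,
		.resume  = omap_dss_resume,
		.driver  = {
			.name = "omapdss",
		},
	};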
The driver does this by disabling all enabled panels on suspend, and
re-enabling the previously disabled panels on resume.

This presents a few problems. One is that, as the omapdss device is not
related to the panel devices or the DSS HW devices, there is no ordering
in the suspend process. This means that suspend could be run first for
the DSS HW devices and panels, and only then for the omapdss device.
Currently this is not a problem, as the DSS HW devices and panels do not
handle suspend.

Another, more pressing, problem is that when suspending or resuming, the
runtime PM functions return -EACCES, because runtime PM is disabled
during system suspend. This causes the driver to print warnings and
operations to fail, as they think they failed to bring up the HW.

This patch changes the omapdss suspend handling to use PM notifiers,
which are called before suspend and after resume. This way we have a
normally functioning system while we are suspending and resuming the
panels.

This patch, I believe, creates a problem in that somebody could enable
or disable a panel between PM_SUSPEND_PREPARE and the actual system
suspend, and similarly the other way around on resume. I choose to
ignore the problem for now, as it sounds rather unlikely, and if it
happens, it's not fatal.

In the long run the system suspend handling of omapdss and the panels
should be thought out properly. The current approach feels rather hacky.
Perhaps the panel drivers should handle system suspend, or the users of
omapdss (omapfb, omapdrm) should handle system suspend.

Note that after this patch we could probably revert
0eaf9f52e94f756147dbfe1faf1f77a02378dbf9 (OMAPDSS: use sync versions of
pm_runtime_put). But as I said, this patch may be temporary, so let's
leave the sync version in place.

Signed-off-by: Tomi Valkeinen
Reported-by: Jassi Brar
Tested-by: Jassi Brar
Tested-by: Joe Woodward
Signed-off-by: Archit Taneja
[fts: fixed 2 brace coding style issues]
Signed-off-by: Florian Tobias Schandinat
---
 drivers/video/omap2/dss/core.c | 43 +++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/video/omap2/dss/core.c b/drivers/video/omap2/dss/core.c
index 5066eee10ccf..58bd9c27369d 100644
--- a/drivers/video/omap2/dss/core.c
+++ b/drivers/video/omap2/dss/core.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/suspend.h>

 #include